[llvm] f387918 - [TargetLowering][RISCV][ARM][AArch64][Mips] Reduce the number of AND mask constants used by BSWAP expansion.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 15 14:36:12 PST 2022
Author: Craig Topper
Date: 2022-11-15T14:36:01-08:00
New Revision: f387918dd8549331a4f60df70cccd9558eca8df1
URL: https://github.com/llvm/llvm-project/commit/f387918dd8549331a4f60df70cccd9558eca8df1
DIFF: https://github.com/llvm/llvm-project/commit/f387918dd8549331a4f60df70cccd9558eca8df1.diff
LOG: [TargetLowering][RISCV][ARM][AArch64][Mips] Reduce the number of AND mask constants used by BSWAP expansion.
We can reuse the same AND mask constants if the right-shifted terms use SRL followed by AND and the left-shifted terms use AND followed by SHL.
A similar change was made to bitreverse previously.
Differential Revision: https://reviews.llvm.org/D138045
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
llvm/test/CodeGen/ARM/load-combine-big-endian.ll
llvm/test/CodeGen/ARM/load-combine.ll
llvm/test/CodeGen/Mips/bswap.ll
llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
llvm/test/CodeGen/RISCV/rv32zbb.ll
llvm/test/CodeGen/RISCV/rv64zbb.ll
llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4a34909dbcb69..d5f624a20b68a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8348,36 +8348,36 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const {
return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
case MVT::i32:
Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
- Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op,
+ DAG.getConstant(0xFF00, dl, VT));
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(8, dl, SHVT));
Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
- Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
- Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3,
- DAG.getConstant(0xFF0000, dl, VT));
Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(0xFF00, dl, VT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
return DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
case MVT::i64:
Tmp8 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
- Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(40, dl, SHVT));
- Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
- Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ Tmp7 = DAG.getNode(ISD::AND, dl, VT, Op,
+ DAG.getConstant(255ULL<<8, dl, VT));
+ Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Tmp7, DAG.getConstant(40, dl, SHVT));
+ Tmp6 = DAG.getNode(ISD::AND, dl, VT, Op,
+ DAG.getConstant(255ULL<<16, dl, VT));
+ Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Tmp6, DAG.getConstant(24, dl, SHVT));
+ Tmp5 = DAG.getNode(ISD::AND, dl, VT, Op,
+ DAG.getConstant(255ULL<<24, dl, VT));
+ Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Tmp5, DAG.getConstant(8, dl, SHVT));
Tmp4 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
- Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
- Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, dl, SHVT));
- Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
- Tmp7 = DAG.getNode(ISD::AND, dl, VT, Tmp7,
- DAG.getConstant(255ULL<<48, dl, VT));
- Tmp6 = DAG.getNode(ISD::AND, dl, VT, Tmp6,
- DAG.getConstant(255ULL<<40, dl, VT));
- Tmp5 = DAG.getNode(ISD::AND, dl, VT, Tmp5,
- DAG.getConstant(255ULL<<32, dl, VT));
Tmp4 = DAG.getNode(ISD::AND, dl, VT, Tmp4,
DAG.getConstant(255ULL<<24, dl, VT));
+ Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3,
DAG.getConstant(255ULL<<16, dl, VT));
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, dl, SHVT));
Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2,
- DAG.getConstant(255ULL<<8 , dl, VT));
+ DAG.getConstant(255ULL<<8, dl, VT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp7);
Tmp6 = DAG.getNode(ISD::OR, dl, VT, Tmp6, Tmp5);
Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index 62d18bd92b0a7..1fba00d9f7b6c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -201,29 +201,27 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) #0 {
; CHECK-LABEL: bswap_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI14_0
-; CHECK-NEXT: adrp x9, .LCPI14_1
; CHECK-NEXT: adrp x10, .LCPI14_2
+; CHECK-NEXT: adrp x9, .LCPI14_1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT: adrp x8, .LCPI14_3
+; CHECK-NEXT: ldr d3, [x10, :lo12:.LCPI14_2]
+; CHECK-NEXT: movprfx z4, z0
+; CHECK-NEXT: lsr z4.s, p0/m, z4.s, z1.s
; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI14_1]
; CHECK-NEXT: movprfx z5, z0
-; CHECK-NEXT: lsr z5.s, p0/m, z5.s, z1.s
-; CHECK-NEXT: ldr d3, [x10, :lo12:.LCPI14_2]
-; CHECK-NEXT: movprfx z6, z0
-; CHECK-NEXT: lsr z6.s, p0/m, z6.s, z2.s
-; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI14_3]
-; CHECK-NEXT: adrp x8, .LCPI14_4
+; CHECK-NEXT: lsr z5.s, p0/m, z5.s, z2.s
; CHECK-NEXT: lslr z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT: and z0.d, z0.d, z3.d
+; CHECK-NEXT: and z3.d, z5.d, z3.d
; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT: and z2.d, z6.d, z3.d
-; CHECK-NEXT: and z0.d, z0.d, z4.d
-; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI14_4]
-; CHECK-NEXT: orr z2.d, z2.d, z5.d
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI14_3]
+; CHECK-NEXT: orr z3.d, z3.d, z4.d
; CHECK-NEXT: orr z0.d, z1.d, z0.d
-; CHECK-NEXT: orr z0.d, z0.d, z2.d
-; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z3.s
+; CHECK-NEXT: orr z0.d, z0.d, z3.d
+; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op)
@@ -290,25 +288,24 @@ define void @bswap_v16i16(<16 x i16>* %a) #0 {
define <2 x i32> @bswap_v2i32(<2 x i32> %op) #0 {
; CHECK-LABEL: bswap_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI18_1
-; CHECK-NEXT: adrp x9, .LCPI18_2
-; CHECK-NEXT: adrp x10, .LCPI18_0
+; CHECK-NEXT: adrp x8, .LCPI18_0
+; CHECK-NEXT: adrp x10, .LCPI18_2
+; CHECK-NEXT: adrp x9, .LCPI18_1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI18_1]
-; CHECK-NEXT: adrp x8, .LCPI18_3
-; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI18_2]
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT: movprfx z4, z0
+; CHECK-NEXT: lsr z4.s, p0/m, z4.s, z1.s
+; CHECK-NEXT: ldr d3, [x10, :lo12:.LCPI18_2]
+; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI18_1]
; CHECK-NEXT: movprfx z5, z0
-; CHECK-NEXT: lsr z5.s, p0/m, z5.s, z1.s
-; CHECK-NEXT: ldr d3, [x10, :lo12:.LCPI18_0]
-; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI18_3]
-; CHECK-NEXT: and z2.d, z5.d, z2.d
-; CHECK-NEXT: movprfx z5, z0
-; CHECK-NEXT: lsr z5.s, p0/m, z5.s, z3.s
-; CHECK-NEXT: lslr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT: and z1.d, z1.d, z4.d
-; CHECK-NEXT: orr z2.d, z2.d, z5.d
+; CHECK-NEXT: lsr z5.s, p0/m, z5.s, z2.s
+; CHECK-NEXT: and z5.d, z5.d, z3.d
+; CHECK-NEXT: and z3.d, z0.d, z3.d
+; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: movprfx z1, z3
+; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: orr z2.d, z5.d, z4.d
; CHECK-NEXT: orr z0.d, z0.d, z1.d
; CHECK-NEXT: orr z0.d, z0.d, z2.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
@@ -320,25 +317,24 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) #0 {
define <4 x i32> @bswap_v4i32(<4 x i32> %op) #0 {
; CHECK-LABEL: bswap_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI19_1
-; CHECK-NEXT: adrp x9, .LCPI19_2
-; CHECK-NEXT: adrp x10, .LCPI19_0
+; CHECK-NEXT: adrp x8, .LCPI19_0
+; CHECK-NEXT: adrp x10, .LCPI19_2
+; CHECK-NEXT: adrp x9, .LCPI19_1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_1]
-; CHECK-NEXT: adrp x8, .LCPI19_3
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI19_2]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT: movprfx z4, z0
+; CHECK-NEXT: lsr z4.s, p0/m, z4.s, z1.s
+; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI19_2]
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI19_1]
; CHECK-NEXT: movprfx z5, z0
-; CHECK-NEXT: lsr z5.s, p0/m, z5.s, z1.s
-; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI19_0]
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI19_3]
-; CHECK-NEXT: and z2.d, z5.d, z2.d
-; CHECK-NEXT: movprfx z5, z0
-; CHECK-NEXT: lsr z5.s, p0/m, z5.s, z3.s
-; CHECK-NEXT: lslr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT: and z1.d, z1.d, z4.d
-; CHECK-NEXT: orr z2.d, z2.d, z5.d
+; CHECK-NEXT: lsr z5.s, p0/m, z5.s, z2.s
+; CHECK-NEXT: and z5.d, z5.d, z3.d
+; CHECK-NEXT: and z3.d, z0.d, z3.d
+; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: movprfx z1, z3
+; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: orr z2.d, z5.d, z4.d
; CHECK-NEXT: orr z0.d, z0.d, z1.d
; CHECK-NEXT: orr z0.d, z0.d, z2.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
@@ -352,35 +348,33 @@ define void @bswap_v8i32(<8 x i32>* %a) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI20_0
; CHECK-NEXT: adrp x9, .LCPI20_1
-; CHECK-NEXT: ldp q3, q2, [x0]
+; CHECK-NEXT: ldp q4, q1, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI20_0]
; CHECK-NEXT: adrp x8, .LCPI20_2
-; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI20_1]
-; CHECK-NEXT: movprfx z5, z2
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI20_1]
+; CHECK-NEXT: movprfx z5, z1
; CHECK-NEXT: lsr z5.s, p0/m, z5.s, z0.s
-; CHECK-NEXT: movprfx z6, z2
-; CHECK-NEXT: lsr z6.s, p0/m, z6.s, z1.s
-; CHECK-NEXT: movprfx z7, z2
+; CHECK-NEXT: movprfx z6, z1
+; CHECK-NEXT: lsr z6.s, p0/m, z6.s, z2.s
+; CHECK-NEXT: movprfx z7, z1
; CHECK-NEXT: lsl z7.s, p0/m, z7.s, z0.s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_2]
-; CHECK-NEXT: adrp x8, .LCPI20_3
-; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z1.s
-; CHECK-NEXT: and z6.d, z6.d, z4.d
-; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI20_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2]
+; CHECK-NEXT: movprfx z16, z4
+; CHECK-NEXT: lsr z16.s, p0/m, z16.s, z2.s
+; CHECK-NEXT: and z1.d, z1.d, z3.d
+; CHECK-NEXT: and z6.d, z6.d, z3.d
+; CHECK-NEXT: and z16.d, z16.d, z3.d
+; CHECK-NEXT: and z3.d, z4.d, z3.d
; CHECK-NEXT: orr z5.d, z6.d, z5.d
-; CHECK-NEXT: movprfx z6, z3
-; CHECK-NEXT: lsr z6.s, p0/m, z6.s, z1.s
-; CHECK-NEXT: and z4.d, z6.d, z4.d
-; CHECK-NEXT: movprfx z6, z3
+; CHECK-NEXT: movprfx z6, z4
; CHECK-NEXT: lsr z6.s, p0/m, z6.s, z0.s
-; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT: lslr z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: and z2.d, z2.d, z16.d
-; CHECK-NEXT: and z1.d, z1.d, z16.d
-; CHECK-NEXT: orr z3.d, z4.d, z6.d
-; CHECK-NEXT: orr z0.d, z0.d, z1.d
-; CHECK-NEXT: orr z1.d, z7.d, z2.d
+; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z4.s
+; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: lslr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: orr z3.d, z16.d, z6.d
+; CHECK-NEXT: orr z0.d, z0.d, z2.d
+; CHECK-NEXT: orr z1.d, z7.d, z1.d
; CHECK-NEXT: orr z0.d, z0.d, z3.d
; CHECK-NEXT: orr z1.d, z1.d, z5.d
; CHECK-NEXT: stp q0, q1, [x0]
@@ -397,48 +391,43 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) #0 {
; CHECK-NEXT: mov w8, #56
; CHECK-NEXT: mov w9, #40
; CHECK-NEXT: mov w10, #65280
+; CHECK-NEXT: mov w11, #24
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: mov w8, #24
+; CHECK-NEXT: mov w8, #16711680
; CHECK-NEXT: fmov d2, x9
-; CHECK-NEXT: mov w9, #16711680
+; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: fmov d3, x10
-; CHECK-NEXT: mov w10, #8
-; CHECK-NEXT: fmov d4, x8
+; CHECK-NEXT: movprfx z7, z0
+; CHECK-NEXT: lsr z7.d, p0/m, z7.d, z1.d
+; CHECK-NEXT: fmov d5, x8
; CHECK-NEXT: mov w8, #-16777216
-; CHECK-NEXT: fmov d5, x9
-; CHECK-NEXT: mov x9, #1095216660480
; CHECK-NEXT: movprfx z16, z0
; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z2.d
-; CHECK-NEXT: and z3.d, z16.d, z3.d
-; CHECK-NEXT: fmov d7, x8
-; CHECK-NEXT: mov x8, #280375465082880
+; CHECK-NEXT: fmov d4, x11
+; CHECK-NEXT: fmov d6, x9
+; CHECK-NEXT: and z16.d, z16.d, z3.d
+; CHECK-NEXT: fmov d17, x8
+; CHECK-NEXT: orr z7.d, z16.d, z7.d
; CHECK-NEXT: movprfx z16, z0
; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z4.d
-; CHECK-NEXT: fmov d6, x10
-; CHECK-NEXT: and z5.d, z16.d, z5.d
-; CHECK-NEXT: movprfx z16, z0
-; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z6.d
-; CHECK-NEXT: fmov d18, x8
-; CHECK-NEXT: mov x8, #71776119061217280
-; CHECK-NEXT: and z7.d, z16.d, z7.d
-; CHECK-NEXT: fmov d17, x9
-; CHECK-NEXT: orr z5.d, z7.d, z5.d
-; CHECK-NEXT: movprfx z16, z0
-; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z1.d
-; CHECK-NEXT: fmov d7, x8
-; CHECK-NEXT: lslr z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT: lslr z4.d, p0/m, z4.d, z0.d
-; CHECK-NEXT: lslr z2.d, p0/m, z2.d, z0.d
-; CHECK-NEXT: and z6.d, z6.d, z17.d
-; CHECK-NEXT: and z4.d, z4.d, z18.d
+; CHECK-NEXT: movprfx z18, z0
+; CHECK-NEXT: lsr z18.d, p0/m, z18.d, z6.d
+; CHECK-NEXT: and z16.d, z16.d, z5.d
+; CHECK-NEXT: and z5.d, z0.d, z5.d
+; CHECK-NEXT: and z18.d, z18.d, z17.d
+; CHECK-NEXT: and z17.d, z0.d, z17.d
+; CHECK-NEXT: lslr z6.d, p0/m, z6.d, z17.d
+; CHECK-NEXT: lslr z4.d, p0/m, z4.d, z5.d
+; CHECK-NEXT: and z3.d, z0.d, z3.d
; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: and z1.d, z2.d, z7.d
-; CHECK-NEXT: orr z3.d, z3.d, z16.d
+; CHECK-NEXT: orr z16.d, z18.d, z16.d
+; CHECK-NEXT: movprfx z1, z3
+; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z2.d
; CHECK-NEXT: orr z2.d, z4.d, z6.d
; CHECK-NEXT: orr z0.d, z0.d, z1.d
-; CHECK-NEXT: orr z1.d, z5.d, z3.d
+; CHECK-NEXT: orr z1.d, z16.d, z7.d
; CHECK-NEXT: orr z0.d, z0.d, z2.d
; CHECK-NEXT: orr z0.d, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
@@ -464,37 +453,32 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) #0 {
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3]
; CHECK-NEXT: adrp x8, .LCPI22_6
; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI22_4]
-; CHECK-NEXT: adrp x9, .LCPI22_7
+; CHECK-NEXT: movprfx z7, z0
+; CHECK-NEXT: lsr z7.d, p0/m, z7.d, z1.d
; CHECK-NEXT: movprfx z16, z0
; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z2.d
-; CHECK-NEXT: and z3.d, z16.d, z3.d
-; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI22_6]
-; CHECK-NEXT: adrp x8, .LCPI22_8
-; CHECK-NEXT: movprfx z16, z0
-; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z4.d
; CHECK-NEXT: ldr q6, [x10, :lo12:.LCPI22_5]
-; CHECK-NEXT: and z5.d, z16.d, z5.d
-; CHECK-NEXT: movprfx z16, z0
-; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z6.d
-; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI22_8]
-; CHECK-NEXT: adrp x8, .LCPI22_9
-; CHECK-NEXT: and z7.d, z16.d, z7.d
-; CHECK-NEXT: ldr q17, [x9, :lo12:.LCPI22_7]
-; CHECK-NEXT: orr z5.d, z7.d, z5.d
+; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI22_6]
+; CHECK-NEXT: and z16.d, z16.d, z3.d
+; CHECK-NEXT: orr z7.d, z16.d, z7.d
; CHECK-NEXT: movprfx z16, z0
-; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z1.d
-; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI22_9]
-; CHECK-NEXT: lslr z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT: lslr z4.d, p0/m, z4.d, z0.d
-; CHECK-NEXT: lslr z2.d, p0/m, z2.d, z0.d
-; CHECK-NEXT: and z6.d, z6.d, z17.d
-; CHECK-NEXT: and z4.d, z4.d, z18.d
+; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z4.d
+; CHECK-NEXT: movprfx z18, z0
+; CHECK-NEXT: lsr z18.d, p0/m, z18.d, z6.d
+; CHECK-NEXT: and z16.d, z16.d, z5.d
+; CHECK-NEXT: and z18.d, z18.d, z17.d
+; CHECK-NEXT: and z17.d, z0.d, z17.d
+; CHECK-NEXT: and z5.d, z0.d, z5.d
+; CHECK-NEXT: lslr z6.d, p0/m, z6.d, z17.d
+; CHECK-NEXT: lslr z4.d, p0/m, z4.d, z5.d
+; CHECK-NEXT: and z3.d, z0.d, z3.d
; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: and z1.d, z2.d, z7.d
-; CHECK-NEXT: orr z3.d, z3.d, z16.d
+; CHECK-NEXT: orr z16.d, z18.d, z16.d
+; CHECK-NEXT: movprfx z1, z3
+; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z2.d
; CHECK-NEXT: orr z2.d, z4.d, z6.d
; CHECK-NEXT: orr z0.d, z0.d, z1.d
-; CHECK-NEXT: orr z1.d, z5.d, z3.d
+; CHECK-NEXT: orr z1.d, z16.d, z7.d
; CHECK-NEXT: orr z0.d, z0.d, z2.d
; CHECK-NEXT: orr z0.d, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
@@ -508,79 +492,72 @@ define void @bswap_v4i64(<4 x i64>* %a) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI23_0
; CHECK-NEXT: adrp x9, .LCPI23_1
-; CHECK-NEXT: adrp x10, .LCPI23_2
+; CHECK-NEXT: adrp x10, .LCPI23_3
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_0]
-; CHECK-NEXT: adrp x8, .LCPI23_4
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_1]
-; CHECK-NEXT: adrp x9, .LCPI23_3
-; CHECK-NEXT: ldr q4, [x10, :lo12:.LCPI23_2]
-; CHECK-NEXT: adrp x10, .LCPI23_5
-; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI23_4]
-; CHECK-NEXT: adrp x8, .LCPI23_6
-; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI23_3]
-; CHECK-NEXT: adrp x9, .LCPI23_7
-; CHECK-NEXT: movprfx z6, z1
-; CHECK-NEXT: lsr z6.d, p0/m, z6.d, z2.d
-; CHECK-NEXT: movprfx z17, z1
-; CHECK-NEXT: lsr z17.d, p0/m, z17.d, z3.d
-; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI23_6]
-; CHECK-NEXT: adrp x8, .LCPI23_8
-; CHECK-NEXT: and z6.d, z6.d, z4.d
-; CHECK-NEXT: ldr q16, [x10, :lo12:.LCPI23_5]
-; CHECK-NEXT: orr z6.d, z6.d, z17.d
-; CHECK-NEXT: ldr q17, [x9, :lo12:.LCPI23_7]
-; CHECK-NEXT: ldr q21, [x8, :lo12:.LCPI23_8]
-; CHECK-NEXT: adrp x8, .LCPI23_9
-; CHECK-NEXT: movprfx z19, z1
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI23_0]
+; CHECK-NEXT: adrp x8, .LCPI23_2
+; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI23_1]
+; CHECK-NEXT: adrp x9, .LCPI23_4
+; CHECK-NEXT: ldr q5, [x10, :lo12:.LCPI23_3]
+; CHECK-NEXT: adrp x10, .LCPI23_6
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_2]
+; CHECK-NEXT: adrp x8, .LCPI23_5
+; CHECK-NEXT: ldr q6, [x9, :lo12:.LCPI23_4]
+; CHECK-NEXT: movprfx z16, z2
+; CHECK-NEXT: lsr z16.d, p0/m, z16.d, z3.d
+; CHECK-NEXT: ldr q17, [x10, :lo12:.LCPI23_6]
+; CHECK-NEXT: movprfx z18, z2
+; CHECK-NEXT: lsr z18.d, p0/m, z18.d, z0.d
+; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI23_5]
+; CHECK-NEXT: movprfx z19, z2
; CHECK-NEXT: lsr z19.d, p0/m, z19.d, z5.d
-; CHECK-NEXT: movprfx z20, z1
-; CHECK-NEXT: lsr z20.d, p0/m, z20.d, z16.d
-; CHECK-NEXT: and z19.d, z19.d, z7.d
-; CHECK-NEXT: and z20.d, z20.d, z18.d
-; CHECK-NEXT: orr z19.d, z20.d, z19.d
-; CHECK-NEXT: movprfx z20, z1
-; CHECK-NEXT: lsl z20.d, p0/m, z20.d, z16.d
-; CHECK-NEXT: movprfx z22, z1
-; CHECK-NEXT: lsl z22.d, p0/m, z22.d, z5.d
-; CHECK-NEXT: ldr q23, [x8, :lo12:.LCPI23_9]
+; CHECK-NEXT: movprfx z20, z2
+; CHECK-NEXT: lsr z20.d, p0/m, z20.d, z7.d
+; CHECK-NEXT: and z16.d, z16.d, z4.d
+; CHECK-NEXT: and z19.d, z19.d, z6.d
; CHECK-NEXT: and z20.d, z20.d, z17.d
-; CHECK-NEXT: and z22.d, z22.d, z21.d
-; CHECK-NEXT: orr z6.d, z19.d, z6.d
-; CHECK-NEXT: orr z19.d, z22.d, z20.d
+; CHECK-NEXT: orr z16.d, z16.d, z18.d
+; CHECK-NEXT: orr z18.d, z20.d, z19.d
+; CHECK-NEXT: and z19.d, z2.d, z17.d
+; CHECK-NEXT: and z20.d, z2.d, z6.d
+; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z7.d
+; CHECK-NEXT: lsl z20.d, p0/m, z20.d, z5.d
+; CHECK-NEXT: orr z16.d, z18.d, z16.d
+; CHECK-NEXT: orr z18.d, z20.d, z19.d
+; CHECK-NEXT: movprfx z19, z2
+; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z0.d
+; CHECK-NEXT: and z2.d, z2.d, z4.d
+; CHECK-NEXT: movprfx z20, z1
+; CHECK-NEXT: lsr z20.d, p0/m, z20.d, z3.d
+; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: movprfx z21, z1
+; CHECK-NEXT: lsr z21.d, p0/m, z21.d, z0.d
+; CHECK-NEXT: and z20.d, z20.d, z4.d
+; CHECK-NEXT: orr z2.d, z19.d, z2.d
+; CHECK-NEXT: orr z19.d, z20.d, z21.d
; CHECK-NEXT: movprfx z20, z1
-; CHECK-NEXT: lsl z20.d, p0/m, z20.d, z3.d
-; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: movprfx z22, z0
-; CHECK-NEXT: lsr z22.d, p0/m, z22.d, z2.d
-; CHECK-NEXT: and z1.d, z1.d, z23.d
-; CHECK-NEXT: and z4.d, z22.d, z4.d
-; CHECK-NEXT: movprfx z22, z0
-; CHECK-NEXT: lsr z22.d, p0/m, z22.d, z3.d
-; CHECK-NEXT: orr z1.d, z20.d, z1.d
-; CHECK-NEXT: orr z4.d, z4.d, z22.d
-; CHECK-NEXT: movprfx z20, z0
; CHECK-NEXT: lsr z20.d, p0/m, z20.d, z5.d
-; CHECK-NEXT: movprfx z22, z0
-; CHECK-NEXT: lsr z22.d, p0/m, z22.d, z16.d
-; CHECK-NEXT: lslr z16.d, p0/m, z16.d, z0.d
-; CHECK-NEXT: lslr z5.d, p0/m, z5.d, z0.d
-; CHECK-NEXT: lslr z2.d, p0/m, z2.d, z0.d
-; CHECK-NEXT: and z7.d, z20.d, z7.d
-; CHECK-NEXT: and z18.d, z22.d, z18.d
-; CHECK-NEXT: and z16.d, z16.d, z17.d
-; CHECK-NEXT: and z5.d, z5.d, z21.d
-; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z3.d
-; CHECK-NEXT: and z2.d, z2.d, z23.d
-; CHECK-NEXT: orr z7.d, z18.d, z7.d
-; CHECK-NEXT: orr z3.d, z5.d, z16.d
-; CHECK-NEXT: orr z0.d, z0.d, z2.d
-; CHECK-NEXT: orr z2.d, z7.d, z4.d
+; CHECK-NEXT: movprfx z21, z1
+; CHECK-NEXT: lsr z21.d, p0/m, z21.d, z7.d
+; CHECK-NEXT: and z20.d, z20.d, z6.d
+; CHECK-NEXT: and z21.d, z21.d, z17.d
+; CHECK-NEXT: and z17.d, z1.d, z17.d
+; CHECK-NEXT: and z6.d, z1.d, z6.d
+; CHECK-NEXT: lslr z7.d, p0/m, z7.d, z17.d
+; CHECK-NEXT: lslr z5.d, p0/m, z5.d, z6.d
+; CHECK-NEXT: lslr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: orr z20.d, z21.d, z20.d
+; CHECK-NEXT: and z4.d, z1.d, z4.d
+; CHECK-NEXT: movprfx z1, z4
+; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; CHECK-NEXT: orr z3.d, z5.d, z7.d
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: orr z1.d, z20.d, z19.d
; CHECK-NEXT: orr z0.d, z0.d, z3.d
-; CHECK-NEXT: orr z1.d, z1.d, z19.d
-; CHECK-NEXT: orr z0.d, z0.d, z2.d
-; CHECK-NEXT: orr z1.d, z1.d, z6.d
+; CHECK-NEXT: orr z2.d, z2.d, z18.d
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: orr z1.d, z2.d, z16.d
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
index e8673b91df8cc..010f22df64fdc 100644
--- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
@@ -56,11 +56,11 @@ define i32 @load_i32_by_i8_bswap(i32* %arg) {
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0]
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
@@ -230,22 +230,21 @@ define i32 @load_i32_by_i16_i8(i32* %arg) {
define i64 @load_i64_by_i8_bswap(i64* %arg) {
; CHECK-LABEL: load_i64_by_i8_bswap:
; CHECK: @ %bb.0:
-; CHECK-NEXT: push {r11, lr}
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: mov r12, #65280
; CHECK-NEXT: ldr r0, [r0, #4]
-; CHECK-NEXT: mov lr, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r3, r12, r0, lsr #8
-; CHECK-NEXT: and r2, lr, r0, lsl #8
; CHECK-NEXT: orr r3, r3, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: and r2, r12, r1, lsr #8
; CHECK-NEXT: orr r0, r0, r3
-; CHECK-NEXT: and r3, lr, r1, lsl #8
+; CHECK-NEXT: and r3, r1, #65280
; CHECK-NEXT: orr r2, r2, r1, lsr #24
-; CHECK-NEXT: orr r1, r3, r1, lsl #24
+; CHECK-NEXT: lsl r1, r1, #24
+; CHECK-NEXT: orr r1, r1, r3, lsl #8
; CHECK-NEXT: orr r1, r1, r2
-; CHECK-NEXT: pop {r11, lr}
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -389,11 +388,11 @@ define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0, #1]
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
@@ -447,11 +446,11 @@ define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0, #-4]
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
@@ -603,11 +602,11 @@ define i32 @load_i32_by_bswap_i16(i32* %arg) {
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0]
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
@@ -684,12 +683,12 @@ define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r0, r1
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
; CHECK-NEXT: ldr r0, [r0, #12]
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
@@ -750,12 +749,12 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r1, r0
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
; CHECK-NEXT: ldr r0, [r0, #13]
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll
index 1a4153f8355fa..1720f41895639 100644
--- a/llvm/test/CodeGen/ARM/load-combine.ll
+++ b/llvm/test/CodeGen/ARM/load-combine.ll
@@ -123,11 +123,11 @@ define i32 @load_i32_by_i8_bswap(i32* %arg) {
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0]
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
@@ -243,22 +243,21 @@ define i64 @load_i64_by_i8(i64* %arg) {
define i64 @load_i64_by_i8_bswap(i64* %arg) {
; CHECK-LABEL: load_i64_by_i8_bswap:
; CHECK: @ %bb.0:
-; CHECK-NEXT: push {r11, lr}
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: mov r12, #65280
; CHECK-NEXT: ldr r0, [r0, #4]
-; CHECK-NEXT: mov lr, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r3, r12, r0, lsr #8
-; CHECK-NEXT: and r2, lr, r0, lsl #8
; CHECK-NEXT: orr r3, r3, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: and r2, r12, r1, lsr #8
; CHECK-NEXT: orr r0, r0, r3
-; CHECK-NEXT: and r3, lr, r1, lsl #8
+; CHECK-NEXT: and r3, r1, #65280
; CHECK-NEXT: orr r2, r2, r1, lsr #24
-; CHECK-NEXT: orr r1, r3, r1, lsl #24
+; CHECK-NEXT: lsl r1, r1, #24
+; CHECK-NEXT: orr r1, r1, r3, lsl #8
; CHECK-NEXT: orr r1, r1, r2
-; CHECK-NEXT: pop {r11, lr}
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -425,11 +424,11 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0, #1]
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
@@ -482,11 +481,11 @@ define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0, #-4]
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
@@ -541,11 +540,11 @@ define i32 @load_i32_by_bswap_i16(i32* %arg) {
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0]
; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r2, r0, #65280
; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: and r2, r2, r0, lsl #8
; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r2, lsl #8
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: mov pc, lr
;
diff --git a/llvm/test/CodeGen/Mips/bswap.ll b/llvm/test/CodeGen/Mips/bswap.ll
index 0e8fd2ae4ffef..ace6c3d6021d2 100644
--- a/llvm/test/CodeGen/Mips/bswap.ll
+++ b/llvm/test/CodeGen/Mips/bswap.ll
@@ -23,16 +23,15 @@ entry:
; MIPS16-LABEL: bswap32:
; MIPS16-DAG: srl $[[R0:[0-9]+]], $4, 8
+; MIPS16-DAG: li $[[R4:[0-9]+]], 65280
+; MIPS16-DAG: and $[[R0]], $[[R4]]
; MIPS16-DAG: srl $[[R1:[0-9]+]], $4, 24
-; MIPS16-DAG: sll $[[R2:[0-9]+]], $4, 8
+; MIPS16-DAG: or $[[R1]], $[[R0]]
+; MIPS16-DAG: and $[[R4]], $4
+; MIPS16-DAG: sll $[[R2:[0-9]+]], $[[R4]], 8
; MIPS16-DAG: sll $[[R3:[0-9]+]], $4, 24
-; MIPS16-DAG: li $[[R4:[0-9]+]], 65280
-; MIPS16-DAG: and $[[R4]], $[[R0]]
-; MIPS16-DAG: or $[[R1]], $[[R4]]
-; MIPS16-DAG: lw $[[R7:[0-9]+]], $CPI
-; MIPS16-DAG: and $[[R7]], $[[R2]]
-; MIPS16-DAG: or $[[R3]], $[[R7]]
-; MIPS16-DAG: or $[[R3]], $[[R1]]
+; MIPS16-DAG: or $[[R3]], $[[R2]]
+; MIPS16-DAG: or $[[R3]], $[[R1]]
%or.3 = call i32 @llvm.bswap.i32(i32 %x)
ret i32 %or.3
@@ -58,23 +57,22 @@ entry:
; MIPS16-LABEL: bswap64:
; MIPS16-DAG: srl $[[R0:[0-9]+]], $5, 8
-; MIPS16-DAG: srl $[[R1:[0-9]+]], $5, 24
-; MIPS16-DAG: sll $[[R2:[0-9]+]], $5, 8
-; MIPS16-DAG: sll $[[R3:[0-9]+]], $5, 24
; MIPS16-DAG: li $[[R4:[0-9]+]], 65280
; MIPS16-DAG: and $[[R0]], $[[R4]]
+; MIPS16-DAG: srl $[[R1:[0-9]+]], $5, 24
; MIPS16-DAG: or $[[R1]], $[[R0]]
-; MIPS16-DAG: lw $[[R7:[0-9]+]], 1f
-; MIPS16-DAG: and $[[R2]], $[[R7]]
-; MIPS16-DAG: or $[[R3]], $[[R2]]
-; MIPS16-DAG: or $[[R3]], $[[R1]]
+; MIPS16-DAG: sll $[[R3:[0-9]+]], $5, 24
+; MIPS16-DAG: and $5, $[[R4]]
+; MIPS16-DAG: sll $[[R2:[0-9]+]], $5, 8
+; MIPS16-DAG: or $[[R0]], $[[R3]]
+; MIPS16-DAG: or $[[R0]], $[[R1]]
; MIPS16-DAG: srl $[[R0:[0-9]+]], $4, 8
-; MIPS16-DAG: srl $[[R1:[0-9]+]], $4, 24
-; MIPS16-DAG: sll $[[R2:[0-9]+]], $4, 8
-; MIPS16-DAG: sll $[[R3:[0-9]+]], $4, 24
; MIPS16-DAG: and $[[R0]], $[[R4]]
+; MIPS16-DAG: srl $[[R1:[0-9]+]], $4, 24
; MIPS16-DAG: or $[[R1]], $[[R0]]
-; MIPS16-DAG: and $[[R2]], $[[R7]]
+; MIPS16-DAG: and $[[R4]], $4
+; MIPS16-DAG: sll $[[R2:[0-9]+]], $[[R4]], 8
+; MIPS16-DAG: sll $[[R3:[0-9]+]], $4, 24
; MIPS16-DAG: or $[[R3]], $[[R2]]
; MIPS16-DAG: or $[[R3]], $[[R1]]
diff --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
index bcf367a2b06cc..3db0ed8c95895 100644
--- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
@@ -59,11 +59,10 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
; RV32I-NEXT: lui a2, 16
; RV32I-NEXT: addi a2, a2, -256
; RV32I-NEXT: and a1, a1, a2
-; RV32I-NEXT: srli a2, a0, 24
-; RV32I-NEXT: or a1, a1, a2
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: lui a3, 4080
-; RV32I-NEXT: and a2, a2, a3
+; RV32I-NEXT: srli a3, a0, 24
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: and a2, a0, a2
+; RV32I-NEXT: slli a2, a2, 8
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
@@ -75,11 +74,10 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
; RV64I-NEXT: lui a2, 16
; RV64I-NEXT: addiw a2, a2, -256
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: srliw a2, a0, 24
-; RV64I-NEXT: or a1, a1, a2
-; RV64I-NEXT: slli a2, a0, 8
-; RV64I-NEXT: lui a3, 4080
-; RV64I-NEXT: and a2, a2, a3
+; RV64I-NEXT: srliw a3, a0, 24
+; RV64I-NEXT: or a1, a1, a3
+; RV64I-NEXT: and a2, a0, a2
+; RV64I-NEXT: slli a2, a2, 8
; RV64I-NEXT: slliw a0, a0, 24
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
@@ -108,18 +106,17 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
; RV32I-NEXT: and a2, a2, a3
; RV32I-NEXT: srli a4, a1, 24
; RV32I-NEXT: or a2, a2, a4
-; RV32I-NEXT: slli a4, a1, 8
-; RV32I-NEXT: lui a5, 4080
-; RV32I-NEXT: and a4, a4, a5
+; RV32I-NEXT: and a4, a1, a3
+; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: or a2, a1, a2
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: and a1, a1, a3
-; RV32I-NEXT: srli a3, a0, 24
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: and a3, a3, a5
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: and a3, a0, a3
+; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a1, a0, a1
@@ -128,34 +125,31 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
;
; RV64I-LABEL: test_bswap_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a1, a0, 24
-; RV64I-NEXT: li a2, 255
-; RV64I-NEXT: slli a3, a2, 40
-; RV64I-NEXT: and a1, a1, a3
-; RV64I-NEXT: srliw a3, a0, 24
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: slli a3, a0, 40
-; RV64I-NEXT: slli a2, a2, 48
-; RV64I-NEXT: and a2, a3, a2
-; RV64I-NEXT: slli a3, a0, 56
-; RV64I-NEXT: or a2, a3, a2
-; RV64I-NEXT: or a1, a2, a1
-; RV64I-NEXT: srli a2, a0, 40
-; RV64I-NEXT: lui a3, 16
-; RV64I-NEXT: addiw a3, a3, -256
-; RV64I-NEXT: and a2, a2, a3
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: lui a2, 16
+; RV64I-NEXT: addiw a2, a2, -256
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: srli a3, a0, 56
-; RV64I-NEXT: or a2, a2, a3
+; RV64I-NEXT: or a1, a1, a3
; RV64I-NEXT: srli a3, a0, 24
; RV64I-NEXT: lui a4, 4080
; RV64I-NEXT: and a3, a3, a4
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: srliw a0, a0, 24
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: srli a5, a0, 8
+; RV64I-NEXT: srliw a5, a5, 24
+; RV64I-NEXT: slli a5, a5, 24
+; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a1, a3, a1
+; RV64I-NEXT: and a3, a0, a4
+; RV64I-NEXT: slli a3, a3, 24
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: and a2, a0, a2
+; RV64I-NEXT: slli a2, a2, 40
+; RV64I-NEXT: slli a0, a0, 56
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
; RV32ZB-LABEL: test_bswap_i64:
@@ -402,11 +396,10 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; RV32I-NEXT: lui a2, 16
; RV32I-NEXT: addi a2, a2, -256
; RV32I-NEXT: and a1, a1, a2
-; RV32I-NEXT: srli a2, a0, 24
-; RV32I-NEXT: or a1, a1, a2
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: lui a3, 4080
-; RV32I-NEXT: and a2, a2, a3
+; RV32I-NEXT: srli a3, a0, 24
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: and a2, a0, a2
+; RV32I-NEXT: slli a2, a2, 8
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
@@ -439,11 +432,10 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; RV64I-NEXT: lui a2, 16
; RV64I-NEXT: addiw a2, a2, -256
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: srliw a2, a0, 24
-; RV64I-NEXT: or a1, a1, a2
-; RV64I-NEXT: slli a2, a0, 8
-; RV64I-NEXT: lui a3, 4080
-; RV64I-NEXT: and a2, a2, a3
+; RV64I-NEXT: srliw a3, a0, 24
+; RV64I-NEXT: or a1, a1, a3
+; RV64I-NEXT: and a2, a0, a2
+; RV64I-NEXT: slli a2, a2, 8
; RV64I-NEXT: slliw a0, a0, 24
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
@@ -550,9 +542,8 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; RV32I-NEXT: and a2, a2, a3
; RV32I-NEXT: srli a4, a1, 24
; RV32I-NEXT: or a2, a2, a4
-; RV32I-NEXT: slli a4, a1, 8
-; RV32I-NEXT: lui a5, 4080
-; RV32I-NEXT: and a4, a4, a5
+; RV32I-NEXT: and a4, a1, a3
+; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: or a1, a1, a2
@@ -564,25 +555,25 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; RV32I-NEXT: slli a1, a1, 4
; RV32I-NEXT: or a1, a2, a1
; RV32I-NEXT: srli a2, a1, 2
-; RV32I-NEXT: lui a6, 209715
-; RV32I-NEXT: addi a6, a6, 819
-; RV32I-NEXT: and a2, a2, a6
-; RV32I-NEXT: and a1, a1, a6
+; RV32I-NEXT: lui a5, 209715
+; RV32I-NEXT: addi a5, a5, 819
+; RV32I-NEXT: and a2, a2, a5
+; RV32I-NEXT: and a1, a1, a5
; RV32I-NEXT: slli a1, a1, 2
; RV32I-NEXT: or a1, a2, a1
; RV32I-NEXT: srli a2, a1, 1
-; RV32I-NEXT: lui a7, 349525
-; RV32I-NEXT: addi a7, a7, 1365
-; RV32I-NEXT: and a2, a2, a7
-; RV32I-NEXT: and a1, a1, a7
+; RV32I-NEXT: lui a6, 349525
+; RV32I-NEXT: addi a6, a6, 1365
+; RV32I-NEXT: and a2, a2, a6
+; RV32I-NEXT: and a1, a1, a6
; RV32I-NEXT: slli a1, a1, 1
; RV32I-NEXT: or a2, a2, a1
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: and a1, a1, a3
-; RV32I-NEXT: srli a3, a0, 24
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: and a3, a3, a5
+; RV32I-NEXT: srli a7, a0, 24
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: and a3, a0, a3
+; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a0, a0, a1
@@ -592,13 +583,13 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; RV32I-NEXT: slli a0, a0, 4
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
-; RV32I-NEXT: and a1, a1, a6
-; RV32I-NEXT: and a0, a0, a6
+; RV32I-NEXT: and a1, a1, a5
+; RV32I-NEXT: and a0, a0, a5
; RV32I-NEXT: slli a0, a0, 2
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 1
-; RV32I-NEXT: and a1, a1, a7
-; RV32I-NEXT: and a0, a0, a7
+; RV32I-NEXT: and a1, a1, a6
+; RV32I-NEXT: and a0, a0, a6
; RV32I-NEXT: slli a0, a0, 1
; RV32I-NEXT: or a1, a1, a0
; RV32I-NEXT: mv a0, a2
@@ -606,39 +597,36 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
;
; RV64I-LABEL: test_bitreverse_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a1, a0, 24
-; RV64I-NEXT: li a2, 255
-; RV64I-NEXT: slli a3, a2, 40
-; RV64I-NEXT: and a1, a1, a3
-; RV64I-NEXT: srliw a3, a0, 24
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: slli a3, a0, 40
-; RV64I-NEXT: slli a2, a2, 48
-; RV64I-NEXT: and a2, a3, a2
-; RV64I-NEXT: slli a3, a0, 56
-; RV64I-NEXT: or a2, a3, a2
-; RV64I-NEXT: or a1, a2, a1
-; RV64I-NEXT: srli a2, a0, 40
-; RV64I-NEXT: lui a3, 16
-; RV64I-NEXT: addiw a3, a3, -256
-; RV64I-NEXT: and a2, a2, a3
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: lui a2, 16
+; RV64I-NEXT: addiw a2, a2, -256
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: srli a3, a0, 56
-; RV64I-NEXT: or a2, a2, a3
+; RV64I-NEXT: or a1, a1, a3
; RV64I-NEXT: srli a3, a0, 24
; RV64I-NEXT: lui a4, 4080
; RV64I-NEXT: and a3, a3, a4
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: srliw a0, a0, 24
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: lui a3, %hi(.LCPI6_0)
-; RV64I-NEXT: ld a3, %lo(.LCPI6_0)(a3)
+; RV64I-NEXT: srli a5, a0, 8
+; RV64I-NEXT: srliw a5, a5, 24
+; RV64I-NEXT: slli a5, a5, 24
+; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a1, a3, a1
+; RV64I-NEXT: and a3, a0, a4
+; RV64I-NEXT: slli a3, a3, 24
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: and a2, a0, a2
+; RV64I-NEXT: slli a2, a2, 40
+; RV64I-NEXT: slli a0, a0, 56
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: lui a2, %hi(.LCPI6_0)
+; RV64I-NEXT: ld a2, %lo(.LCPI6_0)(a2)
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: srli a1, a0, 4
-; RV64I-NEXT: and a1, a1, a3
-; RV64I-NEXT: and a0, a0, a3
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: lui a2, %hi(.LCPI6_1)
; RV64I-NEXT: ld a2, %lo(.LCPI6_1)(a2)
; RV64I-NEXT: slli a0, a0, 4
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index ad3548ec78f67..6d10db62d17b3 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -769,11 +769,10 @@ define i32 @bswap_i32(i32 %a) nounwind {
; RV32I-NEXT: lui a2, 16
; RV32I-NEXT: addi a2, a2, -256
; RV32I-NEXT: and a1, a1, a2
-; RV32I-NEXT: srli a2, a0, 24
-; RV32I-NEXT: or a1, a1, a2
-; RV32I-NEXT: slli a2, a0, 8
-; RV32I-NEXT: lui a3, 4080
-; RV32I-NEXT: and a2, a2, a3
+; RV32I-NEXT: srli a3, a0, 24
+; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: and a2, a0, a2
+; RV32I-NEXT: slli a2, a2, 8
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: or a0, a0, a1
@@ -798,18 +797,17 @@ define i64 @bswap_i64(i64 %a) {
; RV32I-NEXT: and a2, a2, a3
; RV32I-NEXT: srli a4, a1, 24
; RV32I-NEXT: or a2, a2, a4
-; RV32I-NEXT: slli a4, a1, 8
-; RV32I-NEXT: lui a5, 4080
-; RV32I-NEXT: and a4, a4, a5
+; RV32I-NEXT: and a4, a1, a3
+; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a1, a1, 24
; RV32I-NEXT: or a1, a1, a4
; RV32I-NEXT: or a2, a1, a2
; RV32I-NEXT: srli a1, a0, 8
; RV32I-NEXT: and a1, a1, a3
-; RV32I-NEXT: srli a3, a0, 24
-; RV32I-NEXT: or a1, a1, a3
-; RV32I-NEXT: slli a3, a0, 8
-; RV32I-NEXT: and a3, a3, a5
+; RV32I-NEXT: srli a4, a0, 24
+; RV32I-NEXT: or a1, a1, a4
+; RV32I-NEXT: and a3, a0, a3
+; RV32I-NEXT: slli a3, a3, 8
; RV32I-NEXT: slli a0, a0, 24
; RV32I-NEXT: or a0, a0, a3
; RV32I-NEXT: or a1, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index d4ab229d55740..3193a4a7af0b5 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -952,11 +952,10 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
; RV64I-NEXT: lui a2, 16
; RV64I-NEXT: addiw a2, a2, -256
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: srliw a2, a0, 24
-; RV64I-NEXT: or a1, a1, a2
-; RV64I-NEXT: slli a2, a0, 8
-; RV64I-NEXT: lui a3, 4080
-; RV64I-NEXT: and a2, a2, a3
+; RV64I-NEXT: srliw a3, a0, 24
+; RV64I-NEXT: or a1, a1, a3
+; RV64I-NEXT: and a2, a0, a2
+; RV64I-NEXT: slli a2, a2, 8
; RV64I-NEXT: slliw a0, a0, 24
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: or a0, a0, a1
@@ -979,11 +978,10 @@ define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind {
; RV64I-NEXT: lui a3, 16
; RV64I-NEXT: addiw a3, a3, -256
; RV64I-NEXT: and a2, a2, a3
-; RV64I-NEXT: srliw a3, a0, 24
-; RV64I-NEXT: or a2, a2, a3
-; RV64I-NEXT: slli a3, a0, 8
-; RV64I-NEXT: lui a4, 4080
-; RV64I-NEXT: and a3, a3, a4
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: or a2, a2, a4
+; RV64I-NEXT: and a3, a0, a3
+; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: slli a0, a0, 24
; RV64I-NEXT: or a0, a0, a3
; RV64I-NEXT: or a0, a0, a2
@@ -1006,34 +1004,31 @@ declare i64 @llvm.bswap.i64(i64)
define i64 @bswap_i64(i64 %a) {
; RV64I-LABEL: bswap_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a1, a0, 24
-; RV64I-NEXT: li a2, 255
-; RV64I-NEXT: slli a3, a2, 40
-; RV64I-NEXT: and a1, a1, a3
-; RV64I-NEXT: srliw a3, a0, 24
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: slli a3, a0, 40
-; RV64I-NEXT: slli a2, a2, 48
-; RV64I-NEXT: and a2, a3, a2
-; RV64I-NEXT: slli a3, a0, 56
-; RV64I-NEXT: or a2, a3, a2
-; RV64I-NEXT: or a1, a2, a1
-; RV64I-NEXT: srli a2, a0, 40
-; RV64I-NEXT: lui a3, 16
-; RV64I-NEXT: addiw a3, a3, -256
-; RV64I-NEXT: and a2, a2, a3
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: lui a2, 16
+; RV64I-NEXT: addiw a2, a2, -256
+; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: srli a3, a0, 56
-; RV64I-NEXT: or a2, a2, a3
+; RV64I-NEXT: or a1, a1, a3
; RV64I-NEXT: srli a3, a0, 24
; RV64I-NEXT: lui a4, 4080
; RV64I-NEXT: and a3, a3, a4
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: srliw a0, a0, 24
-; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: srli a5, a0, 8
+; RV64I-NEXT: srliw a5, a5, 24
+; RV64I-NEXT: slli a5, a5, 24
+; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: or a1, a3, a1
+; RV64I-NEXT: and a3, a0, a4
+; RV64I-NEXT: slli a3, a3, 24
+; RV64I-NEXT: srliw a4, a0, 24
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: and a2, a0, a2
+; RV64I-NEXT: slli a2, a2, 40
+; RV64I-NEXT: slli a0, a0, 56
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: bswap_i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
index d73b05efe93dd..ab362af991637 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
@@ -579,9 +579,8 @@ define <vscale x 1 x i32> @bitreverse_nxv1i32(<vscale x 1 x i32> %va) {
; RV32-NEXT: vand.vx v9, v9, a0
; RV32-NEXT: vsrl.vi v10, v8, 24
; RV32-NEXT: vor.vv v9, v9, v10
-; RV32-NEXT: vsll.vi v10, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v10, v10, a0
+; RV32-NEXT: vand.vx v10, v8, a0
+; RV32-NEXT: vsll.vi v10, v10, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v8, v9
@@ -617,9 +616,8 @@ define <vscale x 1 x i32> @bitreverse_nxv1i32(<vscale x 1 x i32> %va) {
; RV64-NEXT: vand.vx v9, v9, a0
; RV64-NEXT: vsrl.vi v10, v8, 24
; RV64-NEXT: vor.vv v9, v9, v10
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v10, v10, a0
+; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vsll.vi v10, v10, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vor.vv v8, v8, v9
@@ -660,9 +658,8 @@ define <vscale x 2 x i32> @bitreverse_nxv2i32(<vscale x 2 x i32> %va) {
; RV32-NEXT: vand.vx v9, v9, a0
; RV32-NEXT: vsrl.vi v10, v8, 24
; RV32-NEXT: vor.vv v9, v9, v10
-; RV32-NEXT: vsll.vi v10, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v10, v10, a0
+; RV32-NEXT: vand.vx v10, v8, a0
+; RV32-NEXT: vsll.vi v10, v10, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v8, v9
@@ -698,9 +695,8 @@ define <vscale x 2 x i32> @bitreverse_nxv2i32(<vscale x 2 x i32> %va) {
; RV64-NEXT: vand.vx v9, v9, a0
; RV64-NEXT: vsrl.vi v10, v8, 24
; RV64-NEXT: vor.vv v9, v9, v10
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v10, v10, a0
+; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vsll.vi v10, v10, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vor.vv v8, v8, v9
@@ -741,9 +737,8 @@ define <vscale x 4 x i32> @bitreverse_nxv4i32(<vscale x 4 x i32> %va) {
; RV32-NEXT: vand.vx v10, v10, a0
; RV32-NEXT: vsrl.vi v12, v8, 24
; RV32-NEXT: vor.vv v10, v10, v12
-; RV32-NEXT: vsll.vi v12, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v12, v12, a0
+; RV32-NEXT: vand.vx v12, v8, a0
+; RV32-NEXT: vsll.vi v12, v12, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vor.vv v8, v8, v10
@@ -779,9 +774,8 @@ define <vscale x 4 x i32> @bitreverse_nxv4i32(<vscale x 4 x i32> %va) {
; RV64-NEXT: vand.vx v10, v10, a0
; RV64-NEXT: vsrl.vi v12, v8, 24
; RV64-NEXT: vor.vv v10, v10, v12
-; RV64-NEXT: vsll.vi v12, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v12, v12, a0
+; RV64-NEXT: vand.vx v12, v8, a0
+; RV64-NEXT: vsll.vi v12, v12, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vor.vv v8, v8, v10
@@ -822,9 +816,8 @@ define <vscale x 8 x i32> @bitreverse_nxv8i32(<vscale x 8 x i32> %va) {
; RV32-NEXT: vand.vx v12, v12, a0
; RV32-NEXT: vsrl.vi v16, v8, 24
; RV32-NEXT: vor.vv v12, v12, v16
-; RV32-NEXT: vsll.vi v16, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v16, v16, a0
+; RV32-NEXT: vand.vx v16, v8, a0
+; RV32-NEXT: vsll.vi v16, v16, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vor.vv v8, v8, v12
@@ -860,9 +853,8 @@ define <vscale x 8 x i32> @bitreverse_nxv8i32(<vscale x 8 x i32> %va) {
; RV64-NEXT: vand.vx v12, v12, a0
; RV64-NEXT: vsrl.vi v16, v8, 24
; RV64-NEXT: vor.vv v12, v12, v16
-; RV64-NEXT: vsll.vi v16, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vsll.vi v16, v16, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vor.vv v8, v8, v12
@@ -903,9 +895,8 @@ define <vscale x 16 x i32> @bitreverse_nxv16i32(<vscale x 16 x i32> %va) {
; RV32-NEXT: vand.vx v16, v16, a0
; RV32-NEXT: vsrl.vi v24, v8, 24
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: vsll.vi v24, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v24, v24, a0
+; RV32-NEXT: vand.vx v24, v8, a0
+; RV32-NEXT: vsll.vi v24, v24, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v24
; RV32-NEXT: vor.vv v8, v8, v16
@@ -941,9 +932,8 @@ define <vscale x 16 x i32> @bitreverse_nxv16i32(<vscale x 16 x i32> %va) {
; RV64-NEXT: vand.vx v16, v16, a0
; RV64-NEXT: vsrl.vi v24, v8, 24
; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsll.vi v24, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v24, v24, a0
+; RV64-NEXT: vand.vx v24, v8, a0
+; RV64-NEXT: vsll.vi v24, v24, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vor.vv v8, v8, v16
@@ -982,66 +972,58 @@ define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: lui a0, 1044480
; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4080
+; RV32-NEXT: lui a0, 61681
+; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw zero, 8(sp)
-; RV32-NEXT: li a1, 255
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -256
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsetvli a3, zero, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vx v9, v8, a2
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsrl.vx v10, v8, a3
-; RV32-NEXT: vand.vx v10, v10, a1
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 209715
+; RV32-NEXT: addi a0, a0, 819
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: addi a0, a0, 1365
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: li a0, 56
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vx v9, v8, a0
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: vsrl.vx v10, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v10, v10, a2
; RV32-NEXT: vor.vv v9, v10, v9
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v10, (a1), zero
-; RV32-NEXT: vsrl.vi v11, v8, 24
-; RV32-NEXT: vand.vx v11, v11, a0
+; RV32-NEXT: vsrl.vi v10, v8, 24
+; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: vlse64.v v11, (a3), zero
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v10, v10, a4
; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vand.vv v10, v12, v10
-; RV32-NEXT: vor.vv v10, v10, v11
-; RV32-NEXT: vlse64.v v11, (a1), zero
+; RV32-NEXT: vand.vv v12, v12, v11
+; RV32-NEXT: vor.vv v10, v12, v10
; RV32-NEXT: vor.vv v9, v10, v9
-; RV32-NEXT: vsll.vx v10, v8, a2
-; RV32-NEXT: vsll.vx v12, v8, a3
-; RV32-NEXT: vand.vv v11, v12, v11
-; RV32-NEXT: vlse64.v v12, (a1), zero
-; RV32-NEXT: vor.vv v10, v10, v11
-; RV32-NEXT: vlse64.v v11, (a1), zero
-; RV32-NEXT: vsll.vi v13, v8, 8
-; RV32-NEXT: vand.vv v12, v13, v12
-; RV32-NEXT: vsll.vi v8, v8, 24
+; RV32-NEXT: vsll.vx v10, v8, a0
+; RV32-NEXT: vand.vx v12, v8, a2
+; RV32-NEXT: vsll.vx v12, v12, a1
+; RV32-NEXT: vor.vv v10, v10, v12
+; RV32-NEXT: vand.vx v12, v8, a4
+; RV32-NEXT: vsll.vi v12, v12, 24
; RV32-NEXT: vand.vv v8, v8, v11
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vlse64.v v11, (a1), zero
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v12, v8
+; RV32-NEXT: vlse64.v v11, (a3), zero
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vand.vv v9, v9, v11
; RV32-NEXT: vand.vv v8, v8, v11
-; RV32-NEXT: vlse64.v v10, (a1), zero
+; RV32-NEXT: vlse64.v v10, (a3), zero
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 2
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vlse64.v v10, (a1), zero
+; RV32-NEXT: vlse64.v v10, (a3), zero
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v9, v8
; RV32-NEXT: vsrl.vi v9, v8, 1
@@ -1064,25 +1046,22 @@ define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
; RV64-NEXT: vand.vx v10, v10, a2
; RV64-NEXT: vor.vv v9, v10, v9
; RV64-NEXT: vsrl.vi v10, v8, 24
-; RV64-NEXT: lui a2, 4080
-; RV64-NEXT: vand.vx v10, v10, a2
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: vand.vx v10, v10, a3
; RV64-NEXT: vsrl.vi v11, v8, 8
-; RV64-NEXT: li a2, 255
-; RV64-NEXT: slli a3, a2, 24
-; RV64-NEXT: vand.vx v11, v11, a3
+; RV64-NEXT: li a4, 255
+; RV64-NEXT: slli a4, a4, 24
+; RV64-NEXT: vand.vx v11, v11, a4
; RV64-NEXT: vor.vv v10, v11, v10
; RV64-NEXT: vor.vv v9, v10, v9
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: vand.vx v10, v10, a3
-; RV64-NEXT: vsll.vi v11, v8, 24
-; RV64-NEXT: slli a3, a2, 40
-; RV64-NEXT: vand.vx v11, v11, a3
-; RV64-NEXT: vor.vv v10, v11, v10
+; RV64-NEXT: vand.vx v10, v8, a3
+; RV64-NEXT: vsll.vi v10, v10, 24
+; RV64-NEXT: vand.vx v11, v8, a4
+; RV64-NEXT: vsll.vi v11, v11, 8
+; RV64-NEXT: vor.vv v10, v10, v11
; RV64-NEXT: vsll.vx v11, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a2
; RV64-NEXT: vsll.vx v8, v8, a1
-; RV64-NEXT: slli a0, a2, 48
-; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vor.vv v8, v11, v8
; RV64-NEXT: lui a0, %hi(.LCPI18_0)
; RV64-NEXT: ld a0, %lo(.LCPI18_0)(a0)
@@ -1121,66 +1100,58 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: lui a0, 1044480
; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4080
+; RV32-NEXT: lui a0, 61681
+; RV32-NEXT: addi a0, a0, -241
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 209715
+; RV32-NEXT: addi a0, a0, 819
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: addi a0, a0, 1365
; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw zero, 8(sp)
-; RV32-NEXT: li a1, 255
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -256
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsetvli a3, zero, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vx v10, v8, a2
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsrl.vx v12, v8, a3
-; RV32-NEXT: vand.vx v12, v12, a1
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: li a0, 56
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vx v10, v8, a0
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: vsrl.vx v12, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v12, v12, a2
; RV32-NEXT: vor.vv v10, v12, v10
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v12, (a1), zero
-; RV32-NEXT: vsrl.vi v14, v8, 24
-; RV32-NEXT: vand.vx v14, v14, a0
+; RV32-NEXT: vsrl.vi v12, v8, 24
+; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: vlse64.v v14, (a3), zero
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v12, v12, a4
; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vand.vv v12, v16, v12
-; RV32-NEXT: vor.vv v12, v12, v14
-; RV32-NEXT: vlse64.v v14, (a1), zero
+; RV32-NEXT: vand.vv v16, v16, v14
+; RV32-NEXT: vor.vv v12, v16, v12
; RV32-NEXT: vor.vv v10, v12, v10
-; RV32-NEXT: vsll.vx v12, v8, a2
-; RV32-NEXT: vsll.vx v16, v8, a3
-; RV32-NEXT: vand.vv v14, v16, v14
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vor.vv v12, v12, v14
-; RV32-NEXT: vlse64.v v14, (a1), zero
-; RV32-NEXT: vsll.vi v18, v8, 8
-; RV32-NEXT: vand.vv v16, v18, v16
-; RV32-NEXT: vsll.vi v8, v8, 24
+; RV32-NEXT: vsll.vx v12, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a2
+; RV32-NEXT: vsll.vx v16, v16, a1
+; RV32-NEXT: vor.vv v12, v12, v16
+; RV32-NEXT: vand.vx v16, v8, a4
+; RV32-NEXT: vsll.vi v16, v16, 24
; RV32-NEXT: vand.vv v8, v8, v14
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v14, (a1), zero
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vlse64.v v14, (a3), zero
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vand.vv v10, v10, v14
; RV32-NEXT: vand.vv v8, v8, v14
-; RV32-NEXT: vlse64.v v12, (a1), zero
+; RV32-NEXT: vlse64.v v12, (a3), zero
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 2
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vlse64.v v12, (a1), zero
+; RV32-NEXT: vlse64.v v12, (a3), zero
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v10, v8, 1
@@ -1203,25 +1174,22 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
; RV64-NEXT: vand.vx v12, v12, a2
; RV64-NEXT: vor.vv v10, v12, v10
; RV64-NEXT: vsrl.vi v12, v8, 24
-; RV64-NEXT: lui a2, 4080
-; RV64-NEXT: vand.vx v12, v12, a2
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: vand.vx v12, v12, a3
; RV64-NEXT: vsrl.vi v14, v8, 8
-; RV64-NEXT: li a2, 255
-; RV64-NEXT: slli a3, a2, 24
-; RV64-NEXT: vand.vx v14, v14, a3
+; RV64-NEXT: li a4, 255
+; RV64-NEXT: slli a4, a4, 24
+; RV64-NEXT: vand.vx v14, v14, a4
; RV64-NEXT: vor.vv v12, v14, v12
; RV64-NEXT: vor.vv v10, v12, v10
-; RV64-NEXT: vsll.vi v12, v8, 8
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: vand.vx v12, v12, a3
-; RV64-NEXT: vsll.vi v14, v8, 24
-; RV64-NEXT: slli a3, a2, 40
-; RV64-NEXT: vand.vx v14, v14, a3
-; RV64-NEXT: vor.vv v12, v14, v12
+; RV64-NEXT: vand.vx v12, v8, a3
+; RV64-NEXT: vsll.vi v12, v12, 24
+; RV64-NEXT: vand.vx v14, v8, a4
+; RV64-NEXT: vsll.vi v14, v14, 8
+; RV64-NEXT: vor.vv v12, v12, v14
; RV64-NEXT: vsll.vx v14, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a2
; RV64-NEXT: vsll.vx v8, v8, a1
-; RV64-NEXT: slli a0, a2, 48
-; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vor.vv v8, v14, v8
; RV64-NEXT: lui a0, %hi(.LCPI19_0)
; RV64-NEXT: ld a0, %lo(.LCPI19_0)(a0)
@@ -1260,66 +1228,58 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: lui a0, 1044480
; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4080
+; RV32-NEXT: lui a0, 61681
+; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw zero, 8(sp)
-; RV32-NEXT: li a1, 255
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -256
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsetvli a3, zero, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vx v12, v8, a2
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsrl.vx v16, v8, a3
-; RV32-NEXT: vand.vx v16, v16, a1
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 209715
+; RV32-NEXT: addi a0, a0, 819
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: addi a0, a0, 1365
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: li a0, 56
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vx v12, v8, a0
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v16, v16, a2
; RV32-NEXT: vor.vv v12, v16, v12
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsrl.vi v20, v8, 24
-; RV32-NEXT: vand.vx v20, v20, a0
+; RV32-NEXT: vsrl.vi v16, v8, 24
+; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: vlse64.v v20, (a3), zero
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v16, v16, a4
; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vor.vv v16, v16, v20
-; RV32-NEXT: vlse64.v v20, (a1), zero
+; RV32-NEXT: vand.vv v24, v24, v20
+; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: vor.vv v12, v16, v12
-; RV32-NEXT: vsll.vx v16, v8, a2
-; RV32-NEXT: vsll.vx v24, v8, a3
-; RV32-NEXT: vand.vv v20, v24, v20
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vor.vv v16, v16, v20
-; RV32-NEXT: vlse64.v v20, (a1), zero
-; RV32-NEXT: vsll.vi v28, v8, 8
-; RV32-NEXT: vand.vv v24, v28, v24
-; RV32-NEXT: vsll.vi v8, v8, 24
+; RV32-NEXT: vsll.vx v16, v8, a0
+; RV32-NEXT: vand.vx v24, v8, a2
+; RV32-NEXT: vsll.vx v24, v24, a1
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: vand.vx v24, v8, a4
+; RV32-NEXT: vsll.vi v24, v24, 24
; RV32-NEXT: vand.vv v8, v8, v20
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vlse64.v v20, (a1), zero
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v24, v8
+; RV32-NEXT: vlse64.v v20, (a3), zero
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vand.vv v12, v12, v20
; RV32-NEXT: vand.vv v8, v8, v20
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 2
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v12, v8, 1
@@ -1342,25 +1302,22 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
; RV64-NEXT: vand.vx v16, v16, a2
; RV64-NEXT: vor.vv v12, v16, v12
; RV64-NEXT: vsrl.vi v16, v8, 24
-; RV64-NEXT: lui a2, 4080
-; RV64-NEXT: vand.vx v16, v16, a2
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: vand.vx v16, v16, a3
; RV64-NEXT: vsrl.vi v20, v8, 8
-; RV64-NEXT: li a2, 255
-; RV64-NEXT: slli a3, a2, 24
-; RV64-NEXT: vand.vx v20, v20, a3
+; RV64-NEXT: li a4, 255
+; RV64-NEXT: slli a4, a4, 24
+; RV64-NEXT: vand.vx v20, v20, a4
; RV64-NEXT: vor.vv v16, v20, v16
; RV64-NEXT: vor.vv v12, v16, v12
-; RV64-NEXT: vsll.vi v16, v8, 8
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: vand.vx v16, v16, a3
-; RV64-NEXT: vsll.vi v20, v8, 24
-; RV64-NEXT: slli a3, a2, 40
-; RV64-NEXT: vand.vx v20, v20, a3
-; RV64-NEXT: vor.vv v16, v20, v16
+; RV64-NEXT: vand.vx v16, v8, a3
+; RV64-NEXT: vsll.vi v16, v16, 24
+; RV64-NEXT: vand.vx v20, v8, a4
+; RV64-NEXT: vsll.vi v20, v20, 8
+; RV64-NEXT: vor.vv v16, v16, v20
; RV64-NEXT: vsll.vx v20, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a2
; RV64-NEXT: vsll.vx v8, v8, a1
-; RV64-NEXT: slli a0, a2, 48
-; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vor.vv v8, v20, v8
; RV64-NEXT: lui a0, %hi(.LCPI20_0)
; RV64-NEXT: ld a0, %lo(.LCPI20_0)(a0)
@@ -1397,95 +1354,71 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: lui a0, 1044480
; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4080
+; RV32-NEXT: lui a0, 61681
+; RV32-NEXT: addi a0, a0, -241
; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw zero, 8(sp)
-; RV32-NEXT: li a1, 255
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -256
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsrl.vx v16, v8, a3
-; RV32-NEXT: vand.vx v16, v16, a1
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsrl.vx v0, v8, a2
-; RV32-NEXT: vor.vv v16, v16, v0
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vand.vv v24, v0, v24
-; RV32-NEXT: vsrl.vi v0, v8, 24
-; RV32-NEXT: vand.vx v0, v0, a0
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vor.vv v24, v24, v0
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8re8.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v24, v0
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsll.vx v24, v8, a3
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vsll.vx v24, v8, a2
-; RV32-NEXT: vlse64.v v0, (a1), zero
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 209715
+; RV32-NEXT: addi a0, a0, 819
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: addi a0, a0, 1365
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: li a0, 56
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v8, a0
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: vsrl.vx v24, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v24, v24, a2
; RV32-NEXT: vor.vv v16, v24, v16
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsll.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: vlse64.v v24, (a3), zero
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v0, v0, a4
+; RV32-NEXT: vsrl.vi v16, v8, 8
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: addi a5, sp, 16
+; RV32-NEXT: vl8re8.v v0, (a5) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: addi a5, sp, 16
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v0, v8, a2
+; RV32-NEXT: vsll.vx v0, v0, a1
+; RV32-NEXT: vsll.vx v16, v8, a0
+; RV32-NEXT: vor.vv v0, v16, v0
+; RV32-NEXT: vand.vv v16, v8, v24
+; RV32-NEXT: vand.vx v8, v8, a4
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: vor.vv v8, v0, v8
; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v24
; RV32-NEXT: vsrl.vi v24, v8, 4
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 2
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v24, v8, 1
@@ -1494,7 +1427,7 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-NEXT: vadd.vv v8, v8, v8
; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -1511,25 +1444,22 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
; RV64-NEXT: vand.vx v24, v24, a2
; RV64-NEXT: vor.vv v16, v24, v16
; RV64-NEXT: vsrl.vi v24, v8, 24
-; RV64-NEXT: lui a2, 4080
-; RV64-NEXT: vand.vx v24, v24, a2
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: vand.vx v24, v24, a3
; RV64-NEXT: vsrl.vi v0, v8, 8
-; RV64-NEXT: li a2, 255
-; RV64-NEXT: slli a3, a2, 24
-; RV64-NEXT: vand.vx v0, v0, a3
+; RV64-NEXT: li a4, 255
+; RV64-NEXT: slli a4, a4, 24
+; RV64-NEXT: vand.vx v0, v0, a4
; RV64-NEXT: vor.vv v24, v0, v24
; RV64-NEXT: vor.vv v16, v24, v16
-; RV64-NEXT: vsll.vi v24, v8, 8
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: vand.vx v24, v24, a3
-; RV64-NEXT: vsll.vi v0, v8, 24
-; RV64-NEXT: slli a3, a2, 40
-; RV64-NEXT: vand.vx v0, v0, a3
-; RV64-NEXT: vor.vv v24, v0, v24
+; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: vsll.vi v24, v24, 24
+; RV64-NEXT: vand.vx v0, v8, a4
+; RV64-NEXT: vsll.vi v0, v0, 8
+; RV64-NEXT: vor.vv v24, v24, v0
; RV64-NEXT: vsll.vx v0, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a2
; RV64-NEXT: vsll.vx v8, v8, a1
-; RV64-NEXT: slli a0, a2, 48
-; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vor.vv v8, v0, v8
; RV64-NEXT: lui a0, %hi(.LCPI21_0)
; RV64-NEXT: ld a0, %lo(.LCPI21_0)(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
index c5a57a9efc5b8..fdef42fbb7248 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
@@ -90,9 +90,8 @@ define <vscale x 1 x i32> @bswap_nxv1i32(<vscale x 1 x i32> %va) {
; RV32-NEXT: vand.vx v9, v9, a0
; RV32-NEXT: vsrl.vi v10, v8, 24
; RV32-NEXT: vor.vv v9, v9, v10
-; RV32-NEXT: vsll.vi v10, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v10, v10, a0
+; RV32-NEXT: vand.vx v10, v8, a0
+; RV32-NEXT: vsll.vi v10, v10, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v8, v9
@@ -107,9 +106,8 @@ define <vscale x 1 x i32> @bswap_nxv1i32(<vscale x 1 x i32> %va) {
; RV64-NEXT: vand.vx v9, v9, a0
; RV64-NEXT: vsrl.vi v10, v8, 24
; RV64-NEXT: vor.vv v9, v9, v10
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v10, v10, a0
+; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vsll.vi v10, v10, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vor.vv v8, v8, v9
@@ -129,9 +127,8 @@ define <vscale x 2 x i32> @bswap_nxv2i32(<vscale x 2 x i32> %va) {
; RV32-NEXT: vand.vx v9, v9, a0
; RV32-NEXT: vsrl.vi v10, v8, 24
; RV32-NEXT: vor.vv v9, v9, v10
-; RV32-NEXT: vsll.vi v10, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v10, v10, a0
+; RV32-NEXT: vand.vx v10, v8, a0
+; RV32-NEXT: vsll.vi v10, v10, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v8, v9
@@ -146,9 +143,8 @@ define <vscale x 2 x i32> @bswap_nxv2i32(<vscale x 2 x i32> %va) {
; RV64-NEXT: vand.vx v9, v9, a0
; RV64-NEXT: vsrl.vi v10, v8, 24
; RV64-NEXT: vor.vv v9, v9, v10
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v10, v10, a0
+; RV64-NEXT: vand.vx v10, v8, a0
+; RV64-NEXT: vsll.vi v10, v10, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vor.vv v8, v8, v9
@@ -168,9 +164,8 @@ define <vscale x 4 x i32> @bswap_nxv4i32(<vscale x 4 x i32> %va) {
; RV32-NEXT: vand.vx v10, v10, a0
; RV32-NEXT: vsrl.vi v12, v8, 24
; RV32-NEXT: vor.vv v10, v10, v12
-; RV32-NEXT: vsll.vi v12, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v12, v12, a0
+; RV32-NEXT: vand.vx v12, v8, a0
+; RV32-NEXT: vsll.vi v12, v12, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vor.vv v8, v8, v10
@@ -185,9 +180,8 @@ define <vscale x 4 x i32> @bswap_nxv4i32(<vscale x 4 x i32> %va) {
; RV64-NEXT: vand.vx v10, v10, a0
; RV64-NEXT: vsrl.vi v12, v8, 24
; RV64-NEXT: vor.vv v10, v10, v12
-; RV64-NEXT: vsll.vi v12, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v12, v12, a0
+; RV64-NEXT: vand.vx v12, v8, a0
+; RV64-NEXT: vsll.vi v12, v12, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vor.vv v8, v8, v10
@@ -207,9 +201,8 @@ define <vscale x 8 x i32> @bswap_nxv8i32(<vscale x 8 x i32> %va) {
; RV32-NEXT: vand.vx v12, v12, a0
; RV32-NEXT: vsrl.vi v16, v8, 24
; RV32-NEXT: vor.vv v12, v12, v16
-; RV32-NEXT: vsll.vi v16, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v16, v16, a0
+; RV32-NEXT: vand.vx v16, v8, a0
+; RV32-NEXT: vsll.vi v16, v16, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: vor.vv v8, v8, v12
@@ -224,9 +217,8 @@ define <vscale x 8 x i32> @bswap_nxv8i32(<vscale x 8 x i32> %va) {
; RV64-NEXT: vand.vx v12, v12, a0
; RV64-NEXT: vsrl.vi v16, v8, 24
; RV64-NEXT: vor.vv v12, v12, v16
-; RV64-NEXT: vsll.vi v16, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v16, v16, a0
+; RV64-NEXT: vand.vx v16, v8, a0
+; RV64-NEXT: vsll.vi v16, v16, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vor.vv v8, v8, v12
@@ -246,9 +238,8 @@ define <vscale x 16 x i32> @bswap_nxv16i32(<vscale x 16 x i32> %va) {
; RV32-NEXT: vand.vx v16, v16, a0
; RV32-NEXT: vsrl.vi v24, v8, 24
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: vsll.vi v24, v8, 8
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v24, v24, a0
+; RV32-NEXT: vand.vx v24, v8, a0
+; RV32-NEXT: vsll.vi v24, v24, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v24
; RV32-NEXT: vor.vv v8, v8, v16
@@ -263,9 +254,8 @@ define <vscale x 16 x i32> @bswap_nxv16i32(<vscale x 16 x i32> %va) {
; RV64-NEXT: vand.vx v16, v16, a0
; RV64-NEXT: vsrl.vi v24, v8, 24
; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsll.vi v24, v8, 8
-; RV64-NEXT: lui a0, 4080
-; RV64-NEXT: vand.vx v24, v24, a0
+; RV64-NEXT: vand.vx v24, v8, a0
+; RV64-NEXT: vsll.vi v24, v24, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vor.vv v8, v8, v16
@@ -283,41 +273,33 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: lui a0, 1044480
; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw zero, 8(sp)
-; RV32-NEXT: li a1, 255
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -256
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsetvli a3, zero, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vx v9, v8, a2
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsrl.vx v10, v8, a3
-; RV32-NEXT: vand.vx v10, v10, a1
+; RV32-NEXT: li a0, 56
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vx v9, v8, a0
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: vsrl.vx v10, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v10, v10, a2
; RV32-NEXT: vor.vv v9, v10, v9
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v10, (a1), zero
-; RV32-NEXT: vsrl.vi v11, v8, 24
-; RV32-NEXT: vand.vx v11, v11, a0
+; RV32-NEXT: vsrl.vi v10, v8, 24
+; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: vlse64.v v11, (a3), zero
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vand.vx v10, v10, a3
; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vand.vv v10, v12, v10
-; RV32-NEXT: vor.vv v10, v10, v11
-; RV32-NEXT: vlse64.v v11, (a1), zero
+; RV32-NEXT: vand.vv v12, v12, v11
+; RV32-NEXT: vor.vv v10, v12, v10
; RV32-NEXT: vor.vv v9, v10, v9
-; RV32-NEXT: vsll.vx v10, v8, a2
-; RV32-NEXT: vsll.vx v12, v8, a3
-; RV32-NEXT: vand.vv v11, v12, v11
-; RV32-NEXT: vlse64.v v12, (a1), zero
-; RV32-NEXT: vor.vv v10, v10, v11
-; RV32-NEXT: vlse64.v v11, (a1), zero
-; RV32-NEXT: vsll.vi v13, v8, 8
-; RV32-NEXT: vand.vv v12, v13, v12
-; RV32-NEXT: vsll.vi v8, v8, 24
+; RV32-NEXT: vsll.vx v10, v8, a0
+; RV32-NEXT: vand.vx v12, v8, a2
+; RV32-NEXT: vsll.vx v12, v12, a1
+; RV32-NEXT: vor.vv v10, v10, v12
+; RV32-NEXT: vand.vx v12, v8, a3
+; RV32-NEXT: vsll.vi v12, v12, 24
; RV32-NEXT: vand.vv v8, v8, v11
-; RV32-NEXT: vor.vv v8, v8, v12
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: addi sp, sp, 16
@@ -335,25 +317,22 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
; RV64-NEXT: vand.vx v10, v10, a2
; RV64-NEXT: vor.vv v9, v10, v9
; RV64-NEXT: vsrl.vi v10, v8, 24
-; RV64-NEXT: lui a2, 4080
-; RV64-NEXT: vand.vx v10, v10, a2
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: vand.vx v10, v10, a3
; RV64-NEXT: vsrl.vi v11, v8, 8
-; RV64-NEXT: li a2, 255
-; RV64-NEXT: slli a3, a2, 24
-; RV64-NEXT: vand.vx v11, v11, a3
+; RV64-NEXT: li a4, 255
+; RV64-NEXT: slli a4, a4, 24
+; RV64-NEXT: vand.vx v11, v11, a4
; RV64-NEXT: vor.vv v10, v11, v10
; RV64-NEXT: vor.vv v9, v10, v9
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: vand.vx v10, v10, a3
-; RV64-NEXT: vsll.vi v11, v8, 24
-; RV64-NEXT: slli a3, a2, 40
-; RV64-NEXT: vand.vx v11, v11, a3
-; RV64-NEXT: vor.vv v10, v11, v10
+; RV64-NEXT: vand.vx v10, v8, a3
+; RV64-NEXT: vsll.vi v10, v10, 24
+; RV64-NEXT: vand.vx v11, v8, a4
+; RV64-NEXT: vsll.vi v11, v11, 8
+; RV64-NEXT: vor.vv v10, v10, v11
; RV64-NEXT: vsll.vx v11, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a2
; RV64-NEXT: vsll.vx v8, v8, a1
-; RV64-NEXT: slli a0, a2, 48
-; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vor.vv v8, v11, v8
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vor.vv v8, v8, v9
@@ -371,41 +350,33 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: lui a0, 1044480
; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw zero, 8(sp)
-; RV32-NEXT: li a1, 255
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -256
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsetvli a3, zero, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vx v10, v8, a2
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsrl.vx v12, v8, a3
-; RV32-NEXT: vand.vx v12, v12, a1
+; RV32-NEXT: li a0, 56
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vx v10, v8, a0
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: vsrl.vx v12, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v12, v12, a2
; RV32-NEXT: vor.vv v10, v12, v10
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v12, (a1), zero
-; RV32-NEXT: vsrl.vi v14, v8, 24
-; RV32-NEXT: vand.vx v14, v14, a0
+; RV32-NEXT: vsrl.vi v12, v8, 24
+; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: vlse64.v v14, (a3), zero
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vand.vx v12, v12, a3
; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vand.vv v12, v16, v12
-; RV32-NEXT: vor.vv v12, v12, v14
-; RV32-NEXT: vlse64.v v14, (a1), zero
+; RV32-NEXT: vand.vv v16, v16, v14
+; RV32-NEXT: vor.vv v12, v16, v12
; RV32-NEXT: vor.vv v10, v12, v10
-; RV32-NEXT: vsll.vx v12, v8, a2
-; RV32-NEXT: vsll.vx v16, v8, a3
-; RV32-NEXT: vand.vv v14, v16, v14
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vor.vv v12, v12, v14
-; RV32-NEXT: vlse64.v v14, (a1), zero
-; RV32-NEXT: vsll.vi v18, v8, 8
-; RV32-NEXT: vand.vv v16, v18, v16
-; RV32-NEXT: vsll.vi v8, v8, 24
+; RV32-NEXT: vsll.vx v12, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a2
+; RV32-NEXT: vsll.vx v16, v16, a1
+; RV32-NEXT: vor.vv v12, v12, v16
+; RV32-NEXT: vand.vx v16, v8, a3
+; RV32-NEXT: vsll.vi v16, v16, 24
; RV32-NEXT: vand.vv v8, v8, v14
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: addi sp, sp, 16
@@ -423,25 +394,22 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
; RV64-NEXT: vand.vx v12, v12, a2
; RV64-NEXT: vor.vv v10, v12, v10
; RV64-NEXT: vsrl.vi v12, v8, 24
-; RV64-NEXT: lui a2, 4080
-; RV64-NEXT: vand.vx v12, v12, a2
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: vand.vx v12, v12, a3
; RV64-NEXT: vsrl.vi v14, v8, 8
-; RV64-NEXT: li a2, 255
-; RV64-NEXT: slli a3, a2, 24
-; RV64-NEXT: vand.vx v14, v14, a3
+; RV64-NEXT: li a4, 255
+; RV64-NEXT: slli a4, a4, 24
+; RV64-NEXT: vand.vx v14, v14, a4
; RV64-NEXT: vor.vv v12, v14, v12
; RV64-NEXT: vor.vv v10, v12, v10
-; RV64-NEXT: vsll.vi v12, v8, 8
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: vand.vx v12, v12, a3
-; RV64-NEXT: vsll.vi v14, v8, 24
-; RV64-NEXT: slli a3, a2, 40
-; RV64-NEXT: vand.vx v14, v14, a3
-; RV64-NEXT: vor.vv v12, v14, v12
+; RV64-NEXT: vand.vx v12, v8, a3
+; RV64-NEXT: vsll.vi v12, v12, 24
+; RV64-NEXT: vand.vx v14, v8, a4
+; RV64-NEXT: vsll.vi v14, v14, 8
+; RV64-NEXT: vor.vv v12, v12, v14
; RV64-NEXT: vsll.vx v14, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a2
; RV64-NEXT: vsll.vx v8, v8, a1
-; RV64-NEXT: slli a0, a2, 48
-; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vor.vv v8, v14, v8
; RV64-NEXT: vor.vv v8, v8, v12
; RV64-NEXT: vor.vv v8, v8, v10
@@ -459,41 +427,33 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: lui a0, 1044480
; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw zero, 8(sp)
-; RV32-NEXT: li a1, 255
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -256
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsetvli a3, zero, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vx v12, v8, a2
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsrl.vx v16, v8, a3
-; RV32-NEXT: vand.vx v16, v16, a1
+; RV32-NEXT: li a0, 56
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vx v12, v8, a0
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v16, v16, a2
; RV32-NEXT: vor.vv v12, v16, v12
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vsrl.vi v20, v8, 24
-; RV32-NEXT: vand.vx v20, v20, a0
+; RV32-NEXT: vsrl.vi v16, v8, 24
+; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: vlse64.v v20, (a3), zero
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vand.vx v16, v16, a3
; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: vor.vv v16, v16, v20
-; RV32-NEXT: vlse64.v v20, (a1), zero
+; RV32-NEXT: vand.vv v24, v24, v20
+; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: vor.vv v12, v16, v12
-; RV32-NEXT: vsll.vx v16, v8, a2
-; RV32-NEXT: vsll.vx v24, v8, a3
-; RV32-NEXT: vand.vv v20, v24, v20
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vor.vv v16, v16, v20
-; RV32-NEXT: vlse64.v v20, (a1), zero
-; RV32-NEXT: vsll.vi v28, v8, 8
-; RV32-NEXT: vand.vv v24, v28, v24
-; RV32-NEXT: vsll.vi v8, v8, 24
+; RV32-NEXT: vsll.vx v16, v8, a0
+; RV32-NEXT: vand.vx v24, v8, a2
+; RV32-NEXT: vsll.vx v24, v24, a1
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: vand.vx v24, v8, a3
+; RV32-NEXT: vsll.vi v24, v24, 24
; RV32-NEXT: vand.vv v8, v8, v20
-; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: addi sp, sp, 16
@@ -511,25 +471,22 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
; RV64-NEXT: vand.vx v16, v16, a2
; RV64-NEXT: vor.vv v12, v16, v12
; RV64-NEXT: vsrl.vi v16, v8, 24
-; RV64-NEXT: lui a2, 4080
-; RV64-NEXT: vand.vx v16, v16, a2
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: vand.vx v16, v16, a3
; RV64-NEXT: vsrl.vi v20, v8, 8
-; RV64-NEXT: li a2, 255
-; RV64-NEXT: slli a3, a2, 24
-; RV64-NEXT: vand.vx v20, v20, a3
+; RV64-NEXT: li a4, 255
+; RV64-NEXT: slli a4, a4, 24
+; RV64-NEXT: vand.vx v20, v20, a4
; RV64-NEXT: vor.vv v16, v20, v16
; RV64-NEXT: vor.vv v12, v16, v12
-; RV64-NEXT: vsll.vi v16, v8, 8
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: vand.vx v16, v16, a3
-; RV64-NEXT: vsll.vi v20, v8, 24
-; RV64-NEXT: slli a3, a2, 40
-; RV64-NEXT: vand.vx v20, v20, a3
-; RV64-NEXT: vor.vv v16, v20, v16
+; RV64-NEXT: vand.vx v16, v8, a3
+; RV64-NEXT: vsll.vi v16, v16, 24
+; RV64-NEXT: vand.vx v20, v8, a4
+; RV64-NEXT: vsll.vi v20, v20, 8
+; RV64-NEXT: vor.vv v16, v16, v20
; RV64-NEXT: vsll.vx v20, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a2
; RV64-NEXT: vsll.vx v8, v8, a1
-; RV64-NEXT: slli a0, a2, 48
-; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vor.vv v8, v20, v8
; RV64-NEXT: vor.vv v8, v8, v16
; RV64-NEXT: vor.vv v8, v8, v12
@@ -545,74 +502,50 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: lui a0, 1044480
; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw zero, 8(sp)
-; RV32-NEXT: li a1, 255
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -256
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsrl.vx v16, v8, a3
-; RV32-NEXT: vand.vx v16, v16, a1
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vsrl.vx v0, v8, a2
-; RV32-NEXT: vor.vv v16, v16, v0
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vand.vv v24, v0, v24
+; RV32-NEXT: li a0, 56
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v8, a0
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: vsrl.vx v24, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v24, v24, a2
+; RV32-NEXT: vor.vv v16, v24, v16
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v0, v8, 24
-; RV32-NEXT: vand.vx v0, v0, a0
-; RV32-NEXT: vlse64.v v16, (a1), zero
-; RV32-NEXT: vor.vv v24, v24, v0
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8re8.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v24, v0
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsll.vx v0, v8, a3
-; RV32-NEXT: vand.vv v16, v0, v16
-; RV32-NEXT: vsll.vx v0, v8, a2
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v0, (a1), zero
-; RV32-NEXT: vsll.vi v16, v8, 8
+; RV32-NEXT: addi a3, sp, 8
+; RV32-NEXT: vlse64.v v24, (a3), zero
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vand.vx v0, v0, a3
+; RV32-NEXT: vsrl.vi v16, v8, 8
; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vl8re8.v v0, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v0, v8, a2
+; RV32-NEXT: vsll.vx v0, v0, a1
+; RV32-NEXT: vsll.vx v16, v8, a0
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vand.vv v24, v8, v24
+; RV32-NEXT: vand.vx v8, v8, a3
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vand.vv v8, v8, v0
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsll.vi v24, v24, 8
+; RV32-NEXT: vor.vv v8, v8, v24
; RV32-NEXT: vor.vv v8, v16, v8
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -629,25 +562,22 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
; RV64-NEXT: vand.vx v24, v24, a2
; RV64-NEXT: vor.vv v16, v24, v16
; RV64-NEXT: vsrl.vi v24, v8, 24
-; RV64-NEXT: lui a2, 4080
-; RV64-NEXT: vand.vx v24, v24, a2
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: vand.vx v24, v24, a3
; RV64-NEXT: vsrl.vi v0, v8, 8
-; RV64-NEXT: li a2, 255
-; RV64-NEXT: slli a3, a2, 24
-; RV64-NEXT: vand.vx v0, v0, a3
+; RV64-NEXT: li a4, 255
+; RV64-NEXT: slli a4, a4, 24
+; RV64-NEXT: vand.vx v0, v0, a4
; RV64-NEXT: vor.vv v24, v0, v24
; RV64-NEXT: vor.vv v16, v24, v16
-; RV64-NEXT: vsll.vi v24, v8, 8
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: vand.vx v24, v24, a3
-; RV64-NEXT: vsll.vi v0, v8, 24
-; RV64-NEXT: slli a3, a2, 40
-; RV64-NEXT: vand.vx v0, v0, a3
-; RV64-NEXT: vor.vv v24, v0, v24
+; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: vsll.vi v24, v24, 24
+; RV64-NEXT: vand.vx v0, v8, a4
+; RV64-NEXT: vsll.vi v0, v0, 8
+; RV64-NEXT: vor.vv v24, v24, v0
; RV64-NEXT: vsll.vx v0, v8, a0
+; RV64-NEXT: vand.vx v8, v8, a2
; RV64-NEXT: vsll.vx v8, v8, a1
-; RV64-NEXT: slli a0, a2, 48
-; RV64-NEXT: vand.vx v8, v8, a0
; RV64-NEXT: vor.vv v8, v0, v8
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vor.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index e7c82a097dd81..c213c430b033a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -85,9 +85,8 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
; RV32-NEXT: vand.vx v9, v9, a1
; RV32-NEXT: vsrl.vi v10, v8, 24
; RV32-NEXT: vor.vv v9, v9, v10
-; RV32-NEXT: vsll.vi v10, v8, 8
-; RV32-NEXT: lui a1, 4080
-; RV32-NEXT: vand.vx v10, v10, a1
+; RV32-NEXT: vand.vx v10, v8, a1
+; RV32-NEXT: vsll.vi v10, v10, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v8, v9
@@ -125,9 +124,8 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
; RV64-NEXT: vand.vx v9, v9, a1
; RV64-NEXT: vsrl.vi v10, v8, 24
; RV64-NEXT: vor.vv v9, v9, v10
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: lui a1, 4080
-; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vand.vx v10, v8, a1
+; RV64-NEXT: vsll.vi v10, v10, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vor.vv v8, v8, v9
@@ -186,32 +184,19 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; RV32-NEXT: vmerge.vxm v11, v11, a5, v0
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vand.vv v11, v12, v11
-; RV32-NEXT: vor.vv v10, v11, v10
+; RV32-NEXT: vand.vv v12, v12, v11
+; RV32-NEXT: vor.vv v10, v12, v10
; RV32-NEXT: vor.vv v9, v10, v9
-; RV32-NEXT: li a5, 255
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a5
-; RV32-NEXT: vmerge.vim v10, v10, 0, v0
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vsll.vi v11, v8, 8
-; RV32-NEXT: vand.vv v10, v11, v10
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v11, a3
-; RV32-NEXT: vmerge.vim v11, v11, 0, v0
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vsll.vi v12, v8, 24
-; RV32-NEXT: vand.vv v11, v12, v11
-; RV32-NEXT: vor.vv v10, v11, v10
-; RV32-NEXT: vsll.vx v11, v8, a2
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v12, a4
-; RV32-NEXT: vmerge.vim v12, v12, 0, v0
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v11, v11, v12
-; RV32-NEXT: vsll.vx v8, v8, a1
-; RV32-NEXT: vor.vv v8, v8, v11
-; RV32-NEXT: vor.vv v8, v8, v10
+; RV32-NEXT: vsll.vx v10, v8, a1
+; RV32-NEXT: vand.vx v12, v8, a3
+; RV32-NEXT: vsll.vx v12, v12, a2
+; RV32-NEXT: vor.vv v10, v10, v12
+; RV32-NEXT: vand.vx v12, v8, a4
+; RV32-NEXT: vsll.vi v12, v12, 24
+; RV32-NEXT: vand.vv v8, v8, v11
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v12, v8
+; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: lui a1, 61681
@@ -259,25 +244,22 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; RV64-NEXT: vand.vx v10, v10, a3
; RV64-NEXT: vor.vv v9, v10, v9
; RV64-NEXT: vsrl.vi v10, v8, 24
-; RV64-NEXT: lui a3, 4080
-; RV64-NEXT: vand.vx v10, v10, a3
+; RV64-NEXT: lui a4, 4080
+; RV64-NEXT: vand.vx v10, v10, a4
; RV64-NEXT: vsrl.vi v11, v8, 8
-; RV64-NEXT: li a3, 255
-; RV64-NEXT: slli a4, a3, 24
-; RV64-NEXT: vand.vx v11, v11, a4
+; RV64-NEXT: li a5, 255
+; RV64-NEXT: slli a5, a5, 24
+; RV64-NEXT: vand.vx v11, v11, a5
; RV64-NEXT: vor.vv v10, v11, v10
; RV64-NEXT: vor.vv v9, v10, v9
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: vand.vx v10, v10, a4
-; RV64-NEXT: vsll.vi v11, v8, 24
-; RV64-NEXT: slli a4, a3, 40
-; RV64-NEXT: vand.vx v11, v11, a4
+; RV64-NEXT: vand.vx v10, v8, a5
+; RV64-NEXT: vsll.vi v10, v10, 8
+; RV64-NEXT: vand.vx v11, v8, a4
+; RV64-NEXT: vsll.vi v11, v11, 24
; RV64-NEXT: vor.vv v10, v11, v10
; RV64-NEXT: vsll.vx v11, v8, a1
+; RV64-NEXT: vand.vx v8, v8, a3
; RV64-NEXT: vsll.vx v8, v8, a2
-; RV64-NEXT: slli a1, a3, 48
-; RV64-NEXT: vand.vx v8, v8, a1
; RV64-NEXT: vor.vv v8, v11, v8
; RV64-NEXT: lui a1, %hi(.LCPI2_0)
; RV64-NEXT: ld a1, %lo(.LCPI2_0)(a1)
@@ -497,9 +479,8 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1
; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 24
; LMULMAX2-RV32-NEXT: vor.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT: vsll.vi v12, v8, 8
-; LMULMAX2-RV32-NEXT: lui a1, 4080
-; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a1
+; LMULMAX2-RV32-NEXT: vand.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT: vsll.vi v12, v12, 8
; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 24
; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12
; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
@@ -537,9 +518,8 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1
; LMULMAX2-RV64-NEXT: vsrl.vi v12, v8, 24
; LMULMAX2-RV64-NEXT: vor.vv v10, v10, v12
-; LMULMAX2-RV64-NEXT: vsll.vi v12, v8, 8
-; LMULMAX2-RV64-NEXT: lui a1, 4080
-; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a1
+; LMULMAX2-RV64-NEXT: vand.vx v12, v8, a1
+; LMULMAX2-RV64-NEXT: vsll.vi v12, v12, 8
; LMULMAX2-RV64-NEXT: vsll.vi v8, v8, 24
; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v12
; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
@@ -579,55 +559,54 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2
; LMULMAX1-RV32-NEXT: vsrl.vi v11, v8, 24
; LMULMAX1-RV32-NEXT: vor.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT: vsll.vi v11, v8, 8
-; LMULMAX1-RV32-NEXT: lui a3, 4080
-; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a3
+; LMULMAX1-RV32-NEXT: vand.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT: vsll.vi v11, v11, 8
; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 24
; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v11
; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT: lui a4, 61681
-; LMULMAX1-RV32-NEXT: addi a4, a4, -241
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a4
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4
+; LMULMAX1-RV32-NEXT: lui a3, 61681
+; LMULMAX1-RV32-NEXT: addi a3, a3, -241
+; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3
+; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a3
; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 4
; LMULMAX1-RV32-NEXT: vor.vv v8, v10, v8
; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX1-RV32-NEXT: lui a5, 209715
-; LMULMAX1-RV32-NEXT: addi a5, a5, 819
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a5
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a5
+; LMULMAX1-RV32-NEXT: lui a4, 209715
+; LMULMAX1-RV32-NEXT: addi a4, a4, 819
+; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a4
+; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4
; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 2
; LMULMAX1-RV32-NEXT: vor.vv v8, v10, v8
; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT: lui a6, 349525
-; LMULMAX1-RV32-NEXT: addi a6, a6, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a6
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a6
+; LMULMAX1-RV32-NEXT: lui a5, 349525
+; LMULMAX1-RV32-NEXT: addi a5, a5, 1365
+; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a5
+; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a5
; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v8
; LMULMAX1-RV32-NEXT: vor.vv v8, v10, v8
; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 8
; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2
; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 24
; LMULMAX1-RV32-NEXT: vor.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT: vsll.vi v11, v9, 8
-; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a3
+; LMULMAX1-RV32-NEXT: vand.vx v11, v9, a2
+; LMULMAX1-RV32-NEXT: vsll.vi v11, v11, 8
; LMULMAX1-RV32-NEXT: vsll.vi v9, v9, 24
; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v11
; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10
; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a4
-; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4
+; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3
+; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a3
; LMULMAX1-RV32-NEXT: vsll.vi v9, v9, 4
; LMULMAX1-RV32-NEXT: vor.vv v9, v10, v9
; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 2
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a5
-; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a5
+; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a4
+; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4
; LMULMAX1-RV32-NEXT: vsll.vi v9, v9, 2
; LMULMAX1-RV32-NEXT: vor.vv v9, v10, v9
; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a6
-; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a6
+; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a5
+; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a5
; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v9
; LMULMAX1-RV32-NEXT: vor.vv v9, v10, v9
; LMULMAX1-RV32-NEXT: vse32.v v9, (a0)
@@ -646,55 +625,54 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2
; LMULMAX1-RV64-NEXT: vsrl.vi v11, v8, 24
; LMULMAX1-RV64-NEXT: vor.vv v10, v10, v11
-; LMULMAX1-RV64-NEXT: vsll.vi v11, v8, 8
-; LMULMAX1-RV64-NEXT: lui a3, 4080
-; LMULMAX1-RV64-NEXT: vand.vx v11, v11, a3
+; LMULMAX1-RV64-NEXT: vand.vx v11, v8, a2
+; LMULMAX1-RV64-NEXT: vsll.vi v11, v11, 8
; LMULMAX1-RV64-NEXT: vsll.vi v8, v8, 24
; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v11
; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT: lui a4, 61681
-; LMULMAX1-RV64-NEXT: addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a4
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4
+; LMULMAX1-RV64-NEXT: lui a3, 61681
+; LMULMAX1-RV64-NEXT: addiw a3, a3, -241
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
+; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a3
; LMULMAX1-RV64-NEXT: vsll.vi v8, v8, 4
; LMULMAX1-RV64-NEXT: vor.vv v8, v10, v8
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX1-RV64-NEXT: lui a5, 209715
-; LMULMAX1-RV64-NEXT: addiw a5, a5, 819
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a5
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5
+; LMULMAX1-RV64-NEXT: lui a4, 209715
+; LMULMAX1-RV64-NEXT: addiw a4, a4, 819
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a4
+; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4
; LMULMAX1-RV64-NEXT: vsll.vi v8, v8, 2
; LMULMAX1-RV64-NEXT: vor.vv v8, v10, v8
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT: lui a6, 349525
-; LMULMAX1-RV64-NEXT: addiw a6, a6, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a6
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a6
+; LMULMAX1-RV64-NEXT: lui a5, 349525
+; LMULMAX1-RV64-NEXT: addiw a5, a5, 1365
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a5
+; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5
; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v8
; LMULMAX1-RV64-NEXT: vor.vv v8, v10, v8
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 8
; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2
; LMULMAX1-RV64-NEXT: vsrl.vi v11, v9, 24
; LMULMAX1-RV64-NEXT: vor.vv v10, v10, v11
-; LMULMAX1-RV64-NEXT: vsll.vi v11, v9, 8
-; LMULMAX1-RV64-NEXT: vand.vx v11, v11, a3
+; LMULMAX1-RV64-NEXT: vand.vx v11, v9, a2
+; LMULMAX1-RV64-NEXT: vsll.vi v11, v11, 8
; LMULMAX1-RV64-NEXT: vsll.vi v9, v9, 24
; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v11
; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a4
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
+; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a3
; LMULMAX1-RV64-NEXT: vsll.vi v9, v9, 4
; LMULMAX1-RV64-NEXT: vor.vv v9, v10, v9
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 2
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a5
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a4
+; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4
; LMULMAX1-RV64-NEXT: vsll.vi v9, v9, 2
; LMULMAX1-RV64-NEXT: vor.vv v9, v10, v9
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a6
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a6
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a5
+; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5
; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v9
; LMULMAX1-RV64-NEXT: vor.vv v9, v10, v9
; LMULMAX1-RV64-NEXT: vse32.v v9, (a0)
@@ -732,32 +710,19 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX2-RV32-NEXT: vmerge.vxm v14, v14, a5, v0
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 8
-; LMULMAX2-RV32-NEXT: vand.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT: vor.vv v12, v14, v12
+; LMULMAX2-RV32-NEXT: vand.vv v16, v16, v14
+; LMULMAX2-RV32-NEXT: vor.vv v12, v16, v12
; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT: li a5, 255
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v12, a5
-; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 0, v0
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vsll.vi v14, v8, 8
-; LMULMAX2-RV32-NEXT: vand.vv v12, v14, v12
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v14, a3
-; LMULMAX2-RV32-NEXT: vmerge.vim v14, v14, 0, v0
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vsll.vi v16, v8, 24
-; LMULMAX2-RV32-NEXT: vand.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT: vor.vv v12, v14, v12
-; LMULMAX2-RV32-NEXT: vsll.vx v14, v8, a2
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v16, a4
-; LMULMAX2-RV32-NEXT: vmerge.vim v16, v16, 0, v0
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v14, v14, v16
-; LMULMAX2-RV32-NEXT: vsll.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v14
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12
+; LMULMAX2-RV32-NEXT: vsll.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a3
+; LMULMAX2-RV32-NEXT: vsll.vx v16, v16, a2
+; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v16
+; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a4
+; LMULMAX2-RV32-NEXT: vsll.vi v16, v16, 24
+; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 8
+; LMULMAX2-RV32-NEXT: vor.vv v8, v16, v8
+; LMULMAX2-RV32-NEXT: vor.vv v8, v12, v8
; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4
; LMULMAX2-RV32-NEXT: lui a1, 61681
@@ -805,25 +770,22 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a3
; LMULMAX2-RV64-NEXT: vor.vv v10, v12, v10
; LMULMAX2-RV64-NEXT: vsrl.vi v12, v8, 24
-; LMULMAX2-RV64-NEXT: lui a3, 4080
-; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a3
+; LMULMAX2-RV64-NEXT: lui a4, 4080
+; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a4
; LMULMAX2-RV64-NEXT: vsrl.vi v14, v8, 8
-; LMULMAX2-RV64-NEXT: li a3, 255
-; LMULMAX2-RV64-NEXT: slli a4, a3, 24
-; LMULMAX2-RV64-NEXT: vand.vx v14, v14, a4
+; LMULMAX2-RV64-NEXT: li a5, 255
+; LMULMAX2-RV64-NEXT: slli a5, a5, 24
+; LMULMAX2-RV64-NEXT: vand.vx v14, v14, a5
; LMULMAX2-RV64-NEXT: vor.vv v12, v14, v12
; LMULMAX2-RV64-NEXT: vor.vv v10, v12, v10
-; LMULMAX2-RV64-NEXT: vsll.vi v12, v8, 8
-; LMULMAX2-RV64-NEXT: slli a4, a3, 32
-; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a4
-; LMULMAX2-RV64-NEXT: vsll.vi v14, v8, 24
-; LMULMAX2-RV64-NEXT: slli a4, a3, 40
-; LMULMAX2-RV64-NEXT: vand.vx v14, v14, a4
+; LMULMAX2-RV64-NEXT: vand.vx v12, v8, a5
+; LMULMAX2-RV64-NEXT: vsll.vi v12, v12, 8
+; LMULMAX2-RV64-NEXT: vand.vx v14, v8, a4
+; LMULMAX2-RV64-NEXT: vsll.vi v14, v14, 24
; LMULMAX2-RV64-NEXT: vor.vv v12, v14, v12
; LMULMAX2-RV64-NEXT: vsll.vx v14, v8, a1
+; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a3
; LMULMAX2-RV64-NEXT: vsll.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: slli a1, a3, 48
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
; LMULMAX2-RV64-NEXT: vor.vv v8, v14, v8
; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI5_0)
; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI5_0)(a1)
@@ -855,19 +817,19 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-RV32-NEXT: addi a1, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v12, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v10, (a1)
; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
; LMULMAX1-RV32-NEXT: li a2, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v9, v12, a2
+; LMULMAX1-RV32-NEXT: vsrl.vx v9, v10, a2
; LMULMAX1-RV32-NEXT: li a3, 40
-; LMULMAX1-RV32-NEXT: vsrl.vx v10, v12, a3
+; LMULMAX1-RV32-NEXT: vsrl.vx v11, v10, a3
; LMULMAX1-RV32-NEXT: lui a4, 16
; LMULMAX1-RV32-NEXT: addi a4, a4, -256
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a4
-; LMULMAX1-RV32-NEXT: vor.vv v10, v10, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v12, 24
+; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a4
+; LMULMAX1-RV32-NEXT: vor.vv v11, v11, v9
+; LMULMAX1-RV32-NEXT: vsrl.vi v9, v10, 24
; LMULMAX1-RV32-NEXT: lui a5, 4080
-; LMULMAX1-RV32-NEXT: vand.vx v11, v9, a5
+; LMULMAX1-RV32-NEXT: vand.vx v12, v9, a5
; LMULMAX1-RV32-NEXT: li a6, 5
; LMULMAX1-RV32-NEXT: vmv.s.x v0, a6
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
@@ -875,102 +837,89 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV32-NEXT: lui a6, 1044480
; LMULMAX1-RV32-NEXT: vmerge.vxm v9, v9, a6, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vsrl.vi v13, v12, 8
+; LMULMAX1-RV32-NEXT: vsrl.vi v13, v10, 8
; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v9
-; LMULMAX1-RV32-NEXT: vor.vv v11, v13, v11
-; LMULMAX1-RV32-NEXT: vor.vv v13, v11, v10
-; LMULMAX1-RV32-NEXT: li a6, 255
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a6
-; LMULMAX1-RV32-NEXT: vmerge.vim v10, v10, 0, v0
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vsll.vi v11, v12, 8
-; LMULMAX1-RV32-NEXT: vand.vv v14, v11, v10
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v11, a4
-; LMULMAX1-RV32-NEXT: vmerge.vim v11, v11, 0, v0
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vsll.vi v15, v12, 24
-; LMULMAX1-RV32-NEXT: vand.vv v15, v15, v11
-; LMULMAX1-RV32-NEXT: vor.vv v14, v15, v14
-; LMULMAX1-RV32-NEXT: vsll.vx v15, v12, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v16, a5
-; LMULMAX1-RV32-NEXT: vmerge.vim v16, v16, 0, v0
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v15, v15, v16
-; LMULMAX1-RV32-NEXT: vsll.vx v12, v12, a2
-; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v15
-; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v14
+; LMULMAX1-RV32-NEXT: vor.vv v12, v13, v12
+; LMULMAX1-RV32-NEXT: vor.vv v11, v12, v11
+; LMULMAX1-RV32-NEXT: vsll.vx v12, v10, a2
+; LMULMAX1-RV32-NEXT: vand.vx v13, v10, a4
+; LMULMAX1-RV32-NEXT: vsll.vx v13, v13, a3
; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT: vsrl.vi v13, v12, 4
+; LMULMAX1-RV32-NEXT: vand.vx v13, v10, a5
+; LMULMAX1-RV32-NEXT: vsll.vi v13, v13, 24
+; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v9
+; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 8
+; LMULMAX1-RV32-NEXT: vor.vv v10, v13, v10
+; LMULMAX1-RV32-NEXT: vor.vv v10, v12, v10
+; LMULMAX1-RV32-NEXT: vor.vv v10, v10, v11
+; LMULMAX1-RV32-NEXT: vsrl.vi v11, v10, 4
; LMULMAX1-RV32-NEXT: lui a6, 61681
; LMULMAX1-RV32-NEXT: addi a6, a6, -241
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v14, a6
+; LMULMAX1-RV32-NEXT: vmv.v.x v12, a6
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v14
-; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v14
-; LMULMAX1-RV32-NEXT: vsll.vi v12, v12, 4
-; LMULMAX1-RV32-NEXT: vor.vv v12, v13, v12
-; LMULMAX1-RV32-NEXT: vsrl.vi v13, v12, 2
+; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v12
+; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v12
+; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 4
+; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT: vsrl.vi v11, v10, 2
; LMULMAX1-RV32-NEXT: lui a6, 209715
; LMULMAX1-RV32-NEXT: addi a6, a6, 819
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v15, a6
+; LMULMAX1-RV32-NEXT: vmv.v.x v13, a6
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v15
-; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v15
-; LMULMAX1-RV32-NEXT: vsll.vi v12, v12, 2
-; LMULMAX1-RV32-NEXT: vor.vv v12, v13, v12
-; LMULMAX1-RV32-NEXT: vsrl.vi v13, v12, 1
+; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v13
+; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v13
+; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 2
+; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT: vsrl.vi v11, v10, 1
; LMULMAX1-RV32-NEXT: lui a6, 349525
; LMULMAX1-RV32-NEXT: addi a6, a6, 1365
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v17, a6
+; LMULMAX1-RV32-NEXT: vmv.v.x v14, a6
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v17
-; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v17
-; LMULMAX1-RV32-NEXT: vadd.vv v12, v12, v12
-; LMULMAX1-RV32-NEXT: vor.vv v12, v13, v12
-; LMULMAX1-RV32-NEXT: vsrl.vx v13, v8, a2
-; LMULMAX1-RV32-NEXT: vsrl.vx v18, v8, a3
-; LMULMAX1-RV32-NEXT: vand.vx v18, v18, a4
-; LMULMAX1-RV32-NEXT: vor.vv v13, v18, v13
-; LMULMAX1-RV32-NEXT: vsrl.vi v18, v8, 24
-; LMULMAX1-RV32-NEXT: vand.vx v18, v18, a5
-; LMULMAX1-RV32-NEXT: vsrl.vi v19, v8, 8
-; LMULMAX1-RV32-NEXT: vand.vv v9, v19, v9
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v18
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v13
-; LMULMAX1-RV32-NEXT: vsll.vi v13, v8, 8
-; LMULMAX1-RV32-NEXT: vand.vv v10, v13, v10
-; LMULMAX1-RV32-NEXT: vsll.vi v13, v8, 24
-; LMULMAX1-RV32-NEXT: vand.vv v11, v13, v11
+; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v14
+; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v14
+; LMULMAX1-RV32-NEXT: vadd.vv v10, v10, v10
; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT: vsll.vx v11, v8, a3
-; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v16
-; LMULMAX1-RV32-NEXT: vsll.vx v8, v8, a2
+; LMULMAX1-RV32-NEXT: vsrl.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT: vsrl.vx v15, v8, a3
+; LMULMAX1-RV32-NEXT: vand.vx v15, v15, a4
+; LMULMAX1-RV32-NEXT: vor.vv v11, v15, v11
+; LMULMAX1-RV32-NEXT: vsrl.vi v15, v8, 24
+; LMULMAX1-RV32-NEXT: vand.vx v15, v15, a5
+; LMULMAX1-RV32-NEXT: vsrl.vi v16, v8, 8
+; LMULMAX1-RV32-NEXT: vand.vv v16, v16, v9
+; LMULMAX1-RV32-NEXT: vor.vv v15, v16, v15
+; LMULMAX1-RV32-NEXT: vor.vv v11, v15, v11
+; LMULMAX1-RV32-NEXT: vsll.vx v15, v8, a2
+; LMULMAX1-RV32-NEXT: vand.vx v16, v8, a4
+; LMULMAX1-RV32-NEXT: vsll.vx v16, v16, a3
+; LMULMAX1-RV32-NEXT: vor.vv v15, v15, v16
+; LMULMAX1-RV32-NEXT: vand.vx v16, v8, a5
+; LMULMAX1-RV32-NEXT: vsll.vi v16, v16, 24
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 8
+; LMULMAX1-RV32-NEXT: vor.vv v8, v16, v8
+; LMULMAX1-RV32-NEXT: vor.vv v8, v15, v8
; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v14
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v14
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v12
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12
; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 4
; LMULMAX1-RV32-NEXT: vor.vv v8, v9, v8
; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v15
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v13
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v13
; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 2
; LMULMAX1-RV32-NEXT: vor.vv v8, v9, v8
; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v17
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v17
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v14
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v14
; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v8
; LMULMAX1-RV32-NEXT: vor.vv v8, v9, v8
; LMULMAX1-RV32-NEXT: vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v12, (a1)
+; LMULMAX1-RV32-NEXT: vse64.v v10, (a1)
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64-LABEL: bitreverse_v4i64:
@@ -992,43 +941,40 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV64-NEXT: vand.vx v11, v11, a5
; LMULMAX1-RV64-NEXT: vsrl.vi v12, v9, 8
; LMULMAX1-RV64-NEXT: li a6, 255
-; LMULMAX1-RV64-NEXT: slli a7, a6, 24
-; LMULMAX1-RV64-NEXT: vand.vx v12, v12, a7
+; LMULMAX1-RV64-NEXT: slli a6, a6, 24
+; LMULMAX1-RV64-NEXT: vand.vx v12, v12, a6
; LMULMAX1-RV64-NEXT: vor.vv v11, v12, v11
; LMULMAX1-RV64-NEXT: vor.vv v10, v11, v10
-; LMULMAX1-RV64-NEXT: vsll.vi v11, v9, 8
-; LMULMAX1-RV64-NEXT: slli t0, a6, 32
-; LMULMAX1-RV64-NEXT: vand.vx v11, v11, t0
-; LMULMAX1-RV64-NEXT: vsll.vi v12, v9, 24
-; LMULMAX1-RV64-NEXT: slli t1, a6, 40
-; LMULMAX1-RV64-NEXT: vand.vx v12, v12, t1
+; LMULMAX1-RV64-NEXT: vand.vx v11, v9, a6
+; LMULMAX1-RV64-NEXT: vsll.vi v11, v11, 8
+; LMULMAX1-RV64-NEXT: vand.vx v12, v9, a5
+; LMULMAX1-RV64-NEXT: vsll.vi v12, v12, 24
; LMULMAX1-RV64-NEXT: vor.vv v11, v12, v11
; LMULMAX1-RV64-NEXT: vsll.vx v12, v9, a2
+; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4
; LMULMAX1-RV64-NEXT: vsll.vx v9, v9, a3
-; LMULMAX1-RV64-NEXT: slli a6, a6, 48
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a6
; LMULMAX1-RV64-NEXT: vor.vv v9, v12, v9
-; LMULMAX1-RV64-NEXT: lui t2, %hi(.LCPI5_0)
-; LMULMAX1-RV64-NEXT: ld t2, %lo(.LCPI5_0)(t2)
+; LMULMAX1-RV64-NEXT: lui a7, %hi(.LCPI5_0)
+; LMULMAX1-RV64-NEXT: ld a7, %lo(.LCPI5_0)(a7)
; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v11
; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t2
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, t2
-; LMULMAX1-RV64-NEXT: lui t3, %hi(.LCPI5_1)
-; LMULMAX1-RV64-NEXT: ld t3, %lo(.LCPI5_1)(t3)
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a7
+; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a7
+; LMULMAX1-RV64-NEXT: lui t0, %hi(.LCPI5_1)
+; LMULMAX1-RV64-NEXT: ld t0, %lo(.LCPI5_1)(t0)
; LMULMAX1-RV64-NEXT: vsll.vi v9, v9, 4
; LMULMAX1-RV64-NEXT: vor.vv v9, v10, v9
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 2
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t3
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, t3
-; LMULMAX1-RV64-NEXT: lui t4, %hi(.LCPI5_2)
-; LMULMAX1-RV64-NEXT: ld t4, %lo(.LCPI5_2)(t4)
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t0
+; LMULMAX1-RV64-NEXT: vand.vx v9, v9, t0
+; LMULMAX1-RV64-NEXT: lui t1, %hi(.LCPI5_2)
+; LMULMAX1-RV64-NEXT: ld t1, %lo(.LCPI5_2)(t1)
; LMULMAX1-RV64-NEXT: vsll.vi v9, v9, 2
; LMULMAX1-RV64-NEXT: vor.vv v9, v10, v9
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t4
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, t4
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t1
+; LMULMAX1-RV64-NEXT: vand.vx v9, v9, t1
; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v9
; LMULMAX1-RV64-NEXT: vor.vv v9, v10, v9
; LMULMAX1-RV64-NEXT: vsrl.vx v10, v8, a2
@@ -1038,33 +984,33 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV64-NEXT: vsrl.vi v11, v8, 24
; LMULMAX1-RV64-NEXT: vand.vx v11, v11, a5
; LMULMAX1-RV64-NEXT: vsrl.vi v12, v8, 8
-; LMULMAX1-RV64-NEXT: vand.vx v12, v12, a7
+; LMULMAX1-RV64-NEXT: vand.vx v12, v12, a6
; LMULMAX1-RV64-NEXT: vor.vv v11, v12, v11
; LMULMAX1-RV64-NEXT: vor.vv v10, v11, v10
-; LMULMAX1-RV64-NEXT: vsll.vi v11, v8, 8
-; LMULMAX1-RV64-NEXT: vand.vx v11, v11, t0
-; LMULMAX1-RV64-NEXT: vsll.vi v12, v8, 24
-; LMULMAX1-RV64-NEXT: vand.vx v12, v12, t1
+; LMULMAX1-RV64-NEXT: vand.vx v11, v8, a6
+; LMULMAX1-RV64-NEXT: vsll.vi v11, v11, 8
+; LMULMAX1-RV64-NEXT: vand.vx v12, v8, a5
+; LMULMAX1-RV64-NEXT: vsll.vi v12, v12, 24
; LMULMAX1-RV64-NEXT: vor.vv v11, v12, v11
; LMULMAX1-RV64-NEXT: vsll.vx v12, v8, a2
+; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4
; LMULMAX1-RV64-NEXT: vsll.vx v8, v8, a3
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a6
; LMULMAX1-RV64-NEXT: vor.vv v8, v12, v8
; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v11
; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, t2
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a7
+; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a7
; LMULMAX1-RV64-NEXT: vsll.vi v8, v8, 4
; LMULMAX1-RV64-NEXT: vor.vv v8, v10, v8
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t3
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, t3
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t0
+; LMULMAX1-RV64-NEXT: vand.vx v8, v8, t0
; LMULMAX1-RV64-NEXT: vsll.vi v8, v8, 2
; LMULMAX1-RV64-NEXT: vor.vv v8, v10, v8
; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t4
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, t4
+; LMULMAX1-RV64-NEXT: vand.vx v10, v10, t1
+; LMULMAX1-RV64-NEXT: vand.vx v8, v8, t1
; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v8
; LMULMAX1-RV64-NEXT: vor.vv v8, v10, v8
; LMULMAX1-RV64-NEXT: vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
index 8251697878b84..5837dba7b8cd9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -33,9 +33,8 @@ define void @bswap_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
; RV32-NEXT: vand.vx v9, v9, a1
; RV32-NEXT: vsrl.vi v10, v8, 24
; RV32-NEXT: vor.vv v9, v9, v10
-; RV32-NEXT: vsll.vi v10, v8, 8
-; RV32-NEXT: lui a1, 4080
-; RV32-NEXT: vand.vx v10, v10, a1
+; RV32-NEXT: vand.vx v10, v8, a1
+; RV32-NEXT: vsll.vi v10, v10, 8
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vor.vv v8, v8, v9
@@ -52,9 +51,8 @@ define void @bswap_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
; RV64-NEXT: vand.vx v9, v9, a1
; RV64-NEXT: vsrl.vi v10, v8, 24
; RV64-NEXT: vor.vv v9, v9, v10
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: lui a1, 4080
-; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vand.vx v10, v8, a1
+; RV64-NEXT: vsll.vi v10, v10, 8
; RV64-NEXT: vsll.vi v8, v8, 24
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vor.vv v8, v8, v9
@@ -92,32 +90,19 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; RV32-NEXT: vmerge.vxm v11, v11, a5, v0
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vand.vv v11, v12, v11
-; RV32-NEXT: vor.vv v10, v11, v10
+; RV32-NEXT: vand.vv v12, v12, v11
+; RV32-NEXT: vor.vv v10, v12, v10
; RV32-NEXT: vor.vv v9, v10, v9
-; RV32-NEXT: li a5, 255
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a5
-; RV32-NEXT: vmerge.vim v10, v10, 0, v0
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vsll.vi v11, v8, 8
-; RV32-NEXT: vand.vv v10, v11, v10
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v11, a3
-; RV32-NEXT: vmerge.vim v11, v11, 0, v0
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vsll.vi v12, v8, 24
-; RV32-NEXT: vand.vv v11, v12, v11
-; RV32-NEXT: vor.vv v10, v11, v10
-; RV32-NEXT: vsll.vx v11, v8, a2
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v12, a4
-; RV32-NEXT: vmerge.vim v12, v12, 0, v0
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vand.vv v11, v11, v12
-; RV32-NEXT: vsll.vx v8, v8, a1
-; RV32-NEXT: vor.vv v8, v8, v11
-; RV32-NEXT: vor.vv v8, v8, v10
+; RV32-NEXT: vsll.vx v10, v8, a1
+; RV32-NEXT: vand.vx v12, v8, a3
+; RV32-NEXT: vsll.vx v12, v12, a2
+; RV32-NEXT: vor.vv v10, v10, v12
+; RV32-NEXT: vand.vx v12, v8, a4
+; RV32-NEXT: vsll.vi v12, v12, 24
+; RV32-NEXT: vand.vv v8, v8, v11
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v12, v8
+; RV32-NEXT: vor.vv v8, v10, v8
; RV32-NEXT: vor.vv v8, v8, v9
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
@@ -135,25 +120,22 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
; RV64-NEXT: vand.vx v10, v10, a3
; RV64-NEXT: vor.vv v9, v10, v9
; RV64-NEXT: vsrl.vi v10, v8, 24
-; RV64-NEXT: lui a3, 4080
-; RV64-NEXT: vand.vx v10, v10, a3
+; RV64-NEXT: lui a4, 4080
+; RV64-NEXT: vand.vx v10, v10, a4
; RV64-NEXT: vsrl.vi v11, v8, 8
-; RV64-NEXT: li a3, 255
-; RV64-NEXT: slli a4, a3, 24
-; RV64-NEXT: vand.vx v11, v11, a4
+; RV64-NEXT: li a5, 255
+; RV64-NEXT: slli a5, a5, 24
+; RV64-NEXT: vand.vx v11, v11, a5
; RV64-NEXT: vor.vv v10, v11, v10
; RV64-NEXT: vor.vv v9, v10, v9
-; RV64-NEXT: vsll.vi v10, v8, 8
-; RV64-NEXT: slli a4, a3, 32
-; RV64-NEXT: vand.vx v10, v10, a4
-; RV64-NEXT: vsll.vi v11, v8, 24
-; RV64-NEXT: slli a4, a3, 40
-; RV64-NEXT: vand.vx v11, v11, a4
+; RV64-NEXT: vand.vx v10, v8, a5
+; RV64-NEXT: vsll.vi v10, v10, 8
+; RV64-NEXT: vand.vx v11, v8, a4
+; RV64-NEXT: vsll.vi v11, v11, 24
; RV64-NEXT: vor.vv v10, v11, v10
; RV64-NEXT: vsll.vx v11, v8, a1
+; RV64-NEXT: vand.vx v8, v8, a3
; RV64-NEXT: vsll.vx v8, v8, a2
-; RV64-NEXT: slli a1, a3, 48
-; RV64-NEXT: vand.vx v8, v8, a1
; RV64-NEXT: vor.vv v8, v11, v8
; RV64-NEXT: vor.vv v8, v8, v10
; RV64-NEXT: vor.vv v8, v8, v9
@@ -238,9 +220,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1
; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 24
; LMULMAX2-RV32-NEXT: vor.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT: vsll.vi v12, v8, 8
-; LMULMAX2-RV32-NEXT: lui a1, 4080
-; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a1
+; LMULMAX2-RV32-NEXT: vand.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT: vsll.vi v12, v12, 8
; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 24
; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12
; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
@@ -257,9 +238,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1
; LMULMAX2-RV64-NEXT: vsrl.vi v12, v8, 24
; LMULMAX2-RV64-NEXT: vor.vv v10, v10, v12
-; LMULMAX2-RV64-NEXT: vsll.vi v12, v8, 8
-; LMULMAX2-RV64-NEXT: lui a1, 4080
-; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a1
+; LMULMAX2-RV64-NEXT: vand.vx v12, v8, a1
+; LMULMAX2-RV64-NEXT: vsll.vi v12, v12, 8
; LMULMAX2-RV64-NEXT: vsll.vi v8, v8, 24
; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v12
; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
@@ -278,9 +258,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2
; LMULMAX1-RV32-NEXT: vsrl.vi v11, v8, 24
; LMULMAX1-RV32-NEXT: vor.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT: vsll.vi v11, v8, 8
-; LMULMAX1-RV32-NEXT: lui a3, 4080
-; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a3
+; LMULMAX1-RV32-NEXT: vand.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT: vsll.vi v11, v11, 8
; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 24
; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v11
; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
@@ -288,8 +267,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2
; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 24
; LMULMAX1-RV32-NEXT: vor.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT: vsll.vi v11, v9, 8
-; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a3
+; LMULMAX1-RV32-NEXT: vand.vx v11, v9, a2
+; LMULMAX1-RV32-NEXT: vsll.vi v11, v11, 8
; LMULMAX1-RV32-NEXT: vsll.vi v9, v9, 24
; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v11
; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10
@@ -309,9 +288,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2
; LMULMAX1-RV64-NEXT: vsrl.vi v11, v8, 24
; LMULMAX1-RV64-NEXT: vor.vv v10, v10, v11
-; LMULMAX1-RV64-NEXT: vsll.vi v11, v8, 8
-; LMULMAX1-RV64-NEXT: lui a3, 4080
-; LMULMAX1-RV64-NEXT: vand.vx v11, v11, a3
+; LMULMAX1-RV64-NEXT: vand.vx v11, v8, a2
+; LMULMAX1-RV64-NEXT: vsll.vi v11, v11, 8
; LMULMAX1-RV64-NEXT: vsll.vi v8, v8, 24
; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v11
; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
@@ -319,8 +297,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2
; LMULMAX1-RV64-NEXT: vsrl.vi v11, v9, 24
; LMULMAX1-RV64-NEXT: vor.vv v10, v10, v11
-; LMULMAX1-RV64-NEXT: vsll.vi v11, v9, 8
-; LMULMAX1-RV64-NEXT: vand.vx v11, v11, a3
+; LMULMAX1-RV64-NEXT: vand.vx v11, v9, a2
+; LMULMAX1-RV64-NEXT: vsll.vi v11, v11, 8
; LMULMAX1-RV64-NEXT: vsll.vi v9, v9, 24
; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v11
; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
@@ -359,32 +337,19 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX2-RV32-NEXT: vmerge.vxm v14, v14, a5, v0
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 8
-; LMULMAX2-RV32-NEXT: vand.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT: vor.vv v12, v14, v12
+; LMULMAX2-RV32-NEXT: vand.vv v16, v16, v14
+; LMULMAX2-RV32-NEXT: vor.vv v12, v16, v12
; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT: li a5, 255
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v12, a5
-; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 0, v0
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vsll.vi v14, v8, 8
-; LMULMAX2-RV32-NEXT: vand.vv v12, v14, v12
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v14, a3
-; LMULMAX2-RV32-NEXT: vmerge.vim v14, v14, 0, v0
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vsll.vi v16, v8, 24
-; LMULMAX2-RV32-NEXT: vand.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT: vor.vv v12, v14, v12
-; LMULMAX2-RV32-NEXT: vsll.vx v14, v8, a2
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v16, a4
-; LMULMAX2-RV32-NEXT: vmerge.vim v16, v16, 0, v0
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v14, v14, v16
-; LMULMAX2-RV32-NEXT: vsll.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v14
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12
+; LMULMAX2-RV32-NEXT: vsll.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a3
+; LMULMAX2-RV32-NEXT: vsll.vx v16, v16, a2
+; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v16
+; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a4
+; LMULMAX2-RV32-NEXT: vsll.vi v16, v16, 24
+; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 8
+; LMULMAX2-RV32-NEXT: vor.vv v8, v16, v8
+; LMULMAX2-RV32-NEXT: vor.vv v8, v12, v8
; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
; LMULMAX2-RV32-NEXT: vse64.v v8, (a0)
; LMULMAX2-RV32-NEXT: ret
@@ -402,25 +367,22 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a3
; LMULMAX2-RV64-NEXT: vor.vv v10, v12, v10
; LMULMAX2-RV64-NEXT: vsrl.vi v12, v8, 24
-; LMULMAX2-RV64-NEXT: lui a3, 4080
-; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a3
+; LMULMAX2-RV64-NEXT: lui a4, 4080
+; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a4
; LMULMAX2-RV64-NEXT: vsrl.vi v14, v8, 8
-; LMULMAX2-RV64-NEXT: li a3, 255
-; LMULMAX2-RV64-NEXT: slli a4, a3, 24
-; LMULMAX2-RV64-NEXT: vand.vx v14, v14, a4
+; LMULMAX2-RV64-NEXT: li a5, 255
+; LMULMAX2-RV64-NEXT: slli a5, a5, 24
+; LMULMAX2-RV64-NEXT: vand.vx v14, v14, a5
; LMULMAX2-RV64-NEXT: vor.vv v12, v14, v12
; LMULMAX2-RV64-NEXT: vor.vv v10, v12, v10
-; LMULMAX2-RV64-NEXT: vsll.vi v12, v8, 8
-; LMULMAX2-RV64-NEXT: slli a4, a3, 32
-; LMULMAX2-RV64-NEXT: vand.vx v12, v12, a4
-; LMULMAX2-RV64-NEXT: vsll.vi v14, v8, 24
-; LMULMAX2-RV64-NEXT: slli a4, a3, 40
-; LMULMAX2-RV64-NEXT: vand.vx v14, v14, a4
+; LMULMAX2-RV64-NEXT: vand.vx v12, v8, a5
+; LMULMAX2-RV64-NEXT: vsll.vi v12, v12, 8
+; LMULMAX2-RV64-NEXT: vand.vx v14, v8, a4
+; LMULMAX2-RV64-NEXT: vsll.vi v14, v14, 24
; LMULMAX2-RV64-NEXT: vor.vv v12, v14, v12
; LMULMAX2-RV64-NEXT: vsll.vx v14, v8, a1
+; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a3
; LMULMAX2-RV64-NEXT: vsll.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: slli a1, a3, 48
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
; LMULMAX2-RV64-NEXT: vor.vv v8, v14, v8
; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v12
; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
@@ -431,17 +393,17 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-RV32-NEXT: addi a1, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v9, (a1)
-; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
; LMULMAX1-RV32-NEXT: li a2, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v10, v9, a2
+; LMULMAX1-RV32-NEXT: vsrl.vx v10, v8, a2
; LMULMAX1-RV32-NEXT: li a3, 40
-; LMULMAX1-RV32-NEXT: vsrl.vx v11, v9, a3
+; LMULMAX1-RV32-NEXT: vsrl.vx v11, v8, a3
; LMULMAX1-RV32-NEXT: lui a4, 16
; LMULMAX1-RV32-NEXT: addi a4, a4, -256
; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a4
; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 24
+; LMULMAX1-RV32-NEXT: vsrl.vi v11, v8, 24
; LMULMAX1-RV32-NEXT: lui a5, 4080
; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a5
; LMULMAX1-RV32-NEXT: li a6, 5
@@ -451,57 +413,44 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV32-NEXT: lui a6, 1044480
; LMULMAX1-RV32-NEXT: vmerge.vxm v12, v12, a6, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vsrl.vi v13, v8, 8
+; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v12
+; LMULMAX1-RV32-NEXT: vor.vv v11, v13, v11
+; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT: vsll.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT: vand.vx v13, v8, a4
+; LMULMAX1-RV32-NEXT: vsll.vx v13, v13, a3
+; LMULMAX1-RV32-NEXT: vor.vv v11, v11, v13
+; LMULMAX1-RV32-NEXT: vand.vx v13, v8, a5
+; LMULMAX1-RV32-NEXT: vsll.vi v13, v13, 24
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 8
+; LMULMAX1-RV32-NEXT: vor.vv v8, v13, v8
+; LMULMAX1-RV32-NEXT: vor.vv v8, v11, v8
+; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vsrl.vx v10, v9, a2
+; LMULMAX1-RV32-NEXT: vsrl.vx v11, v9, a3
+; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a4
+; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 24
+; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a5
; LMULMAX1-RV32-NEXT: vsrl.vi v13, v9, 8
; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v12
; LMULMAX1-RV32-NEXT: vor.vv v11, v13, v11
; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT: li a6, 255
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v11, a6
-; LMULMAX1-RV32-NEXT: vmerge.vim v11, v11, 0, v0
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vsll.vi v13, v9, 8
-; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v11
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v14, a4
-; LMULMAX1-RV32-NEXT: vmerge.vim v14, v14, 0, v0
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vsll.vi v15, v9, 24
-; LMULMAX1-RV32-NEXT: vand.vv v15, v15, v14
-; LMULMAX1-RV32-NEXT: vor.vv v13, v15, v13
-; LMULMAX1-RV32-NEXT: vsll.vx v15, v9, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v16, a5
-; LMULMAX1-RV32-NEXT: vmerge.vim v16, v16, 0, v0
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v15, v15, v16
-; LMULMAX1-RV32-NEXT: vsll.vx v9, v9, a2
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v13
+; LMULMAX1-RV32-NEXT: vsll.vx v11, v9, a2
+; LMULMAX1-RV32-NEXT: vand.vx v13, v9, a4
+; LMULMAX1-RV32-NEXT: vsll.vx v13, v13, a3
+; LMULMAX1-RV32-NEXT: vor.vv v11, v11, v13
+; LMULMAX1-RV32-NEXT: vand.vx v13, v9, a5
+; LMULMAX1-RV32-NEXT: vsll.vi v13, v13, 24
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v12
+; LMULMAX1-RV32-NEXT: vsll.vi v9, v9, 8
+; LMULMAX1-RV32-NEXT: vor.vv v9, v13, v9
+; LMULMAX1-RV32-NEXT: vor.vv v9, v11, v9
; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vx v10, v8, a2
-; LMULMAX1-RV32-NEXT: vsrl.vx v13, v8, a3
-; LMULMAX1-RV32-NEXT: vand.vx v13, v13, a4
-; LMULMAX1-RV32-NEXT: vor.vv v10, v13, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v13, v8, 24
-; LMULMAX1-RV32-NEXT: vand.vx v13, v13, a5
-; LMULMAX1-RV32-NEXT: vsrl.vi v15, v8, 8
-; LMULMAX1-RV32-NEXT: vand.vv v12, v15, v12
-; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT: vor.vv v10, v12, v10
-; LMULMAX1-RV32-NEXT: vsll.vi v12, v8, 8
-; LMULMAX1-RV32-NEXT: vand.vv v11, v12, v11
-; LMULMAX1-RV32-NEXT: vsll.vi v12, v8, 24
-; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v14
-; LMULMAX1-RV32-NEXT: vor.vv v11, v12, v11
-; LMULMAX1-RV32-NEXT: vsll.vx v12, v8, a3
-; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v16
-; LMULMAX1-RV32-NEXT: vsll.vx v8, v8, a2
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v12
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v9, (a1)
+; LMULMAX1-RV32-NEXT: vse64.v v9, (a0)
+; LMULMAX1-RV32-NEXT: vse64.v v8, (a1)
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64-LABEL: bswap_v4i64:
@@ -523,21 +472,18 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV64-NEXT: vand.vx v11, v11, a5
; LMULMAX1-RV64-NEXT: vsrl.vi v12, v8, 8
; LMULMAX1-RV64-NEXT: li a6, 255
-; LMULMAX1-RV64-NEXT: slli a7, a6, 24
-; LMULMAX1-RV64-NEXT: vand.vx v12, v12, a7
+; LMULMAX1-RV64-NEXT: slli a6, a6, 24
+; LMULMAX1-RV64-NEXT: vand.vx v12, v12, a6
; LMULMAX1-RV64-NEXT: vor.vv v11, v12, v11
; LMULMAX1-RV64-NEXT: vor.vv v10, v11, v10
-; LMULMAX1-RV64-NEXT: vsll.vi v11, v8, 8
-; LMULMAX1-RV64-NEXT: slli t0, a6, 32
-; LMULMAX1-RV64-NEXT: vand.vx v11, v11, t0
-; LMULMAX1-RV64-NEXT: vsll.vi v12, v8, 24
-; LMULMAX1-RV64-NEXT: slli t1, a6, 40
-; LMULMAX1-RV64-NEXT: vand.vx v12, v12, t1
+; LMULMAX1-RV64-NEXT: vand.vx v11, v8, a6
+; LMULMAX1-RV64-NEXT: vsll.vi v11, v11, 8
+; LMULMAX1-RV64-NEXT: vand.vx v12, v8, a5
+; LMULMAX1-RV64-NEXT: vsll.vi v12, v12, 24
; LMULMAX1-RV64-NEXT: vor.vv v11, v12, v11
; LMULMAX1-RV64-NEXT: vsll.vx v12, v8, a2
+; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4
; LMULMAX1-RV64-NEXT: vsll.vx v8, v8, a3
-; LMULMAX1-RV64-NEXT: slli a6, a6, 48
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a6
; LMULMAX1-RV64-NEXT: vor.vv v8, v12, v8
; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v11
; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
@@ -548,17 +494,17 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
; LMULMAX1-RV64-NEXT: vsrl.vi v11, v9, 24
; LMULMAX1-RV64-NEXT: vand.vx v11, v11, a5
; LMULMAX1-RV64-NEXT: vsrl.vi v12, v9, 8
-; LMULMAX1-RV64-NEXT: vand.vx v12, v12, a7
+; LMULMAX1-RV64-NEXT: vand.vx v12, v12, a6
; LMULMAX1-RV64-NEXT: vor.vv v11, v12, v11
; LMULMAX1-RV64-NEXT: vor.vv v10, v11, v10
-; LMULMAX1-RV64-NEXT: vsll.vi v11, v9, 8
-; LMULMAX1-RV64-NEXT: vand.vx v11, v11, t0
-; LMULMAX1-RV64-NEXT: vsll.vi v12, v9, 24
-; LMULMAX1-RV64-NEXT: vand.vx v12, v12, t1
+; LMULMAX1-RV64-NEXT: vand.vx v11, v9, a6
+; LMULMAX1-RV64-NEXT: vsll.vi v11, v11, 8
+; LMULMAX1-RV64-NEXT: vand.vx v12, v9, a5
+; LMULMAX1-RV64-NEXT: vsll.vi v12, v12, 24
; LMULMAX1-RV64-NEXT: vor.vv v11, v12, v11
; LMULMAX1-RV64-NEXT: vsll.vx v12, v9, a2
+; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4
; LMULMAX1-RV64-NEXT: vsll.vx v9, v9, a3
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a6
; LMULMAX1-RV64-NEXT: vor.vv v9, v12, v9
; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v11
; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
More information about the llvm-commits
mailing list