[llvm] f387918 - [TargetLowering][RISCV][ARM][AArch64][Mips] Reduce the number of AND mask constants used by BSWAP expansion.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 15 14:36:12 PST 2022


Author: Craig Topper
Date: 2022-11-15T14:36:01-08:00
New Revision: f387918dd8549331a4f60df70cccd9558eca8df1

URL: https://github.com/llvm/llvm-project/commit/f387918dd8549331a4f60df70cccd9558eca8df1
DIFF: https://github.com/llvm/llvm-project/commit/f387918dd8549331a4f60df70cccd9558eca8df1.diff

LOG: [TargetLowering][RISCV][ARM][AArch64][Mips] Reduce the number of AND mask constants used by BSWAP expansion.

We can reuse the same mask constants by using SRL followed by AND, and AND followed by SHL.
A similar change was made for bitreverse previously.

Differential Revision: https://reviews.llvm.org/D138045
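
For illustration only (this sketch is not part of the commit), the effect of the change on the
i32 expansion, written as scalar C++. The function names are hypothetical; they just contrast
the old shift-then-mask ordering, which needs two different constants, with the new
mask-then-shift ordering, which reuses a single 0xFF00 mask:

    #include <cstdint>

    // Old ordering: the left-shifted byte needs its own mask, 0xFF0000.
    uint32_t bswap32_old(uint32_t x) {
      return (x << 24) |
             ((x << 8) & 0xFF0000u) |   // second AND constant required
             ((x >> 8) & 0xFF00u)   |
             (x >> 24);
    }

    // New ordering: AND before SHL, so 0xFF00 is reused for both directions.
    uint32_t bswap32_new(uint32_t x) {
      return (x << 24) |
             ((x & 0xFF00u) << 8) |     // same 0xFF00 constant as below
             ((x >> 8) & 0xFF00u) |
             (x >> 24);
    }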

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
    llvm/test/CodeGen/ARM/load-combine-big-endian.ll
    llvm/test/CodeGen/ARM/load-combine.ll
    llvm/test/CodeGen/Mips/bswap.ll
    llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
    llvm/test/CodeGen/RISCV/rv32zbb.ll
    llvm/test/CodeGen/RISCV/rv64zbb.ll
    llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4a34909dbcb69..d5f624a20b68a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8348,36 +8348,36 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const {
     return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
   case MVT::i32:
     Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op,
+                       DAG.getConstant(0xFF00, dl, VT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(8, dl, SHVT));
     Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
-    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
-    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3,
-                       DAG.getConstant(0xFF0000, dl, VT));
     Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(0xFF00, dl, VT));
+    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
     Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
     Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
     return DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
   case MVT::i64:
     Tmp8 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
-    Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(40, dl, SHVT));
-    Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
-    Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+    Tmp7 = DAG.getNode(ISD::AND, dl, VT, Op,
+                       DAG.getConstant(255ULL<<8, dl, VT));
+    Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Tmp7, DAG.getConstant(40, dl, SHVT));
+    Tmp6 = DAG.getNode(ISD::AND, dl, VT, Op,
+                       DAG.getConstant(255ULL<<16, dl, VT));
+    Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Tmp6, DAG.getConstant(24, dl, SHVT));
+    Tmp5 = DAG.getNode(ISD::AND, dl, VT, Op,
+                       DAG.getConstant(255ULL<<24, dl, VT));
+    Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Tmp5, DAG.getConstant(8, dl, SHVT));
     Tmp4 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
-    Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, dl, SHVT));
-    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
-    Tmp7 = DAG.getNode(ISD::AND, dl, VT, Tmp7,
-                       DAG.getConstant(255ULL<<48, dl, VT));
-    Tmp6 = DAG.getNode(ISD::AND, dl, VT, Tmp6,
-                       DAG.getConstant(255ULL<<40, dl, VT));
-    Tmp5 = DAG.getNode(ISD::AND, dl, VT, Tmp5,
-                       DAG.getConstant(255ULL<<32, dl, VT));
     Tmp4 = DAG.getNode(ISD::AND, dl, VT, Tmp4,
                        DAG.getConstant(255ULL<<24, dl, VT));
+    Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
     Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3,
                        DAG.getConstant(255ULL<<16, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, dl, SHVT));
     Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2,
-                       DAG.getConstant(255ULL<<8 , dl, VT));
+                       DAG.getConstant(255ULL<<8, dl, VT));
+    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
     Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp7);
     Tmp6 = DAG.getNode(ISD::OR, dl, VT, Tmp6, Tmp5);
     Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);

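As a hedged scalar C++ sketch of the i64 pattern the rewritten expansion corresponds to
(illustrative only, not code from this commit): every AND mask is now one of 0xFF00,
0xFF0000, or 0xFF000000, used once after a right shift and once before a left shift, so the
wide constants 255<<32, 255<<40, and 255<<48 no longer need to be materialized:

    #include <cstdint>

    // Illustrative sketch of the new i64 BSWAP expansion; each byte mask is
    // shared between the shift-right half and the shift-left half.
    uint64_t bswap64_sketch(uint64_t x) {
      return (x << 56) |
             ((x & 0xFF00ULL)     << 40) |
             ((x & 0xFF0000ULL)   << 24) |
             ((x & 0xFF000000ULL) <<  8) |
             ((x >>  8) & 0xFF000000ULL) |
             ((x >> 24) & 0xFF0000ULL)   |
             ((x >> 40) & 0xFF00ULL)     |
             (x >> 56);
    }
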
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index 62d18bd92b0a7..1fba00d9f7b6c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -201,29 +201,27 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) #0 {
 ; CHECK-LABEL: bswap_v2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI14_0
-; CHECK-NEXT:    adrp x9, .LCPI14_1
 ; CHECK-NEXT:    adrp x10, .LCPI14_2
+; CHECK-NEXT:    adrp x9, .LCPI14_1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI14_0]
 ; CHECK-NEXT:    adrp x8, .LCPI14_3
+; CHECK-NEXT:    ldr d3, [x10, :lo12:.LCPI14_2]
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    lsr z4.s, p0/m, z4.s, z1.s
 ; CHECK-NEXT:    ldr d2, [x9, :lo12:.LCPI14_1]
 ; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    lsr z5.s, p0/m, z5.s, z1.s
-; CHECK-NEXT:    ldr d3, [x10, :lo12:.LCPI14_2]
-; CHECK-NEXT:    movprfx z6, z0
-; CHECK-NEXT:    lsr z6.s, p0/m, z6.s, z2.s
-; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI14_3]
-; CHECK-NEXT:    adrp x8, .LCPI14_4
+; CHECK-NEXT:    lsr z5.s, p0/m, z5.s, z2.s
 ; CHECK-NEXT:    lslr z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT:    and z0.d, z0.d, z3.d
+; CHECK-NEXT:    and z3.d, z5.d, z3.d
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT:    and z2.d, z6.d, z3.d
-; CHECK-NEXT:    and z0.d, z0.d, z4.d
-; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI14_4]
-; CHECK-NEXT:    orr z2.d, z2.d, z5.d
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI14_3]
+; CHECK-NEXT:    orr z3.d, z3.d, z4.d
 ; CHECK-NEXT:    orr z0.d, z1.d, z0.d
-; CHECK-NEXT:    orr z0.d, z0.d, z2.d
-; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z3.s
+; CHECK-NEXT:    orr z0.d, z0.d, z3.d
+; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op)
@@ -290,25 +288,24 @@ define void @bswap_v16i16(<16 x i16>* %a) #0 {
 define <2 x i32> @bswap_v2i32(<2 x i32> %op) #0 {
 ; CHECK-LABEL: bswap_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI18_1
-; CHECK-NEXT:    adrp x9, .LCPI18_2
-; CHECK-NEXT:    adrp x10, .LCPI18_0
+; CHECK-NEXT:    adrp x8, .LCPI18_0
+; CHECK-NEXT:    adrp x10, .LCPI18_2
+; CHECK-NEXT:    adrp x9, .LCPI18_1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI18_1]
-; CHECK-NEXT:    adrp x8, .LCPI18_3
-; CHECK-NEXT:    ldr d2, [x9, :lo12:.LCPI18_2]
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    lsr z4.s, p0/m, z4.s, z1.s
+; CHECK-NEXT:    ldr d3, [x10, :lo12:.LCPI18_2]
+; CHECK-NEXT:    ldr d2, [x9, :lo12:.LCPI18_1]
 ; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    lsr z5.s, p0/m, z5.s, z1.s
-; CHECK-NEXT:    ldr d3, [x10, :lo12:.LCPI18_0]
-; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI18_3]
-; CHECK-NEXT:    and z2.d, z5.d, z2.d
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    lsr z5.s, p0/m, z5.s, z3.s
-; CHECK-NEXT:    lslr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT:    and z1.d, z1.d, z4.d
-; CHECK-NEXT:    orr z2.d, z2.d, z5.d
+; CHECK-NEXT:    lsr z5.s, p0/m, z5.s, z2.s
+; CHECK-NEXT:    and z5.d, z5.d, z3.d
+; CHECK-NEXT:    and z3.d, z0.d, z3.d
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    lsl z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    orr z2.d, z5.d, z4.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z2.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -320,25 +317,24 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) #0 {
 define <4 x i32> @bswap_v4i32(<4 x i32> %op) #0 {
 ; CHECK-LABEL: bswap_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI19_1
-; CHECK-NEXT:    adrp x9, .LCPI19_2
-; CHECK-NEXT:    adrp x10, .LCPI19_0
+; CHECK-NEXT:    adrp x8, .LCPI19_0
+; CHECK-NEXT:    adrp x10, .LCPI19_2
+; CHECK-NEXT:    adrp x9, .LCPI19_1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI19_1]
-; CHECK-NEXT:    adrp x8, .LCPI19_3
-; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI19_2]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    lsr z4.s, p0/m, z4.s, z1.s
+; CHECK-NEXT:    ldr q3, [x10, :lo12:.LCPI19_2]
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI19_1]
 ; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    lsr z5.s, p0/m, z5.s, z1.s
-; CHECK-NEXT:    ldr q3, [x10, :lo12:.LCPI19_0]
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI19_3]
-; CHECK-NEXT:    and z2.d, z5.d, z2.d
-; CHECK-NEXT:    movprfx z5, z0
-; CHECK-NEXT:    lsr z5.s, p0/m, z5.s, z3.s
-; CHECK-NEXT:    lslr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT:    and z1.d, z1.d, z4.d
-; CHECK-NEXT:    orr z2.d, z2.d, z5.d
+; CHECK-NEXT:    lsr z5.s, p0/m, z5.s, z2.s
+; CHECK-NEXT:    and z5.d, z5.d, z3.d
+; CHECK-NEXT:    and z3.d, z0.d, z3.d
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    lsl z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    orr z2.d, z5.d, z4.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z2.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -352,35 +348,33 @@ define void @bswap_v8i32(<8 x i32>* %a) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI20_0
 ; CHECK-NEXT:    adrp x9, .LCPI20_1
-; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldp q4, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI20_0]
 ; CHECK-NEXT:    adrp x8, .LCPI20_2
-; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI20_1]
-; CHECK-NEXT:    movprfx z5, z2
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI20_1]
+; CHECK-NEXT:    movprfx z5, z1
 ; CHECK-NEXT:    lsr z5.s, p0/m, z5.s, z0.s
-; CHECK-NEXT:    movprfx z6, z2
-; CHECK-NEXT:    lsr z6.s, p0/m, z6.s, z1.s
-; CHECK-NEXT:    movprfx z7, z2
+; CHECK-NEXT:    movprfx z6, z1
+; CHECK-NEXT:    lsr z6.s, p0/m, z6.s, z2.s
+; CHECK-NEXT:    movprfx z7, z1
 ; CHECK-NEXT:    lsl z7.s, p0/m, z7.s, z0.s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI20_2]
-; CHECK-NEXT:    adrp x8, .LCPI20_3
-; CHECK-NEXT:    lsl z2.s, p0/m, z2.s, z1.s
-; CHECK-NEXT:    and z6.d, z6.d, z4.d
-; CHECK-NEXT:    ldr q16, [x8, :lo12:.LCPI20_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI20_2]
+; CHECK-NEXT:    movprfx z16, z4
+; CHECK-NEXT:    lsr z16.s, p0/m, z16.s, z2.s
+; CHECK-NEXT:    and z1.d, z1.d, z3.d
+; CHECK-NEXT:    and z6.d, z6.d, z3.d
+; CHECK-NEXT:    and z16.d, z16.d, z3.d
+; CHECK-NEXT:    and z3.d, z4.d, z3.d
 ; CHECK-NEXT:    orr z5.d, z6.d, z5.d
-; CHECK-NEXT:    movprfx z6, z3
-; CHECK-NEXT:    lsr z6.s, p0/m, z6.s, z1.s
-; CHECK-NEXT:    and z4.d, z6.d, z4.d
-; CHECK-NEXT:    movprfx z6, z3
+; CHECK-NEXT:    movprfx z6, z4
 ; CHECK-NEXT:    lsr z6.s, p0/m, z6.s, z0.s
-; CHECK-NEXT:    lslr z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT:    lslr z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT:    and z2.d, z2.d, z16.d
-; CHECK-NEXT:    and z1.d, z1.d, z16.d
-; CHECK-NEXT:    orr z3.d, z4.d, z6.d
-; CHECK-NEXT:    orr z0.d, z0.d, z1.d
-; CHECK-NEXT:    orr z1.d, z7.d, z2.d
+; CHECK-NEXT:    lslr z0.s, p0/m, z0.s, z4.s
+; CHECK-NEXT:    lsl z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    lslr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    orr z3.d, z16.d, z6.d
+; CHECK-NEXT:    orr z0.d, z0.d, z2.d
+; CHECK-NEXT:    orr z1.d, z7.d, z1.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z3.d
 ; CHECK-NEXT:    orr z1.d, z1.d, z5.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -397,48 +391,43 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) #0 {
 ; CHECK-NEXT:    mov w8, #56
 ; CHECK-NEXT:    mov w9, #40
 ; CHECK-NEXT:    mov w10, #65280
+; CHECK-NEXT:    mov w11, #24
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
 ; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    mov w8, #24
+; CHECK-NEXT:    mov w8, #16711680
 ; CHECK-NEXT:    fmov d2, x9
-; CHECK-NEXT:    mov w9, #16711680
+; CHECK-NEXT:    mov w9, #8
 ; CHECK-NEXT:    fmov d3, x10
-; CHECK-NEXT:    mov w10, #8
-; CHECK-NEXT:    fmov d4, x8
+; CHECK-NEXT:    movprfx z7, z0
+; CHECK-NEXT:    lsr z7.d, p0/m, z7.d, z1.d
+; CHECK-NEXT:    fmov d5, x8
 ; CHECK-NEXT:    mov w8, #-16777216
-; CHECK-NEXT:    fmov d5, x9
-; CHECK-NEXT:    mov x9, #1095216660480
 ; CHECK-NEXT:    movprfx z16, z0
 ; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z2.d
-; CHECK-NEXT:    and z3.d, z16.d, z3.d
-; CHECK-NEXT:    fmov d7, x8
-; CHECK-NEXT:    mov x8, #280375465082880
+; CHECK-NEXT:    fmov d4, x11
+; CHECK-NEXT:    fmov d6, x9
+; CHECK-NEXT:    and z16.d, z16.d, z3.d
+; CHECK-NEXT:    fmov d17, x8
+; CHECK-NEXT:    orr z7.d, z16.d, z7.d
 ; CHECK-NEXT:    movprfx z16, z0
 ; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z4.d
-; CHECK-NEXT:    fmov d6, x10
-; CHECK-NEXT:    and z5.d, z16.d, z5.d
-; CHECK-NEXT:    movprfx z16, z0
-; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z6.d
-; CHECK-NEXT:    fmov d18, x8
-; CHECK-NEXT:    mov x8, #71776119061217280
-; CHECK-NEXT:    and z7.d, z16.d, z7.d
-; CHECK-NEXT:    fmov d17, x9
-; CHECK-NEXT:    orr z5.d, z7.d, z5.d
-; CHECK-NEXT:    movprfx z16, z0
-; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z1.d
-; CHECK-NEXT:    fmov d7, x8
-; CHECK-NEXT:    lslr z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT:    lslr z4.d, p0/m, z4.d, z0.d
-; CHECK-NEXT:    lslr z2.d, p0/m, z2.d, z0.d
-; CHECK-NEXT:    and z6.d, z6.d, z17.d
-; CHECK-NEXT:    and z4.d, z4.d, z18.d
+; CHECK-NEXT:    movprfx z18, z0
+; CHECK-NEXT:    lsr z18.d, p0/m, z18.d, z6.d
+; CHECK-NEXT:    and z16.d, z16.d, z5.d
+; CHECK-NEXT:    and z5.d, z0.d, z5.d
+; CHECK-NEXT:    and z18.d, z18.d, z17.d
+; CHECK-NEXT:    and z17.d, z0.d, z17.d
+; CHECK-NEXT:    lslr z6.d, p0/m, z6.d, z17.d
+; CHECK-NEXT:    lslr z4.d, p0/m, z4.d, z5.d
+; CHECK-NEXT:    and z3.d, z0.d, z3.d
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    and z1.d, z2.d, z7.d
-; CHECK-NEXT:    orr z3.d, z3.d, z16.d
+; CHECK-NEXT:    orr z16.d, z18.d, z16.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    orr z2.d, z4.d, z6.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
-; CHECK-NEXT:    orr z1.d, z5.d, z3.d
+; CHECK-NEXT:    orr z1.d, z16.d, z7.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z2.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -464,37 +453,32 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) #0 {
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI22_3]
 ; CHECK-NEXT:    adrp x8, .LCPI22_6
 ; CHECK-NEXT:    ldr q5, [x9, :lo12:.LCPI22_4]
-; CHECK-NEXT:    adrp x9, .LCPI22_7
+; CHECK-NEXT:    movprfx z7, z0
+; CHECK-NEXT:    lsr z7.d, p0/m, z7.d, z1.d
 ; CHECK-NEXT:    movprfx z16, z0
 ; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z2.d
-; CHECK-NEXT:    and z3.d, z16.d, z3.d
-; CHECK-NEXT:    ldr q7, [x8, :lo12:.LCPI22_6]
-; CHECK-NEXT:    adrp x8, .LCPI22_8
-; CHECK-NEXT:    movprfx z16, z0
-; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z4.d
 ; CHECK-NEXT:    ldr q6, [x10, :lo12:.LCPI22_5]
-; CHECK-NEXT:    and z5.d, z16.d, z5.d
-; CHECK-NEXT:    movprfx z16, z0
-; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z6.d
-; CHECK-NEXT:    ldr q18, [x8, :lo12:.LCPI22_8]
-; CHECK-NEXT:    adrp x8, .LCPI22_9
-; CHECK-NEXT:    and z7.d, z16.d, z7.d
-; CHECK-NEXT:    ldr q17, [x9, :lo12:.LCPI22_7]
-; CHECK-NEXT:    orr z5.d, z7.d, z5.d
+; CHECK-NEXT:    ldr q17, [x8, :lo12:.LCPI22_6]
+; CHECK-NEXT:    and z16.d, z16.d, z3.d
+; CHECK-NEXT:    orr z7.d, z16.d, z7.d
 ; CHECK-NEXT:    movprfx z16, z0
-; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z1.d
-; CHECK-NEXT:    ldr q7, [x8, :lo12:.LCPI22_9]
-; CHECK-NEXT:    lslr z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT:    lslr z4.d, p0/m, z4.d, z0.d
-; CHECK-NEXT:    lslr z2.d, p0/m, z2.d, z0.d
-; CHECK-NEXT:    and z6.d, z6.d, z17.d
-; CHECK-NEXT:    and z4.d, z4.d, z18.d
+; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z4.d
+; CHECK-NEXT:    movprfx z18, z0
+; CHECK-NEXT:    lsr z18.d, p0/m, z18.d, z6.d
+; CHECK-NEXT:    and z16.d, z16.d, z5.d
+; CHECK-NEXT:    and z18.d, z18.d, z17.d
+; CHECK-NEXT:    and z17.d, z0.d, z17.d
+; CHECK-NEXT:    and z5.d, z0.d, z5.d
+; CHECK-NEXT:    lslr z6.d, p0/m, z6.d, z17.d
+; CHECK-NEXT:    lslr z4.d, p0/m, z4.d, z5.d
+; CHECK-NEXT:    and z3.d, z0.d, z3.d
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    and z1.d, z2.d, z7.d
-; CHECK-NEXT:    orr z3.d, z3.d, z16.d
+; CHECK-NEXT:    orr z16.d, z18.d, z16.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    orr z2.d, z4.d, z6.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
-; CHECK-NEXT:    orr z1.d, z5.d, z3.d
+; CHECK-NEXT:    orr z1.d, z16.d, z7.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z2.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -508,79 +492,72 @@ define void @bswap_v4i64(<4 x i64>* %a) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI23_0
 ; CHECK-NEXT:    adrp x9, .LCPI23_1
-; CHECK-NEXT:    adrp x10, .LCPI23_2
+; CHECK-NEXT:    adrp x10, .LCPI23_3
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI23_0]
-; CHECK-NEXT:    adrp x8, .LCPI23_4
-; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI23_1]
-; CHECK-NEXT:    adrp x9, .LCPI23_3
-; CHECK-NEXT:    ldr q4, [x10, :lo12:.LCPI23_2]
-; CHECK-NEXT:    adrp x10, .LCPI23_5
-; CHECK-NEXT:    ldr q7, [x8, :lo12:.LCPI23_4]
-; CHECK-NEXT:    adrp x8, .LCPI23_6
-; CHECK-NEXT:    ldr q5, [x9, :lo12:.LCPI23_3]
-; CHECK-NEXT:    adrp x9, .LCPI23_7
-; CHECK-NEXT:    movprfx z6, z1
-; CHECK-NEXT:    lsr z6.d, p0/m, z6.d, z2.d
-; CHECK-NEXT:    movprfx z17, z1
-; CHECK-NEXT:    lsr z17.d, p0/m, z17.d, z3.d
-; CHECK-NEXT:    ldr q18, [x8, :lo12:.LCPI23_6]
-; CHECK-NEXT:    adrp x8, .LCPI23_8
-; CHECK-NEXT:    and z6.d, z6.d, z4.d
-; CHECK-NEXT:    ldr q16, [x10, :lo12:.LCPI23_5]
-; CHECK-NEXT:    orr z6.d, z6.d, z17.d
-; CHECK-NEXT:    ldr q17, [x9, :lo12:.LCPI23_7]
-; CHECK-NEXT:    ldr q21, [x8, :lo12:.LCPI23_8]
-; CHECK-NEXT:    adrp x8, .LCPI23_9
-; CHECK-NEXT:    movprfx z19, z1
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI23_0]
+; CHECK-NEXT:    adrp x8, .LCPI23_2
+; CHECK-NEXT:    ldr q3, [x9, :lo12:.LCPI23_1]
+; CHECK-NEXT:    adrp x9, .LCPI23_4
+; CHECK-NEXT:    ldr q5, [x10, :lo12:.LCPI23_3]
+; CHECK-NEXT:    adrp x10, .LCPI23_6
+; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI23_2]
+; CHECK-NEXT:    adrp x8, .LCPI23_5
+; CHECK-NEXT:    ldr q6, [x9, :lo12:.LCPI23_4]
+; CHECK-NEXT:    movprfx z16, z2
+; CHECK-NEXT:    lsr z16.d, p0/m, z16.d, z3.d
+; CHECK-NEXT:    ldr q17, [x10, :lo12:.LCPI23_6]
+; CHECK-NEXT:    movprfx z18, z2
+; CHECK-NEXT:    lsr z18.d, p0/m, z18.d, z0.d
+; CHECK-NEXT:    ldr q7, [x8, :lo12:.LCPI23_5]
+; CHECK-NEXT:    movprfx z19, z2
 ; CHECK-NEXT:    lsr z19.d, p0/m, z19.d, z5.d
-; CHECK-NEXT:    movprfx z20, z1
-; CHECK-NEXT:    lsr z20.d, p0/m, z20.d, z16.d
-; CHECK-NEXT:    and z19.d, z19.d, z7.d
-; CHECK-NEXT:    and z20.d, z20.d, z18.d
-; CHECK-NEXT:    orr z19.d, z20.d, z19.d
-; CHECK-NEXT:    movprfx z20, z1
-; CHECK-NEXT:    lsl z20.d, p0/m, z20.d, z16.d
-; CHECK-NEXT:    movprfx z22, z1
-; CHECK-NEXT:    lsl z22.d, p0/m, z22.d, z5.d
-; CHECK-NEXT:    ldr q23, [x8, :lo12:.LCPI23_9]
+; CHECK-NEXT:    movprfx z20, z2
+; CHECK-NEXT:    lsr z20.d, p0/m, z20.d, z7.d
+; CHECK-NEXT:    and z16.d, z16.d, z4.d
+; CHECK-NEXT:    and z19.d, z19.d, z6.d
 ; CHECK-NEXT:    and z20.d, z20.d, z17.d
-; CHECK-NEXT:    and z22.d, z22.d, z21.d
-; CHECK-NEXT:    orr z6.d, z19.d, z6.d
-; CHECK-NEXT:    orr z19.d, z22.d, z20.d
+; CHECK-NEXT:    orr z16.d, z16.d, z18.d
+; CHECK-NEXT:    orr z18.d, z20.d, z19.d
+; CHECK-NEXT:    and z19.d, z2.d, z17.d
+; CHECK-NEXT:    and z20.d, z2.d, z6.d
+; CHECK-NEXT:    lsl z19.d, p0/m, z19.d, z7.d
+; CHECK-NEXT:    lsl z20.d, p0/m, z20.d, z5.d
+; CHECK-NEXT:    orr z16.d, z18.d, z16.d
+; CHECK-NEXT:    orr z18.d, z20.d, z19.d
+; CHECK-NEXT:    movprfx z19, z2
+; CHECK-NEXT:    lsl z19.d, p0/m, z19.d, z0.d
+; CHECK-NEXT:    and z2.d, z2.d, z4.d
+; CHECK-NEXT:    movprfx z20, z1
+; CHECK-NEXT:    lsr z20.d, p0/m, z20.d, z3.d
+; CHECK-NEXT:    lsl z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT:    movprfx z21, z1
+; CHECK-NEXT:    lsr z21.d, p0/m, z21.d, z0.d
+; CHECK-NEXT:    and z20.d, z20.d, z4.d
+; CHECK-NEXT:    orr z2.d, z19.d, z2.d
+; CHECK-NEXT:    orr z19.d, z20.d, z21.d
 ; CHECK-NEXT:    movprfx z20, z1
-; CHECK-NEXT:    lsl z20.d, p0/m, z20.d, z3.d
-; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    movprfx z22, z0
-; CHECK-NEXT:    lsr z22.d, p0/m, z22.d, z2.d
-; CHECK-NEXT:    and z1.d, z1.d, z23.d
-; CHECK-NEXT:    and z4.d, z22.d, z4.d
-; CHECK-NEXT:    movprfx z22, z0
-; CHECK-NEXT:    lsr z22.d, p0/m, z22.d, z3.d
-; CHECK-NEXT:    orr z1.d, z20.d, z1.d
-; CHECK-NEXT:    orr z4.d, z4.d, z22.d
-; CHECK-NEXT:    movprfx z20, z0
 ; CHECK-NEXT:    lsr z20.d, p0/m, z20.d, z5.d
-; CHECK-NEXT:    movprfx z22, z0
-; CHECK-NEXT:    lsr z22.d, p0/m, z22.d, z16.d
-; CHECK-NEXT:    lslr z16.d, p0/m, z16.d, z0.d
-; CHECK-NEXT:    lslr z5.d, p0/m, z5.d, z0.d
-; CHECK-NEXT:    lslr z2.d, p0/m, z2.d, z0.d
-; CHECK-NEXT:    and z7.d, z20.d, z7.d
-; CHECK-NEXT:    and z18.d, z22.d, z18.d
-; CHECK-NEXT:    and z16.d, z16.d, z17.d
-; CHECK-NEXT:    and z5.d, z5.d, z21.d
-; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z3.d
-; CHECK-NEXT:    and z2.d, z2.d, z23.d
-; CHECK-NEXT:    orr z7.d, z18.d, z7.d
-; CHECK-NEXT:    orr z3.d, z5.d, z16.d
-; CHECK-NEXT:    orr z0.d, z0.d, z2.d
-; CHECK-NEXT:    orr z2.d, z7.d, z4.d
+; CHECK-NEXT:    movprfx z21, z1
+; CHECK-NEXT:    lsr z21.d, p0/m, z21.d, z7.d
+; CHECK-NEXT:    and z20.d, z20.d, z6.d
+; CHECK-NEXT:    and z21.d, z21.d, z17.d
+; CHECK-NEXT:    and z17.d, z1.d, z17.d
+; CHECK-NEXT:    and z6.d, z1.d, z6.d
+; CHECK-NEXT:    lslr z7.d, p0/m, z7.d, z17.d
+; CHECK-NEXT:    lslr z5.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    lslr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    orr z20.d, z21.d, z20.d
+; CHECK-NEXT:    and z4.d, z1.d, z4.d
+; CHECK-NEXT:    movprfx z1, z4
+; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
+; CHECK-NEXT:    orr z3.d, z5.d, z7.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    orr z1.d, z20.d, z19.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z3.d
-; CHECK-NEXT:    orr z1.d, z1.d, z19.d
-; CHECK-NEXT:    orr z0.d, z0.d, z2.d
-; CHECK-NEXT:    orr z1.d, z1.d, z6.d
+; CHECK-NEXT:    orr z2.d, z2.d, z18.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    orr z1.d, z2.d, z16.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a

diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
index e8673b91df8cc..010f22df64fdc 100644
--- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
@@ -56,11 +56,11 @@ define i32 @load_i32_by_i8_bswap(i32* %arg) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;
@@ -230,22 +230,21 @@ define i32 @load_i32_by_i16_i8(i32* %arg) {
 define i64 @load_i64_by_i8_bswap(i64* %arg) {
 ; CHECK-LABEL: load_i64_by_i8_bswap:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    ldr r1, [r0]
 ; CHECK-NEXT:    mov r12, #65280
 ; CHECK-NEXT:    ldr r0, [r0, #4]
-; CHECK-NEXT:    mov lr, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r3, r12, r0, lsr #8
-; CHECK-NEXT:    and r2, lr, r0, lsl #8
 ; CHECK-NEXT:    orr r3, r3, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    and r2, r12, r1, lsr #8
 ; CHECK-NEXT:    orr r0, r0, r3
-; CHECK-NEXT:    and r3, lr, r1, lsl #8
+; CHECK-NEXT:    and r3, r1, #65280
 ; CHECK-NEXT:    orr r2, r2, r1, lsr #24
-; CHECK-NEXT:    orr r1, r3, r1, lsl #24
+; CHECK-NEXT:    lsl r1, r1, #24
+; CHECK-NEXT:    orr r1, r1, r3, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r2
-; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
 ;
 ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -389,11 +388,11 @@ define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r0, [r0, #1]
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;
@@ -447,11 +446,11 @@ define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r0, [r0, #-4]
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;
@@ -603,11 +602,11 @@ define i32 @load_i32_by_bswap_i16(i32* %arg) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;
@@ -684,12 +683,12 @@ define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    add r0, r0, r1
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
 ; CHECK-NEXT:    ldr r0, [r0, #12]
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;
@@ -750,12 +749,12 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    add r0, r1, r0
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
 ; CHECK-NEXT:    ldr r0, [r0, #13]
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;

diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll
index 1a4153f8355fa..1720f41895639 100644
--- a/llvm/test/CodeGen/ARM/load-combine.ll
+++ b/llvm/test/CodeGen/ARM/load-combine.ll
@@ -123,11 +123,11 @@ define i32 @load_i32_by_i8_bswap(i32* %arg) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;
@@ -243,22 +243,21 @@ define i64 @load_i64_by_i8(i64* %arg) {
 define i64 @load_i64_by_i8_bswap(i64* %arg) {
 ; CHECK-LABEL: load_i64_by_i8_bswap:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    ldr r1, [r0]
 ; CHECK-NEXT:    mov r12, #65280
 ; CHECK-NEXT:    ldr r0, [r0, #4]
-; CHECK-NEXT:    mov lr, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r3, r12, r0, lsr #8
-; CHECK-NEXT:    and r2, lr, r0, lsl #8
 ; CHECK-NEXT:    orr r3, r3, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    and r2, r12, r1, lsr #8
 ; CHECK-NEXT:    orr r0, r0, r3
-; CHECK-NEXT:    and r3, lr, r1, lsl #8
+; CHECK-NEXT:    and r3, r1, #65280
 ; CHECK-NEXT:    orr r2, r2, r1, lsr #24
-; CHECK-NEXT:    orr r1, r3, r1, lsl #24
+; CHECK-NEXT:    lsl r1, r1, #24
+; CHECK-NEXT:    orr r1, r1, r3, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r2
-; CHECK-NEXT:    pop {r11, lr}
 ; CHECK-NEXT:    mov pc, lr
 ;
 ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -425,11 +424,11 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r0, [r0, #1]
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;
@@ -482,11 +481,11 @@ define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r0, [r0, #-4]
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;
@@ -541,11 +540,11 @@ define i32 @load_i32_by_bswap_i16(i32* %arg) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r0, [r0]
 ; CHECK-NEXT:    mov r1, #65280
-; CHECK-NEXT:    mov r2, #16711680
+; CHECK-NEXT:    and r2, r0, #65280
 ; CHECK-NEXT:    and r1, r1, r0, lsr #8
-; CHECK-NEXT:    and r2, r2, r0, lsl #8
 ; CHECK-NEXT:    orr r1, r1, r0, lsr #24
-; CHECK-NEXT:    orr r0, r2, r0, lsl #24
+; CHECK-NEXT:    lsl r0, r0, #24
+; CHECK-NEXT:    orr r0, r0, r2, lsl #8
 ; CHECK-NEXT:    orr r0, r0, r1
 ; CHECK-NEXT:    mov pc, lr
 ;

diff --git a/llvm/test/CodeGen/Mips/bswap.ll b/llvm/test/CodeGen/Mips/bswap.ll
index 0e8fd2ae4ffef..ace6c3d6021d2 100644
--- a/llvm/test/CodeGen/Mips/bswap.ll
+++ b/llvm/test/CodeGen/Mips/bswap.ll
@@ -23,16 +23,15 @@ entry:
 
 ; MIPS16-LABEL: bswap32:
 ; MIPS16-DAG: srl $[[R0:[0-9]+]], $4, 8
+; MIPS16-DAG: li  $[[R4:[0-9]+]], 65280
+; MIPS16-DAG: and $[[R0]], $[[R4]]
 ; MIPS16-DAG: srl $[[R1:[0-9]+]], $4, 24
-; MIPS16-DAG: sll $[[R2:[0-9]+]], $4, 8
+; MIPS16-DAG: or $[[R1]], $[[R0]]
+; MIPS16-DAG: and $[[R4]], $4
+; MIPS16-DAG: sll $[[R2:[0-9]+]], $[[R4]], 8
 ; MIPS16-DAG: sll $[[R3:[0-9]+]], $4, 24
-; MIPS16-DAG: li  $[[R4:[0-9]+]], 65280
-; MIPS16-DAG: and $[[R4]], $[[R0]]
-; MIPS16-DAG: or  $[[R1]], $[[R4]]
-; MIPS16-DAG: lw  $[[R7:[0-9]+]], $CPI
-; MIPS16-DAG: and $[[R7]], $[[R2]]
-; MIPS16-DAG: or  $[[R3]], $[[R7]]
-; MIPS16-DAG: or  $[[R3]], $[[R1]]
+; MIPS16-DAG: or $[[R3]], $[[R2]]
+; MIPS16-DAG: or $[[R3]], $[[R1]]
 
   %or.3 = call i32 @llvm.bswap.i32(i32 %x)
   ret i32 %or.3
@@ -58,23 +57,22 @@ entry:
 
 ; MIPS16-LABEL: bswap64:
 ; MIPS16-DAG: srl $[[R0:[0-9]+]], $5, 8
-; MIPS16-DAG: srl $[[R1:[0-9]+]], $5, 24
-; MIPS16-DAG: sll $[[R2:[0-9]+]], $5, 8
-; MIPS16-DAG: sll $[[R3:[0-9]+]], $5, 24
 ; MIPS16-DAG: li  $[[R4:[0-9]+]], 65280
 ; MIPS16-DAG: and $[[R0]], $[[R4]]
+; MIPS16-DAG: srl $[[R1:[0-9]+]], $5, 24
 ; MIPS16-DAG: or  $[[R1]], $[[R0]]
-; MIPS16-DAG: lw  $[[R7:[0-9]+]], 1f
-; MIPS16-DAG: and $[[R2]], $[[R7]]
-; MIPS16-DAG: or  $[[R3]], $[[R2]]
-; MIPS16-DAG: or  $[[R3]], $[[R1]]
+; MIPS16-DAG: sll $[[R3:[0-9]+]], $5, 24
+; MIPS16-DAG: and $5, $[[R4]]
+; MIPS16-DAG: sll $[[R2:[0-9]+]], $5, 8
+; MIPS16-DAG: or $[[R0]], $[[R3]]
+; MIPS16-DAG: or $[[R0]], $[[R1]]
 ; MIPS16-DAG: srl $[[R0:[0-9]+]], $4, 8
-; MIPS16-DAG: srl $[[R1:[0-9]+]], $4, 24
-; MIPS16-DAG: sll $[[R2:[0-9]+]], $4, 8
-; MIPS16-DAG: sll $[[R3:[0-9]+]], $4, 24
 ; MIPS16-DAG: and $[[R0]], $[[R4]]
+; MIPS16-DAG: srl $[[R1:[0-9]+]], $4, 24
 ; MIPS16-DAG: or  $[[R1]], $[[R0]]
-; MIPS16-DAG: and $[[R2]], $[[R7]]
+; MIPS16-DAG: and $[[R4]], $4
+; MIPS16-DAG: sll $[[R2:[0-9]+]], $[[R4]], 8
+; MIPS16-DAG: sll $[[R3:[0-9]+]], $4, 24
 ; MIPS16-DAG: or  $[[R3]], $[[R2]]
 ; MIPS16-DAG: or  $[[R3]], $[[R1]]
 

diff --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
index bcf367a2b06cc..3db0ed8c95895 100644
--- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
@@ -59,11 +59,10 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a2, 16
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a2, a0, 24
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 8
-; RV32I-NEXT:    lui a3, 4080
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a0, a0, a1
@@ -75,11 +74,10 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 16
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a2, a0, 24
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    slli a2, a0, 8
-; RV64I-NEXT:    lui a3, 4080
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srliw a3, a0, 24
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
@@ -108,18 +106,17 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    slli a4, a1, 8
-; RV32I-NEXT:    lui a5, 4080
-; RV32I-NEXT:    and a4, a4, a5
+; RV32I-NEXT:    and a4, a1, a3
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    or a2, a1, a2
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a3, a0, 8
-; RV32I-NEXT:    and a3, a3, a5
+; RV32I-NEXT:    srli a4, a0, 24
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a1, a0, a1
@@ -128,34 +125,31 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
 ;
 ; RV64I-LABEL: test_bswap_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a1, a0, 24
-; RV64I-NEXT:    li a2, 255
-; RV64I-NEXT:    slli a3, a2, 40
-; RV64I-NEXT:    and a1, a1, a3
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    slli a3, a0, 40
-; RV64I-NEXT:    slli a2, a2, 48
-; RV64I-NEXT:    and a2, a3, a2
-; RV64I-NEXT:    slli a3, a0, 56
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    srli a2, a0, 40
-; RV64I-NEXT:    lui a3, 16
-; RV64I-NEXT:    addiw a3, a3, -256
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    addiw a2, a2, -256
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a3, a0, 56
-; RV64I-NEXT:    or a2, a2, a3
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    srli a3, a0, 24
 ; RV64I-NEXT:    lui a4, 4080
 ; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    srliw a0, a0, 24
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a5, a0, 8
+; RV64I-NEXT:    srliw a5, a5, 24
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    and a3, a0, a4
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    slli a2, a2, 40
+; RV64I-NEXT:    slli a0, a0, 56
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZB-LABEL: test_bswap_i64:
@@ -402,11 +396,10 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a2, 16
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a2, a0, 24
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 8
-; RV32I-NEXT:    lui a3, 4080
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a0, a0, a1
@@ -439,11 +432,10 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 16
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a2, a0, 24
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    slli a2, a0, 8
-; RV64I-NEXT:    lui a3, 4080
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srliw a3, a0, 24
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
@@ -550,9 +542,8 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    slli a4, a1, 8
-; RV32I-NEXT:    lui a5, 4080
-; RV32I-NEXT:    and a4, a4, a5
+; RV32I-NEXT:    and a4, a1, a3
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    or a1, a1, a2
@@ -564,25 +555,25 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    slli a1, a1, 4
 ; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    srli a2, a1, 2
-; RV32I-NEXT:    lui a6, 209715
-; RV32I-NEXT:    addi a6, a6, 819
-; RV32I-NEXT:    and a2, a2, a6
-; RV32I-NEXT:    and a1, a1, a6
+; RV32I-NEXT:    lui a5, 209715
+; RV32I-NEXT:    addi a5, a5, 819
+; RV32I-NEXT:    and a2, a2, a5
+; RV32I-NEXT:    and a1, a1, a5
 ; RV32I-NEXT:    slli a1, a1, 2
 ; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    srli a2, a1, 1
-; RV32I-NEXT:    lui a7, 349525
-; RV32I-NEXT:    addi a7, a7, 1365
-; RV32I-NEXT:    and a2, a2, a7
-; RV32I-NEXT:    and a1, a1, a7
+; RV32I-NEXT:    lui a6, 349525
+; RV32I-NEXT:    addi a6, a6, 1365
+; RV32I-NEXT:    and a2, a2, a6
+; RV32I-NEXT:    and a1, a1, a6
 ; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    or a2, a2, a1
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a3, a0, 8
-; RV32I-NEXT:    and a3, a3, a5
+; RV32I-NEXT:    srli a7, a0, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a0, a0, a1
@@ -592,13 +583,13 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    and a1, a1, a6
-; RV32I-NEXT:    and a0, a0, a6
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a0, a0, a5
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    and a1, a1, a7
-; RV32I-NEXT:    and a0, a0, a7
+; RV32I-NEXT:    and a1, a1, a6
+; RV32I-NEXT:    and a0, a0, a6
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a1, a1, a0
 ; RV32I-NEXT:    mv a0, a2
@@ -606,39 +597,36 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ;
 ; RV64I-LABEL: test_bitreverse_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a1, a0, 24
-; RV64I-NEXT:    li a2, 255
-; RV64I-NEXT:    slli a3, a2, 40
-; RV64I-NEXT:    and a1, a1, a3
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    slli a3, a0, 40
-; RV64I-NEXT:    slli a2, a2, 48
-; RV64I-NEXT:    and a2, a3, a2
-; RV64I-NEXT:    slli a3, a0, 56
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    srli a2, a0, 40
-; RV64I-NEXT:    lui a3, 16
-; RV64I-NEXT:    addiw a3, a3, -256
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    addiw a2, a2, -256
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a3, a0, 56
-; RV64I-NEXT:    or a2, a2, a3
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    srli a3, a0, 24
 ; RV64I-NEXT:    lui a4, 4080
 ; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    srliw a0, a0, 24
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lui a3, %hi(.LCPI6_0)
-; RV64I-NEXT:    ld a3, %lo(.LCPI6_0)(a3)
+; RV64I-NEXT:    srli a5, a0, 8
+; RV64I-NEXT:    srliw a5, a5, 24
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    and a3, a0, a4
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    slli a2, a2, 40
+; RV64I-NEXT:    slli a0, a0, 56
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    lui a2, %hi(.LCPI6_0)
+; RV64I-NEXT:    ld a2, %lo(.LCPI6_0)(a2)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    and a1, a1, a3
-; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    lui a2, %hi(.LCPI6_1)
 ; RV64I-NEXT:    ld a2, %lo(.LCPI6_1)(a2)
 ; RV64I-NEXT:    slli a0, a0, 4

diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index ad3548ec78f67..6d10db62d17b3 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -769,11 +769,10 @@ define i32 @bswap_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a2, 16
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a2, a0, 24
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 8
-; RV32I-NEXT:    lui a3, 4080
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a0, a0, a1
@@ -798,18 +797,17 @@ define i64 @bswap_i64(i64 %a) {
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    slli a4, a1, 8
-; RV32I-NEXT:    lui a5, 4080
-; RV32I-NEXT:    and a4, a4, a5
+; RV32I-NEXT:    and a4, a1, a3
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    or a2, a1, a2
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a3, a0, 8
-; RV32I-NEXT:    and a3, a3, a5
+; RV32I-NEXT:    srli a4, a0, 24
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a1, a0, a1

diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index d4ab229d55740..3193a4a7af0b5 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -952,11 +952,10 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 16
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a2, a0, 24
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    slli a2, a0, 8
-; RV64I-NEXT:    lui a3, 4080
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srliw a3, a0, 24
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
@@ -979,11 +978,10 @@ define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind {
 ; RV64I-NEXT:    lui a3, 16
 ; RV64I-NEXT:    addiw a3, a3, -256
 ; RV64I-NEXT:    and a2, a2, a3
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    slli a3, a0, 8
-; RV64I-NEXT:    lui a4, 4080
-; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    or a2, a2, a4
+; RV64I-NEXT:    and a3, a0, a3
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a2
@@ -1006,34 +1004,31 @@ declare i64 @llvm.bswap.i64(i64)
 define i64 @bswap_i64(i64 %a) {
 ; RV64I-LABEL: bswap_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a1, a0, 24
-; RV64I-NEXT:    li a2, 255
-; RV64I-NEXT:    slli a3, a2, 40
-; RV64I-NEXT:    and a1, a1, a3
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    slli a3, a0, 40
-; RV64I-NEXT:    slli a2, a2, 48
-; RV64I-NEXT:    and a2, a3, a2
-; RV64I-NEXT:    slli a3, a0, 56
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    srli a2, a0, 40
-; RV64I-NEXT:    lui a3, 16
-; RV64I-NEXT:    addiw a3, a3, -256
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    addiw a2, a2, -256
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a3, a0, 56
-; RV64I-NEXT:    or a2, a2, a3
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    srli a3, a0, 24
 ; RV64I-NEXT:    lui a4, 4080
 ; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    srliw a0, a0, 24
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a5, a0, 8
+; RV64I-NEXT:    srliw a5, a5, 24
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    and a3, a0, a4
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    slli a2, a2, 40
+; RV64I-NEXT:    slli a0, a0, 56
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: bswap_i64:

diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
index d73b05efe93dd..ab362af991637 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
@@ -579,9 +579,8 @@ define <vscale x 1 x i32> @bitreverse_nxv1i32(<vscale x 1 x i32> %va) {
 ; RV32-NEXT:    vand.vx v9, v9, a0
 ; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    vsll.vi v10, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v10, v10, a0
+; RV32-NEXT:    vand.vx v10, v8, a0
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -617,9 +616,8 @@ define <vscale x 1 x i32> @bitreverse_nxv1i32(<vscale x 1 x i32> %va) {
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -660,9 +658,8 @@ define <vscale x 2 x i32> @bitreverse_nxv2i32(<vscale x 2 x i32> %va) {
 ; RV32-NEXT:    vand.vx v9, v9, a0
 ; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    vsll.vi v10, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v10, v10, a0
+; RV32-NEXT:    vand.vx v10, v8, a0
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -698,9 +695,8 @@ define <vscale x 2 x i32> @bitreverse_nxv2i32(<vscale x 2 x i32> %va) {
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -741,9 +737,8 @@ define <vscale x 4 x i32> @bitreverse_nxv4i32(<vscale x 4 x i32> %va) {
 ; RV32-NEXT:    vand.vx v10, v10, a0
 ; RV32-NEXT:    vsrl.vi v12, v8, 24
 ; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vsll.vi v12, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v12, v12, a0
+; RV32-NEXT:    vand.vx v12, v8, a0
+; RV32-NEXT:    vsll.vi v12, v12, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v10
@@ -779,9 +774,8 @@ define <vscale x 4 x i32> @bitreverse_nxv4i32(<vscale x 4 x i32> %va) {
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsrl.vi v12, v8, 24
 ; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    vsll.vi v12, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vsll.vi v12, v12, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vor.vv v8, v8, v10
@@ -822,9 +816,8 @@ define <vscale x 8 x i32> @bitreverse_nxv8i32(<vscale x 8 x i32> %va) {
 ; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vsll.vi v16, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v16, v16, a0
+; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vsll.vi v16, v16, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v8, v12
@@ -860,9 +853,8 @@ define <vscale x 8 x i32> @bitreverse_nxv8i32(<vscale x 8 x i32> %va) {
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsrl.vi v16, v8, 24
 ; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    vsll.vi v16, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vsll.vi v16, v16, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vor.vv v8, v8, v12
@@ -903,9 +895,8 @@ define <vscale x 16 x i32> @bitreverse_nxv16i32(<vscale x 16 x i32> %va) {
 ; RV32-NEXT:    vand.vx v16, v16, a0
 ; RV32-NEXT:    vsrl.vi v24, v8, 24
 ; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    vsll.vi v24, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v24, v24, a0
+; RV32-NEXT:    vand.vx v24, v8, a0
+; RV32-NEXT:    vsll.vi v24, v24, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -941,9 +932,8 @@ define <vscale x 16 x i32> @bitreverse_nxv16i32(<vscale x 16 x i32> %va) {
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsrl.vi v24, v8, 24
 ; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsll.vi v24, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vand.vx v24, v8, a0
+; RV64-NEXT:    vsll.vi v24, v24, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -982,66 +972,58 @@ define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    lui a0, 1044480
 ; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4080
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw zero, 8(sp)
-; RV32-NEXT:    li a1, 255
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a1, a1, -256
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsrl.vx v10, v8, a3
-; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v9, v8, a0
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    vsrl.vx v10, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v10, v10, a2
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v10, (a1), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 24
-; RV32-NEXT:    vand.vx v11, v11, a0
+; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    addi a3, sp, 8
+; RV32-NEXT:    vlse64.v v11, (a3), zero
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v10, v10, a4
 ; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v10, v10, v11
-; RV32-NEXT:    vlse64.v v11, (a1), zero
+; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    vor.vv v10, v12, v10
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a2
-; RV32-NEXT:    vsll.vx v12, v8, a3
-; RV32-NEXT:    vand.vv v11, v12, v11
-; RV32-NEXT:    vlse64.v v12, (a1), zero
-; RV32-NEXT:    vor.vv v10, v10, v11
-; RV32-NEXT:    vlse64.v v11, (a1), zero
-; RV32-NEXT:    vsll.vi v13, v8, 8
-; RV32-NEXT:    vand.vv v12, v13, v12
-; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vx v10, v8, a0
+; RV32-NEXT:    vand.vx v12, v8, a2
+; RV32-NEXT:    vsll.vx v12, v12, a1
+; RV32-NEXT:    vor.vv v10, v10, v12
+; RV32-NEXT:    vand.vx v12, v8, a4
+; RV32-NEXT:    vsll.vi v12, v12, 24
 ; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vlse64.v v11, (a1), zero
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vlse64.v v11, (a3), zero
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vlse64.v v10, (a1), zero
+; RV32-NEXT:    vlse64.v v10, (a3), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
 ; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vlse64.v v10, (a1), zero
+; RV32-NEXT:    vlse64.v v10, (a3), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
@@ -1064,25 +1046,22 @@ define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
-; RV64-NEXT:    lui a2, 4080
-; RV64-NEXT:    vand.vx v10, v10, a2
+; RV64-NEXT:    lui a3, 4080
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vsrl.vi v11, v8, 8
-; RV64-NEXT:    li a2, 255
-; RV64-NEXT:    slli a3, a2, 24
-; RV64-NEXT:    vand.vx v11, v11, a3
+; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    slli a4, a4, 24
+; RV64-NEXT:    vand.vx v11, v11, a4
 ; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    vand.vx v10, v10, a3
-; RV64-NEXT:    vsll.vi v11, v8, 24
-; RV64-NEXT:    slli a3, a2, 40
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
+; RV64-NEXT:    vand.vx v10, v8, a3
+; RV64-NEXT:    vsll.vi v10, v10, 24
+; RV64-NEXT:    vand.vx v11, v8, a4
+; RV64-NEXT:    vsll.vi v11, v11, 8
+; RV64-NEXT:    vor.vv v10, v10, v11
 ; RV64-NEXT:    vsll.vx v11, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
-; RV64-NEXT:    slli a0, a2, 48
-; RV64-NEXT:    vand.vx v8, v8, a0
 ; RV64-NEXT:    vor.vv v8, v11, v8
 ; RV64-NEXT:    lui a0, %hi(.LCPI18_0)
 ; RV64-NEXT:    ld a0, %lo(.LCPI18_0)(a0)
@@ -1121,66 +1100,58 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    lui a0, 1044480
 ; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4080
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw zero, 8(sp)
-; RV32-NEXT:    li a1, 255
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a1, a1, -256
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v10, v8, a0
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    vsrl.vx v12, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v12, v12, a2
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v12, (a1), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 24
-; RV32-NEXT:    vand.vx v14, v14, a0
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    addi a3, sp, 8
+; RV32-NEXT:    vlse64.v v14, (a3), zero
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v12, v12, a4
 ; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v12, v12, v14
-; RV32-NEXT:    vlse64.v v14, (a1), zero
+; RV32-NEXT:    vand.vv v16, v16, v14
+; RV32-NEXT:    vor.vv v12, v16, v12
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a2
-; RV32-NEXT:    vsll.vx v16, v8, a3
-; RV32-NEXT:    vand.vv v14, v16, v14
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vor.vv v12, v12, v14
-; RV32-NEXT:    vlse64.v v14, (a1), zero
-; RV32-NEXT:    vsll.vi v18, v8, 8
-; RV32-NEXT:    vand.vv v16, v18, v16
-; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vx v12, v8, a0
+; RV32-NEXT:    vand.vx v16, v8, a2
+; RV32-NEXT:    vsll.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v12, v12, v16
+; RV32-NEXT:    vand.vx v16, v8, a4
+; RV32-NEXT:    vsll.vi v16, v16, 24
 ; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vlse64.v v14, (a1), zero
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vlse64.v v14, (a3), zero
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 4
 ; RV32-NEXT:    vand.vv v10, v10, v14
 ; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vlse64.v v12, (a1), zero
+; RV32-NEXT:    vlse64.v v12, (a3), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
 ; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vlse64.v v12, (a1), zero
+; RV32-NEXT:    vlse64.v v12, (a3), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
@@ -1203,25 +1174,22 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vor.vv v10, v12, v10
 ; RV64-NEXT:    vsrl.vi v12, v8, 24
-; RV64-NEXT:    lui a2, 4080
-; RV64-NEXT:    vand.vx v12, v12, a2
+; RV64-NEXT:    lui a3, 4080
+; RV64-NEXT:    vand.vx v12, v12, a3
 ; RV64-NEXT:    vsrl.vi v14, v8, 8
-; RV64-NEXT:    li a2, 255
-; RV64-NEXT:    slli a3, a2, 24
-; RV64-NEXT:    vand.vx v14, v14, a3
+; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    slli a4, a4, 24
+; RV64-NEXT:    vand.vx v14, v14, a4
 ; RV64-NEXT:    vor.vv v12, v14, v12
 ; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsll.vi v12, v8, 8
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    vand.vx v12, v12, a3
-; RV64-NEXT:    vsll.vi v14, v8, 24
-; RV64-NEXT:    slli a3, a2, 40
-; RV64-NEXT:    vand.vx v14, v14, a3
-; RV64-NEXT:    vor.vv v12, v14, v12
+; RV64-NEXT:    vand.vx v12, v8, a3
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v14, v8, a4
+; RV64-NEXT:    vsll.vi v14, v14, 8
+; RV64-NEXT:    vor.vv v12, v12, v14
 ; RV64-NEXT:    vsll.vx v14, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
-; RV64-NEXT:    slli a0, a2, 48
-; RV64-NEXT:    vand.vx v8, v8, a0
 ; RV64-NEXT:    vor.vv v8, v14, v8
 ; RV64-NEXT:    lui a0, %hi(.LCPI19_0)
 ; RV64-NEXT:    ld a0, %lo(.LCPI19_0)(a0)
@@ -1260,66 +1228,58 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    lui a0, 1044480
 ; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4080
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw zero, 8(sp)
-; RV32-NEXT:    li a1, 255
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a1, a1, -256
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vx v12, v8, a0
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v16, v16, a2
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 24
-; RV32-NEXT:    vand.vx v20, v20, a0
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    addi a3, sp, 8
+; RV32-NEXT:    vlse64.v v20, (a3), zero
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v16, v16, a4
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v16, v24, v16
-; RV32-NEXT:    vor.vv v16, v16, v20
-; RV32-NEXT:    vlse64.v v20, (a1), zero
+; RV32-NEXT:    vand.vv v24, v24, v20
+; RV32-NEXT:    vor.vv v16, v24, v16
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsll.vx v16, v8, a2
-; RV32-NEXT:    vsll.vx v24, v8, a3
-; RV32-NEXT:    vand.vv v20, v24, v20
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vor.vv v16, v16, v20
-; RV32-NEXT:    vlse64.v v20, (a1), zero
-; RV32-NEXT:    vsll.vi v28, v8, 8
-; RV32-NEXT:    vand.vv v24, v28, v24
-; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vx v16, v8, a0
+; RV32-NEXT:    vand.vx v24, v8, a2
+; RV32-NEXT:    vsll.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vand.vx v24, v8, a4
+; RV32-NEXT:    vsll.vi v24, v24, 24
 ; RV32-NEXT:    vand.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vlse64.v v20, (a1), zero
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vlse64.v v20, (a3), zero
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 4
 ; RV32-NEXT:    vand.vv v12, v12, v20
 ; RV32-NEXT:    vand.vv v8, v8, v20
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
 ; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
@@ -1342,25 +1302,22 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vor.vv v12, v16, v12
 ; RV64-NEXT:    vsrl.vi v16, v8, 24
-; RV64-NEXT:    lui a2, 4080
-; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    lui a3, 4080
+; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    vsrl.vi v20, v8, 8
-; RV64-NEXT:    li a2, 255
-; RV64-NEXT:    slli a3, a2, 24
-; RV64-NEXT:    vand.vx v20, v20, a3
+; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    slli a4, a4, 24
+; RV64-NEXT:    vand.vx v20, v20, a4
 ; RV64-NEXT:    vor.vv v16, v20, v16
 ; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsll.vi v16, v8, 8
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v20, v8, 24
-; RV64-NEXT:    slli a3, a2, 40
-; RV64-NEXT:    vand.vx v20, v20, a3
-; RV64-NEXT:    vor.vv v16, v20, v16
+; RV64-NEXT:    vand.vx v16, v8, a3
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v20, v8, a4
+; RV64-NEXT:    vsll.vi v20, v20, 8
+; RV64-NEXT:    vor.vv v16, v16, v20
 ; RV64-NEXT:    vsll.vx v20, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
-; RV64-NEXT:    slli a0, a2, 48
-; RV64-NEXT:    vand.vx v8, v8, a0
 ; RV64-NEXT:    vor.vv v8, v20, v8
 ; RV64-NEXT:    lui a0, %hi(.LCPI20_0)
 ; RV64-NEXT:    ld a0, %lo(.LCPI20_0)(a0)
@@ -1397,95 +1354,71 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    lui a0, 1044480
 ; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4080
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw zero, 8(sp)
-; RV32-NEXT:    li a1, 255
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a1, a1, -256
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a2, 61681
-; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    lui a2, 209715
-; RV32-NEXT:    addi a2, a2, 819
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    lui a2, 349525
-; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsrl.vx v0, v8, a2
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    vand.vx v0, v0, a0
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8re8.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsll.vx v24, v8, a3
-; RV32-NEXT:    vand.vv v16, v24, v16
-; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    vlse64.v v0, (a1), zero
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    lui a0, 209715
+; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    lui a0, 349525
+; RV32-NEXT:    addi a0, a0, 1365
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v16, v8, a0
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    vsrl.vx v24, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v24, a2
 ; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsll.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v0, v8, 24
+; RV32-NEXT:    addi a3, sp, 8
+; RV32-NEXT:    vlse64.v v24, (a3), zero
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vand.vx v0, v0, a4
+; RV32-NEXT:    vsrl.vi v16, v8, 8
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vl8re8.v v0, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    addi a5, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a2
+; RV32-NEXT:    vsll.vx v0, v0, a1
+; RV32-NEXT:    vsll.vx v16, v8, a0
+; RV32-NEXT:    vor.vv v0, v16, v0
+; RV32-NEXT:    vand.vv v16, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    vor.vv v8, v0, v8
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8re8.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8re8.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vsrl.vi v24, v8, 4
 ; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 2
 ; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 1
@@ -1494,7 +1427,7 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -1511,25 +1444,22 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vor.vv v16, v24, v16
 ; RV64-NEXT:    vsrl.vi v24, v8, 24
-; RV64-NEXT:    lui a2, 4080
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    lui a3, 4080
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsrl.vi v0, v8, 8
-; RV64-NEXT:    li a2, 255
-; RV64-NEXT:    slli a3, a2, 24
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    slli a4, a4, 24
+; RV64-NEXT:    vand.vx v0, v0, a4
 ; RV64-NEXT:    vor.vv v24, v0, v24
 ; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsll.vi v24, v8, 8
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    vand.vx v24, v24, a3
-; RV64-NEXT:    vsll.vi v0, v8, 24
-; RV64-NEXT:    slli a3, a2, 40
-; RV64-NEXT:    vand.vx v0, v0, a3
-; RV64-NEXT:    vor.vv v24, v0, v24
+; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v0, v8, a4
+; RV64-NEXT:    vsll.vi v0, v0, 8
+; RV64-NEXT:    vor.vv v24, v24, v0
 ; RV64-NEXT:    vsll.vx v0, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
-; RV64-NEXT:    slli a0, a2, 48
-; RV64-NEXT:    vand.vx v8, v8, a0
 ; RV64-NEXT:    vor.vv v8, v0, v8
 ; RV64-NEXT:    lui a0, %hi(.LCPI21_0)
 ; RV64-NEXT:    ld a0, %lo(.LCPI21_0)(a0)

diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
index c5a57a9efc5b8..fdef42fbb7248 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
@@ -90,9 +90,8 @@ define <vscale x 1 x i32> @bswap_nxv1i32(<vscale x 1 x i32> %va) {
 ; RV32-NEXT:    vand.vx v9, v9, a0
 ; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    vsll.vi v10, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v10, v10, a0
+; RV32-NEXT:    vand.vx v10, v8, a0
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -107,9 +106,8 @@ define <vscale x 1 x i32> @bswap_nxv1i32(<vscale x 1 x i32> %va) {
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -129,9 +127,8 @@ define <vscale x 2 x i32> @bswap_nxv2i32(<vscale x 2 x i32> %va) {
 ; RV32-NEXT:    vand.vx v9, v9, a0
 ; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    vsll.vi v10, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v10, v10, a0
+; RV32-NEXT:    vand.vx v10, v8, a0
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -146,9 +143,8 @@ define <vscale x 2 x i32> @bswap_nxv2i32(<vscale x 2 x i32> %va) {
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -168,9 +164,8 @@ define <vscale x 4 x i32> @bswap_nxv4i32(<vscale x 4 x i32> %va) {
 ; RV32-NEXT:    vand.vx v10, v10, a0
 ; RV32-NEXT:    vsrl.vi v12, v8, 24
 ; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vsll.vi v12, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v12, v12, a0
+; RV32-NEXT:    vand.vx v12, v8, a0
+; RV32-NEXT:    vsll.vi v12, v12, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v10
@@ -185,9 +180,8 @@ define <vscale x 4 x i32> @bswap_nxv4i32(<vscale x 4 x i32> %va) {
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsrl.vi v12, v8, 24
 ; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    vsll.vi v12, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vsll.vi v12, v12, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vor.vv v8, v8, v10
@@ -207,9 +201,8 @@ define <vscale x 8 x i32> @bswap_nxv8i32(<vscale x 8 x i32> %va) {
 ; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vsll.vi v16, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v16, v16, a0
+; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vsll.vi v16, v16, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v8, v12
@@ -224,9 +217,8 @@ define <vscale x 8 x i32> @bswap_nxv8i32(<vscale x 8 x i32> %va) {
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsrl.vi v16, v8, 24
 ; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    vsll.vi v16, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vsll.vi v16, v16, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vor.vv v8, v8, v12
@@ -246,9 +238,8 @@ define <vscale x 16 x i32> @bswap_nxv16i32(<vscale x 16 x i32> %va) {
 ; RV32-NEXT:    vand.vx v16, v16, a0
 ; RV32-NEXT:    vsrl.vi v24, v8, 24
 ; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    vsll.vi v24, v8, 8
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    vand.vx v24, v24, a0
+; RV32-NEXT:    vand.vx v24, v8, a0
+; RV32-NEXT:    vsll.vi v24, v24, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vor.vv v8, v8, v16
@@ -263,9 +254,8 @@ define <vscale x 16 x i32> @bswap_nxv16i32(<vscale x 16 x i32> %va) {
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsrl.vi v24, v8, 24
 ; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsll.vi v24, v8, 8
-; RV64-NEXT:    lui a0, 4080
-; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vand.vx v24, v8, a0
+; RV64-NEXT:    vsll.vi v24, v24, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -283,41 +273,33 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    lui a0, 1044480
 ; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw zero, 8(sp)
-; RV32-NEXT:    li a1, 255
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a1, a1, -256
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsrl.vx v10, v8, a3
-; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v9, v8, a0
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    vsrl.vx v10, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v10, v10, a2
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v10, (a1), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 24
-; RV32-NEXT:    vand.vx v11, v11, a0
+; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    addi a3, sp, 8
+; RV32-NEXT:    vlse64.v v11, (a3), zero
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vand.vx v10, v10, a3
 ; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v10, v10, v11
-; RV32-NEXT:    vlse64.v v11, (a1), zero
+; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    vor.vv v10, v12, v10
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a2
-; RV32-NEXT:    vsll.vx v12, v8, a3
-; RV32-NEXT:    vand.vv v11, v12, v11
-; RV32-NEXT:    vlse64.v v12, (a1), zero
-; RV32-NEXT:    vor.vv v10, v10, v11
-; RV32-NEXT:    vlse64.v v11, (a1), zero
-; RV32-NEXT:    vsll.vi v13, v8, 8
-; RV32-NEXT:    vand.vv v12, v13, v12
-; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vx v10, v8, a0
+; RV32-NEXT:    vand.vx v12, v8, a2
+; RV32-NEXT:    vsll.vx v12, v12, a1
+; RV32-NEXT:    vor.vv v10, v10, v12
+; RV32-NEXT:    vand.vx v12, v8, a3
+; RV32-NEXT:    vsll.vi v12, v12, 24
 ; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -335,25 +317,22 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
-; RV64-NEXT:    lui a2, 4080
-; RV64-NEXT:    vand.vx v10, v10, a2
+; RV64-NEXT:    lui a3, 4080
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vsrl.vi v11, v8, 8
-; RV64-NEXT:    li a2, 255
-; RV64-NEXT:    slli a3, a2, 24
-; RV64-NEXT:    vand.vx v11, v11, a3
+; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    slli a4, a4, 24
+; RV64-NEXT:    vand.vx v11, v11, a4
 ; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    vand.vx v10, v10, a3
-; RV64-NEXT:    vsll.vi v11, v8, 24
-; RV64-NEXT:    slli a3, a2, 40
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
+; RV64-NEXT:    vand.vx v10, v8, a3
+; RV64-NEXT:    vsll.vi v10, v10, 24
+; RV64-NEXT:    vand.vx v11, v8, a4
+; RV64-NEXT:    vsll.vi v11, v11, 8
+; RV64-NEXT:    vor.vv v10, v10, v11
 ; RV64-NEXT:    vsll.vx v11, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
-; RV64-NEXT:    slli a0, a2, 48
-; RV64-NEXT:    vand.vx v8, v8, a0
 ; RV64-NEXT:    vor.vv v8, v11, v8
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -371,41 +350,33 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    lui a0, 1044480
 ; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw zero, 8(sp)
-; RV32-NEXT:    li a1, 255
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a1, a1, -256
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v10, v8, a0
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    vsrl.vx v12, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v12, v12, a2
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v12, (a1), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 24
-; RV32-NEXT:    vand.vx v14, v14, a0
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    addi a3, sp, 8
+; RV32-NEXT:    vlse64.v v14, (a3), zero
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vand.vx v12, v12, a3
 ; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v12, v12, v14
-; RV32-NEXT:    vlse64.v v14, (a1), zero
+; RV32-NEXT:    vand.vv v16, v16, v14
+; RV32-NEXT:    vor.vv v12, v16, v12
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a2
-; RV32-NEXT:    vsll.vx v16, v8, a3
-; RV32-NEXT:    vand.vv v14, v16, v14
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vor.vv v12, v12, v14
-; RV32-NEXT:    vlse64.v v14, (a1), zero
-; RV32-NEXT:    vsll.vi v18, v8, 8
-; RV32-NEXT:    vand.vv v16, v18, v16
-; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vx v12, v8, a0
+; RV32-NEXT:    vand.vx v16, v8, a2
+; RV32-NEXT:    vsll.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v12, v12, v16
+; RV32-NEXT:    vand.vx v16, v8, a3
+; RV32-NEXT:    vsll.vi v16, v16, 24
 ; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -423,25 +394,22 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vor.vv v10, v12, v10
 ; RV64-NEXT:    vsrl.vi v12, v8, 24
-; RV64-NEXT:    lui a2, 4080
-; RV64-NEXT:    vand.vx v12, v12, a2
+; RV64-NEXT:    lui a3, 4080
+; RV64-NEXT:    vand.vx v12, v12, a3
 ; RV64-NEXT:    vsrl.vi v14, v8, 8
-; RV64-NEXT:    li a2, 255
-; RV64-NEXT:    slli a3, a2, 24
-; RV64-NEXT:    vand.vx v14, v14, a3
+; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    slli a4, a4, 24
+; RV64-NEXT:    vand.vx v14, v14, a4
 ; RV64-NEXT:    vor.vv v12, v14, v12
 ; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsll.vi v12, v8, 8
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    vand.vx v12, v12, a3
-; RV64-NEXT:    vsll.vi v14, v8, 24
-; RV64-NEXT:    slli a3, a2, 40
-; RV64-NEXT:    vand.vx v14, v14, a3
-; RV64-NEXT:    vor.vv v12, v14, v12
+; RV64-NEXT:    vand.vx v12, v8, a3
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v14, v8, a4
+; RV64-NEXT:    vsll.vi v14, v14, 8
+; RV64-NEXT:    vor.vv v12, v12, v14
 ; RV64-NEXT:    vsll.vx v14, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
-; RV64-NEXT:    slli a0, a2, 48
-; RV64-NEXT:    vand.vx v8, v8, a0
 ; RV64-NEXT:    vor.vv v8, v14, v8
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vor.vv v8, v8, v10
@@ -459,41 +427,33 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    lui a0, 1044480
 ; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw zero, 8(sp)
-; RV32-NEXT:    li a1, 255
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a1, a1, -256
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vx v12, v8, a0
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v16, v16, a2
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 24
-; RV32-NEXT:    vand.vx v20, v20, a0
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    addi a3, sp, 8
+; RV32-NEXT:    vlse64.v v20, (a3), zero
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vand.vx v16, v16, a3
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v16, v24, v16
-; RV32-NEXT:    vor.vv v16, v16, v20
-; RV32-NEXT:    vlse64.v v20, (a1), zero
+; RV32-NEXT:    vand.vv v24, v24, v20
+; RV32-NEXT:    vor.vv v16, v24, v16
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsll.vx v16, v8, a2
-; RV32-NEXT:    vsll.vx v24, v8, a3
-; RV32-NEXT:    vand.vv v20, v24, v20
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vor.vv v16, v16, v20
-; RV32-NEXT:    vlse64.v v20, (a1), zero
-; RV32-NEXT:    vsll.vi v28, v8, 8
-; RV32-NEXT:    vand.vv v24, v28, v24
-; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vx v16, v8, a0
+; RV32-NEXT:    vand.vx v24, v8, a2
+; RV32-NEXT:    vsll.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vand.vx v24, v8, a3
+; RV32-NEXT:    vsll.vi v24, v24, 24
 ; RV32-NEXT:    vand.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -511,25 +471,22 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vor.vv v12, v16, v12
 ; RV64-NEXT:    vsrl.vi v16, v8, 24
-; RV64-NEXT:    lui a2, 4080
-; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    lui a3, 4080
+; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    vsrl.vi v20, v8, 8
-; RV64-NEXT:    li a2, 255
-; RV64-NEXT:    slli a3, a2, 24
-; RV64-NEXT:    vand.vx v20, v20, a3
+; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    slli a4, a4, 24
+; RV64-NEXT:    vand.vx v20, v20, a4
 ; RV64-NEXT:    vor.vv v16, v20, v16
 ; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsll.vi v16, v8, 8
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vsll.vi v20, v8, 24
-; RV64-NEXT:    slli a3, a2, 40
-; RV64-NEXT:    vand.vx v20, v20, a3
-; RV64-NEXT:    vor.vv v16, v20, v16
+; RV64-NEXT:    vand.vx v16, v8, a3
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v20, v8, a4
+; RV64-NEXT:    vsll.vi v20, v20, 8
+; RV64-NEXT:    vor.vv v16, v16, v20
 ; RV64-NEXT:    vsll.vx v20, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
-; RV64-NEXT:    slli a0, a2, 48
-; RV64-NEXT:    vand.vx v8, v8, a0
 ; RV64-NEXT:    vor.vv v8, v20, v8
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vor.vv v8, v8, v12
@@ -545,74 +502,50 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    lui a0, 1044480
 ; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4080
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw zero, 8(sp)
-; RV32-NEXT:    li a1, 255
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    addi a1, a1, -256
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a1
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsrl.vx v0, v8, a2
-; RV32-NEXT:    vor.vv v16, v16, v0
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v24, v0, v24
+; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v16, v8, a0
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    vsrl.vx v24, v8, a1
+; RV32-NEXT:    lui a2, 16
+; RV32-NEXT:    addi a2, a2, -256
+; RV32-NEXT:    vand.vx v24, v24, a2
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    vand.vx v0, v0, a0
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8re8.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsll.vx v0, v8, a3
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsll.vx v0, v8, a2
-; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v0, (a1), zero
-; RV32-NEXT:    vsll.vi v16, v8, 8
+; RV32-NEXT:    addi a3, sp, 8
+; RV32-NEXT:    vlse64.v v24, (a3), zero
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vand.vx v0, v0, a3
+; RV32-NEXT:    vsrl.vi v16, v8, 8
 ; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vl8re8.v v0, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a2
+; RV32-NEXT:    vsll.vx v0, v0, a1
+; RV32-NEXT:    vsll.vx v16, v8, a0
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a3
 ; RV32-NEXT:    vsll.vi v8, v8, 24
-; RV32-NEXT:    vand.vv v8, v8, v0
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8re8.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -629,25 +562,22 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vor.vv v16, v24, v16
 ; RV64-NEXT:    vsrl.vi v24, v8, 24
-; RV64-NEXT:    lui a2, 4080
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    lui a3, 4080
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsrl.vi v0, v8, 8
-; RV64-NEXT:    li a2, 255
-; RV64-NEXT:    slli a3, a2, 24
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    slli a4, a4, 24
+; RV64-NEXT:    vand.vx v0, v0, a4
 ; RV64-NEXT:    vor.vv v24, v0, v24
 ; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsll.vi v24, v8, 8
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    vand.vx v24, v24, a3
-; RV64-NEXT:    vsll.vi v0, v8, 24
-; RV64-NEXT:    slli a3, a2, 40
-; RV64-NEXT:    vand.vx v0, v0, a3
-; RV64-NEXT:    vor.vv v24, v0, v24
+; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v0, v8, a4
+; RV64-NEXT:    vsll.vi v0, v0, 8
+; RV64-NEXT:    vor.vv v24, v24, v0
 ; RV64-NEXT:    vsll.vx v0, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
-; RV64-NEXT:    slli a0, a2, 48
-; RV64-NEXT:    vand.vx v8, v8, a0
 ; RV64-NEXT:    vor.vv v8, v0, v8
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vor.vv v8, v8, v16
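
(Aside, not part of the patch: a minimal scalar sketch of the i32 byte swap that the RV32/RV64 check lines above now verify, showing how a single 0xFF00 mask serves both the shift-right and shift-left halves. The function name and C types below are illustrative only.)

#include <stdint.h>

/* Illustrative only -- mirrors the expanded i32 sequence checked above:
   the 0xFF00 constant is applied after the right shift and before the
   left shift, so only one mask needs to be materialized. */
static uint32_t bswap32_sketch(uint32_t x) {
  uint32_t hi     = x << 24;               /* byte 0 -> byte 3 */
  uint32_t mid_hi = (x & 0xFF00u) << 8;    /* byte 1 -> byte 2 */
  uint32_t mid_lo = (x >> 8) & 0xFF00u;    /* byte 2 -> byte 1 */
  uint32_t lo     = x >> 24;               /* byte 3 -> byte 0 */
  return (hi | mid_hi) | (mid_lo | lo);
}
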

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index e7c82a097dd81..c213c430b033a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -85,9 +85,8 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; RV32-NEXT:    vand.vx v9, v9, a1
 ; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    vsll.vi v10, v8, 8
-; RV32-NEXT:    lui a1, 4080
-; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vand.vx v10, v8, a1
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -125,9 +124,8 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    lui a1, 4080
-; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -186,32 +184,19 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; RV32-NEXT:    vmerge.vxm v11, v11, a5, v0
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v11, v12, v11
-; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    vor.vv v10, v12, v10
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    li a5, 255
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a5
-; RV32-NEXT:    vmerge.vim v10, v10, 0, v0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vi v11, v8, 8
-; RV32-NEXT:    vand.vv v10, v11, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v11, a3
-; RV32-NEXT:    vmerge.vim v11, v11, 0, v0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vi v12, v8, 24
-; RV32-NEXT:    vand.vv v11, v12, v11
-; RV32-NEXT:    vor.vv v10, v11, v10
-; RV32-NEXT:    vsll.vx v11, v8, a2
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a4
-; RV32-NEXT:    vmerge.vim v12, v12, 0, v0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v11, v11, v12
-; RV32-NEXT:    vsll.vx v8, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsll.vx v10, v8, a1
+; RV32-NEXT:    vand.vx v12, v8, a3
+; RV32-NEXT:    vsll.vx v12, v12, a2
+; RV32-NEXT:    vor.vv v10, v10, v12
+; RV32-NEXT:    vand.vx v12, v8, a4
+; RV32-NEXT:    vsll.vi v12, v12, 24
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    lui a1, 61681
@@ -259,25 +244,22 @@ define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vor.vv v9, v10, v9
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
-; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v10, v10, a3
+; RV64-NEXT:    lui a4, 4080
+; RV64-NEXT:    vand.vx v10, v10, a4
 ; RV64-NEXT:    vsrl.vi v11, v8, 8
-; RV64-NEXT:    li a3, 255
-; RV64-NEXT:    slli a4, a3, 24
-; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    slli a5, a5, 24
+; RV64-NEXT:    vand.vx v11, v11, a5
 ; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    vand.vx v10, v10, a4
-; RV64-NEXT:    vsll.vi v11, v8, 24
-; RV64-NEXT:    slli a4, a3, 40
-; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vand.vx v10, v8, a5
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vand.vx v11, v8, a4
+; RV64-NEXT:    vsll.vi v11, v11, 24
 ; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vsll.vx v11, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
 ; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    slli a1, a3, 48
-; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vor.vv v8, v11, v8
 ; RV64-NEXT:    lui a1, %hi(.LCPI2_0)
 ; RV64-NEXT:    ld a1, %lo(.LCPI2_0)(a1)
@@ -497,9 +479,8 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX2-RV32-NEXT:    vand.vx v10, v10, a1
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v12, v8, 24
 ; LMULMAX2-RV32-NEXT:    vor.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT:    vsll.vi v12, v8, 8
-; LMULMAX2-RV32-NEXT:    lui a1, 4080
-; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a1
+; LMULMAX2-RV32-NEXT:    vand.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT:    vsll.vi v12, v12, 8
 ; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 24
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v12
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
@@ -537,9 +518,8 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX2-RV64-NEXT:    vand.vx v10, v10, a1
 ; LMULMAX2-RV64-NEXT:    vsrl.vi v12, v8, 24
 ; LMULMAX2-RV64-NEXT:    vor.vv v10, v10, v12
-; LMULMAX2-RV64-NEXT:    vsll.vi v12, v8, 8
-; LMULMAX2-RV64-NEXT:    lui a1, 4080
-; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a1
+; LMULMAX2-RV64-NEXT:    vand.vx v12, v8, a1
+; LMULMAX2-RV64-NEXT:    vsll.vi v12, v12, 8
 ; LMULMAX2-RV64-NEXT:    vsll.vi v8, v8, 24
 ; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v12
 ; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
@@ -579,55 +559,54 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a2
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT:    vsll.vi v11, v8, 8
-; LMULMAX1-RV32-NEXT:    lui a3, 4080
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT:    vsll.vi v11, v11, 8
 ; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v11
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT:    lui a4, 61681
-; LMULMAX1-RV32-NEXT:    addi a4, a4, -241
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a4
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a4
+; LMULMAX1-RV32-NEXT:    lui a3, 61681
+; LMULMAX1-RV32-NEXT:    addi a3, a3, -241
+; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a3
 ; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 4
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX1-RV32-NEXT:    lui a5, 209715
-; LMULMAX1-RV32-NEXT:    addi a5, a5, 819
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a5
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a5
+; LMULMAX1-RV32-NEXT:    lui a4, 209715
+; LMULMAX1-RV32-NEXT:    addi a4, a4, 819
+; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a4
+; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a4
 ; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 2
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT:    lui a6, 349525
-; LMULMAX1-RV32-NEXT:    addi a6, a6, 1365
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a6
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a6
+; LMULMAX1-RV32-NEXT:    lui a5, 349525
+; LMULMAX1-RV32-NEXT:    addi a5, a5, 1365
+; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a5
+; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a5
 ; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v8
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 8
 ; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a2
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v9, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT:    vsll.vi v11, v9, 8
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v9, a2
+; LMULMAX1-RV32-NEXT:    vsll.vi v11, v11, 8
 ; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v11
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a4
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a4
+; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a3
 ; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 4
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v10, v9
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 2
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a5
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a5
+; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a4
+; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a4
 ; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 2
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v10, v9
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a6
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a6
+; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a5
+; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a5
 ; LMULMAX1-RV32-NEXT:    vadd.vv v9, v9, v9
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v10, v9
 ; LMULMAX1-RV32-NEXT:    vse32.v v9, (a0)
@@ -646,55 +625,54 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a2
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v11, v8, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v10, v10, v11
-; LMULMAX1-RV64-NEXT:    vsll.vi v11, v8, 8
-; LMULMAX1-RV64-NEXT:    lui a3, 4080
-; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v11, v8, a2
+; LMULMAX1-RV64-NEXT:    vsll.vi v11, v11, 8
 ; LMULMAX1-RV64-NEXT:    vsll.vi v8, v8, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT:    lui a4, 61681
-; LMULMAX1-RV64-NEXT:    addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a4
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a4
+; LMULMAX1-RV64-NEXT:    lui a3, 61681
+; LMULMAX1-RV64-NEXT:    addiw a3, a3, -241
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a3
 ; LMULMAX1-RV64-NEXT:    vsll.vi v8, v8, 4
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX1-RV64-NEXT:    lui a5, 209715
-; LMULMAX1-RV64-NEXT:    addiw a5, a5, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a5
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a5
+; LMULMAX1-RV64-NEXT:    lui a4, 209715
+; LMULMAX1-RV64-NEXT:    addiw a4, a4, 819
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a4
+; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a4
 ; LMULMAX1-RV64-NEXT:    vsll.vi v8, v8, 2
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT:    lui a6, 349525
-; LMULMAX1-RV64-NEXT:    addiw a6, a6, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a6
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a6
+; LMULMAX1-RV64-NEXT:    lui a5, 349525
+; LMULMAX1-RV64-NEXT:    addiw a5, a5, 1365
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a5
+; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a5
 ; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v8
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 8
 ; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a2
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v11, v9, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v10, v10, v11
-; LMULMAX1-RV64-NEXT:    vsll.vi v11, v9, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v11, v9, a2
+; LMULMAX1-RV64-NEXT:    vsll.vi v11, v11, 8
 ; LMULMAX1-RV64-NEXT:    vsll.vi v9, v9, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a4
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a4
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a3
 ; LMULMAX1-RV64-NEXT:    vsll.vi v9, v9, 4
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v10, v9
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a5
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a5
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a4
+; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a4
 ; LMULMAX1-RV64-NEXT:    vsll.vi v9, v9, 2
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v10, v9
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a6
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a6
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a5
+; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a5
 ; LMULMAX1-RV64-NEXT:    vadd.vv v9, v9, v9
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v10, v9
 ; LMULMAX1-RV64-NEXT:    vse32.v v9, (a0)
@@ -732,32 +710,19 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV32-NEXT:    vmerge.vxm v14, v14, a5, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v16, v8, 8
-; LMULMAX2-RV32-NEXT:    vand.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT:    vor.vv v12, v14, v12
+; LMULMAX2-RV32-NEXT:    vand.vv v16, v16, v14
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v16, v12
 ; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT:    li a5, 255
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v12, a5
-; LMULMAX2-RV32-NEXT:    vmerge.vim v12, v12, 0, v0
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vsll.vi v14, v8, 8
-; LMULMAX2-RV32-NEXT:    vand.vv v12, v14, v12
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v14, a3
-; LMULMAX2-RV32-NEXT:    vmerge.vim v14, v14, 0, v0
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vsll.vi v16, v8, 24
-; LMULMAX2-RV32-NEXT:    vand.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT:    vor.vv v12, v14, v12
-; LMULMAX2-RV32-NEXT:    vsll.vx v14, v8, a2
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v16, a4
-; LMULMAX2-RV32-NEXT:    vmerge.vim v16, v16, 0, v0
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v14, v14, v16
-; LMULMAX2-RV32-NEXT:    vsll.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v14
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v12
+; LMULMAX2-RV32-NEXT:    vsll.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a3
+; LMULMAX2-RV32-NEXT:    vsll.vx v16, v16, a2
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v16
+; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a4
+; LMULMAX2-RV32-NEXT:    vsll.vi v16, v16, 24
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 8
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v16, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v12, v8
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
 ; LMULMAX2-RV32-NEXT:    lui a1, 61681
@@ -805,25 +770,22 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a3
 ; LMULMAX2-RV64-NEXT:    vor.vv v10, v12, v10
 ; LMULMAX2-RV64-NEXT:    vsrl.vi v12, v8, 24
-; LMULMAX2-RV64-NEXT:    lui a3, 4080
-; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a3
+; LMULMAX2-RV64-NEXT:    lui a4, 4080
+; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a4
 ; LMULMAX2-RV64-NEXT:    vsrl.vi v14, v8, 8
-; LMULMAX2-RV64-NEXT:    li a3, 255
-; LMULMAX2-RV64-NEXT:    slli a4, a3, 24
-; LMULMAX2-RV64-NEXT:    vand.vx v14, v14, a4
+; LMULMAX2-RV64-NEXT:    li a5, 255
+; LMULMAX2-RV64-NEXT:    slli a5, a5, 24
+; LMULMAX2-RV64-NEXT:    vand.vx v14, v14, a5
 ; LMULMAX2-RV64-NEXT:    vor.vv v12, v14, v12
 ; LMULMAX2-RV64-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV64-NEXT:    vsll.vi v12, v8, 8
-; LMULMAX2-RV64-NEXT:    slli a4, a3, 32
-; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a4
-; LMULMAX2-RV64-NEXT:    vsll.vi v14, v8, 24
-; LMULMAX2-RV64-NEXT:    slli a4, a3, 40
-; LMULMAX2-RV64-NEXT:    vand.vx v14, v14, a4
+; LMULMAX2-RV64-NEXT:    vand.vx v12, v8, a5
+; LMULMAX2-RV64-NEXT:    vsll.vi v12, v12, 8
+; LMULMAX2-RV64-NEXT:    vand.vx v14, v8, a4
+; LMULMAX2-RV64-NEXT:    vsll.vi v14, v14, 24
 ; LMULMAX2-RV64-NEXT:    vor.vv v12, v14, v12
 ; LMULMAX2-RV64-NEXT:    vsll.vx v14, v8, a1
+; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a3
 ; LMULMAX2-RV64-NEXT:    vsll.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    slli a1, a3, 48
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
 ; LMULMAX2-RV64-NEXT:    vor.vv v8, v14, v8
 ; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI5_0)
 ; LMULMAX2-RV64-NEXT:    ld a1, %lo(.LCPI5_0)(a1)
@@ -855,19 +817,19 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32:       # %bb.0:
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v12, (a1)
+; LMULMAX1-RV32-NEXT:    vle64.v v10, (a1)
 ; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
 ; LMULMAX1-RV32-NEXT:    li a2, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v9, v12, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v9, v10, a2
 ; LMULMAX1-RV32-NEXT:    li a3, 40
-; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v12, a3
+; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v10, a3
 ; LMULMAX1-RV32-NEXT:    lui a4, 16
 ; LMULMAX1-RV32-NEXT:    addi a4, a4, -256
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v12, 24
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v9
+; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v10, 24
 ; LMULMAX1-RV32-NEXT:    lui a5, 4080
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v9, a5
+; LMULMAX1-RV32-NEXT:    vand.vx v12, v9, a5
 ; LMULMAX1-RV32-NEXT:    li a6, 5
 ; LMULMAX1-RV32-NEXT:    vmv.s.x v0, a6
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
@@ -875,102 +837,89 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT:    lui a6, 1044480
 ; LMULMAX1-RV32-NEXT:    vmerge.vxm v9, v9, a6, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v12, 8
+; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v10, 8
 ; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v9
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
-; LMULMAX1-RV32-NEXT:    vor.vv v13, v11, v10
-; LMULMAX1-RV32-NEXT:    li a6, 255
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a6
-; LMULMAX1-RV32-NEXT:    vmerge.vim v10, v10, 0, v0
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vsll.vi v11, v12, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v14, v11, v10
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v11, a4
-; LMULMAX1-RV32-NEXT:    vmerge.vim v11, v11, 0, v0
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vsll.vi v15, v12, 24
-; LMULMAX1-RV32-NEXT:    vand.vv v15, v15, v11
-; LMULMAX1-RV32-NEXT:    vor.vv v14, v15, v14
-; LMULMAX1-RV32-NEXT:    vsll.vx v15, v12, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v16, a5
-; LMULMAX1-RV32-NEXT:    vmerge.vim v16, v16, 0, v0
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v15, v15, v16
-; LMULMAX1-RV32-NEXT:    vsll.vx v12, v12, a2
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v15
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v14
+; LMULMAX1-RV32-NEXT:    vor.vv v12, v13, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v12, v11
+; LMULMAX1-RV32-NEXT:    vsll.vx v12, v10, a2
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v10, a4
+; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
 ; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v12, 4
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v10, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 24
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v9
+; LMULMAX1-RV32-NEXT:    vsll.vi v10, v10, 8
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v13, v10
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v12, v10
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v11
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v10, 4
 ; LMULMAX1-RV32-NEXT:    lui a6, 61681
 ; LMULMAX1-RV32-NEXT:    addi a6, a6, -241
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v14, a6
+; LMULMAX1-RV32-NEXT:    vmv.v.x v12, a6
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v14
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v14
-; LMULMAX1-RV32-NEXT:    vsll.vi v12, v12, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v13, v12
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v12, 2
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v12
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v12
+; LMULMAX1-RV32-NEXT:    vsll.vi v10, v10, 4
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v10, 2
 ; LMULMAX1-RV32-NEXT:    lui a6, 209715
 ; LMULMAX1-RV32-NEXT:    addi a6, a6, 819
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v15, a6
+; LMULMAX1-RV32-NEXT:    vmv.v.x v13, a6
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v15
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v15
-; LMULMAX1-RV32-NEXT:    vsll.vi v12, v12, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v13, v12
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v12, 1
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v13
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v13
+; LMULMAX1-RV32-NEXT:    vsll.vi v10, v10, 2
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v10, 1
 ; LMULMAX1-RV32-NEXT:    lui a6, 349525
 ; LMULMAX1-RV32-NEXT:    addi a6, a6, 1365
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v17, a6
+; LMULMAX1-RV32-NEXT:    vmv.v.x v14, a6
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v17
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v17
-; LMULMAX1-RV32-NEXT:    vadd.vv v12, v12, v12
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v13, v12
-; LMULMAX1-RV32-NEXT:    vsrl.vx v13, v8, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vx v18, v8, a3
-; LMULMAX1-RV32-NEXT:    vand.vx v18, v18, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v13, v18, v13
-; LMULMAX1-RV32-NEXT:    vsrl.vi v18, v8, 24
-; LMULMAX1-RV32-NEXT:    vand.vx v18, v18, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v19, v8, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v19, v9
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v18
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v13
-; LMULMAX1-RV32-NEXT:    vsll.vi v13, v8, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v13, v10
-; LMULMAX1-RV32-NEXT:    vsll.vi v13, v8, 24
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v13, v11
+; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v14
+; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v14
+; LMULMAX1-RV32-NEXT:    vadd.vv v10, v10, v10
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsll.vx v11, v8, a3
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v16
-; LMULMAX1-RV32-NEXT:    vsll.vx v8, v8, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v15, v8, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v15, v15, a4
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v15, v11
+; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v8, 24
+; LMULMAX1-RV32-NEXT:    vand.vx v15, v15, a5
+; LMULMAX1-RV32-NEXT:    vsrl.vi v16, v8, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v16, v16, v9
+; LMULMAX1-RV32-NEXT:    vor.vv v15, v16, v15
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v15, v11
+; LMULMAX1-RV32-NEXT:    vsll.vx v15, v8, a2
+; LMULMAX1-RV32-NEXT:    vand.vx v16, v8, a4
+; LMULMAX1-RV32-NEXT:    vsll.vx v16, v16, a3
+; LMULMAX1-RV32-NEXT:    vor.vv v15, v15, v16
+; LMULMAX1-RV32-NEXT:    vand.vx v16, v8, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v16, v16, 24
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v16, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v15, v8
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v14
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v14
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v12
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v12
 ; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 4
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v9, v8
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v15
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v13
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v13
 ; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 2
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v9, v8
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v17
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v17
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v14
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v14
 ; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v8
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v9, v8
 ; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v12, (a1)
+; LMULMAX1-RV32-NEXT:    vse64.v v10, (a1)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bitreverse_v4i64:
@@ -992,43 +941,40 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, a5
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v12, v9, 8
 ; LMULMAX1-RV64-NEXT:    li a6, 255
-; LMULMAX1-RV64-NEXT:    slli a7, a6, 24
-; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, a7
+; LMULMAX1-RV64-NEXT:    slli a6, a6, 24
+; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, a6
 ; LMULMAX1-RV64-NEXT:    vor.vv v11, v12, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV64-NEXT:    vsll.vi v11, v9, 8
-; LMULMAX1-RV64-NEXT:    slli t0, a6, 32
-; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, t0
-; LMULMAX1-RV64-NEXT:    vsll.vi v12, v9, 24
-; LMULMAX1-RV64-NEXT:    slli t1, a6, 40
-; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, t1
+; LMULMAX1-RV64-NEXT:    vand.vx v11, v9, a6
+; LMULMAX1-RV64-NEXT:    vsll.vi v11, v11, 8
+; LMULMAX1-RV64-NEXT:    vand.vx v12, v9, a5
+; LMULMAX1-RV64-NEXT:    vsll.vi v12, v12, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v11, v12, v11
 ; LMULMAX1-RV64-NEXT:    vsll.vx v12, v9, a2
+; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a4
 ; LMULMAX1-RV64-NEXT:    vsll.vx v9, v9, a3
-; LMULMAX1-RV64-NEXT:    slli a6, a6, 48
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a6
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v12, v9
-; LMULMAX1-RV64-NEXT:    lui t2, %hi(.LCPI5_0)
-; LMULMAX1-RV64-NEXT:    ld t2, %lo(.LCPI5_0)(t2)
+; LMULMAX1-RV64-NEXT:    lui a7, %hi(.LCPI5_0)
+; LMULMAX1-RV64-NEXT:    ld a7, %lo(.LCPI5_0)(a7)
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t2
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, t2
-; LMULMAX1-RV64-NEXT:    lui t3, %hi(.LCPI5_1)
-; LMULMAX1-RV64-NEXT:    ld t3, %lo(.LCPI5_1)(t3)
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a7
+; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a7
+; LMULMAX1-RV64-NEXT:    lui t0, %hi(.LCPI5_1)
+; LMULMAX1-RV64-NEXT:    ld t0, %lo(.LCPI5_1)(t0)
 ; LMULMAX1-RV64-NEXT:    vsll.vi v9, v9, 4
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v10, v9
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t3
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, t3
-; LMULMAX1-RV64-NEXT:    lui t4, %hi(.LCPI5_2)
-; LMULMAX1-RV64-NEXT:    ld t4, %lo(.LCPI5_2)(t4)
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t0
+; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, t0
+; LMULMAX1-RV64-NEXT:    lui t1, %hi(.LCPI5_2)
+; LMULMAX1-RV64-NEXT:    ld t1, %lo(.LCPI5_2)(t1)
 ; LMULMAX1-RV64-NEXT:    vsll.vi v9, v9, 2
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v10, v9
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t4
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, t4
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t1
+; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, t1
 ; LMULMAX1-RV64-NEXT:    vadd.vv v9, v9, v9
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v10, v9
 ; LMULMAX1-RV64-NEXT:    vsrl.vx v10, v8, a2
@@ -1038,33 +984,33 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v11, v8, 24
 ; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, a5
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v12, v8, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, a7
+; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, a6
 ; LMULMAX1-RV64-NEXT:    vor.vv v11, v12, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV64-NEXT:    vsll.vi v11, v8, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, t0
-; LMULMAX1-RV64-NEXT:    vsll.vi v12, v8, 24
-; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, t1
+; LMULMAX1-RV64-NEXT:    vand.vx v11, v8, a6
+; LMULMAX1-RV64-NEXT:    vsll.vi v11, v11, 8
+; LMULMAX1-RV64-NEXT:    vand.vx v12, v8, a5
+; LMULMAX1-RV64-NEXT:    vsll.vi v12, v12, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v11, v12, v11
 ; LMULMAX1-RV64-NEXT:    vsll.vx v12, v8, a2
+; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a4
 ; LMULMAX1-RV64-NEXT:    vsll.vx v8, v8, a3
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a6
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v12, v8
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t2
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, t2
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a7
+; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a7
 ; LMULMAX1-RV64-NEXT:    vsll.vi v8, v8, 4
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t3
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, t3
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t0
+; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, t0
 ; LMULMAX1-RV64-NEXT:    vsll.vi v8, v8, 2
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t4
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, t4
+; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, t1
+; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, t1
 ; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v8
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v10, v8
 ; LMULMAX1-RV64-NEXT:    vse64.v v8, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
index 8251697878b84..5837dba7b8cd9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -33,9 +33,8 @@ define void @bswap_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; RV32-NEXT:    vand.vx v9, v9, a1
 ; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    vsll.vi v10, v8, 8
-; RV32-NEXT:    lui a1, 4080
-; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vand.vx v10, v8, a1
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -52,9 +51,8 @@ define void @bswap_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
 ; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    lui a1, 4080
-; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vsll.vi v8, v8, 24
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -92,32 +90,19 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; RV32-NEXT:    vmerge.vxm v11, v11, a5, v0
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v11, v12, v11
-; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    vor.vv v10, v12, v10
 ; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    li a5, 255
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a5
-; RV32-NEXT:    vmerge.vim v10, v10, 0, v0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vi v11, v8, 8
-; RV32-NEXT:    vand.vv v10, v11, v10
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v11, a3
-; RV32-NEXT:    vmerge.vim v11, v11, 0, v0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vi v12, v8, 24
-; RV32-NEXT:    vand.vv v11, v12, v11
-; RV32-NEXT:    vor.vv v10, v11, v10
-; RV32-NEXT:    vsll.vx v11, v8, a2
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a4
-; RV32-NEXT:    vmerge.vim v12, v12, 0, v0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v11, v11, v12
-; RV32-NEXT:    vsll.vx v8, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsll.vx v10, v8, a1
+; RV32-NEXT:    vand.vx v12, v8, a3
+; RV32-NEXT:    vsll.vx v12, v12, a2
+; RV32-NEXT:    vor.vv v10, v10, v12
+; RV32-NEXT:    vand.vx v12, v8, a4
+; RV32-NEXT:    vsll.vi v12, v12, 24
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vsll.vi v8, v8, 8
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
@@ -135,25 +120,22 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
 ; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vor.vv v9, v10, v9
 ; RV64-NEXT:    vsrl.vi v10, v8, 24
-; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v10, v10, a3
+; RV64-NEXT:    lui a4, 4080
+; RV64-NEXT:    vand.vx v10, v10, a4
 ; RV64-NEXT:    vsrl.vi v11, v8, 8
-; RV64-NEXT:    li a3, 255
-; RV64-NEXT:    slli a4, a3, 24
-; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    slli a5, a5, 24
+; RV64-NEXT:    vand.vx v11, v11, a5
 ; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsll.vi v10, v8, 8
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    vand.vx v10, v10, a4
-; RV64-NEXT:    vsll.vi v11, v8, 24
-; RV64-NEXT:    slli a4, a3, 40
-; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vand.vx v10, v8, a5
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vand.vx v11, v8, a4
+; RV64-NEXT:    vsll.vi v11, v11, 24
 ; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vsll.vx v11, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
 ; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    slli a1, a3, 48
-; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vor.vv v8, v11, v8
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -238,9 +220,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX2-RV32-NEXT:    vand.vx v10, v10, a1
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v12, v8, 24
 ; LMULMAX2-RV32-NEXT:    vor.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT:    vsll.vi v12, v8, 8
-; LMULMAX2-RV32-NEXT:    lui a1, 4080
-; LMULMAX2-RV32-NEXT:    vand.vx v12, v12, a1
+; LMULMAX2-RV32-NEXT:    vand.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT:    vsll.vi v12, v12, 8
 ; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 24
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v12
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
@@ -257,9 +238,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX2-RV64-NEXT:    vand.vx v10, v10, a1
 ; LMULMAX2-RV64-NEXT:    vsrl.vi v12, v8, 24
 ; LMULMAX2-RV64-NEXT:    vor.vv v10, v10, v12
-; LMULMAX2-RV64-NEXT:    vsll.vi v12, v8, 8
-; LMULMAX2-RV64-NEXT:    lui a1, 4080
-; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a1
+; LMULMAX2-RV64-NEXT:    vand.vx v12, v8, a1
+; LMULMAX2-RV64-NEXT:    vsll.vi v12, v12, 8
 ; LMULMAX2-RV64-NEXT:    vsll.vi v8, v8, 24
 ; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v12
 ; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
@@ -278,9 +258,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a2
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT:    vsll.vi v11, v8, 8
-; LMULMAX1-RV32-NEXT:    lui a3, 4080
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT:    vsll.vi v11, v11, 8
 ; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v11
 ; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
@@ -288,8 +267,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a2
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v9, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT:    vsll.vi v11, v9, 8
-; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v9, a2
+; LMULMAX1-RV32-NEXT:    vsll.vi v11, v11, 8
 ; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 24
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v11
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
@@ -309,9 +288,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a2
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v11, v8, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v10, v10, v11
-; LMULMAX1-RV64-NEXT:    vsll.vi v11, v8, 8
-; LMULMAX1-RV64-NEXT:    lui a3, 4080
-; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v11, v8, a2
+; LMULMAX1-RV64-NEXT:    vsll.vi v11, v11, 8
 ; LMULMAX1-RV64-NEXT:    vsll.vi v8, v8, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
@@ -319,8 +297,8 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) {
 ; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a2
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v11, v9, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v10, v10, v11
-; LMULMAX1-RV64-NEXT:    vsll.vi v11, v9, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, a3
+; LMULMAX1-RV64-NEXT:    vand.vx v11, v9, a2
+; LMULMAX1-RV64-NEXT:    vsll.vi v11, v11, 8
 ; LMULMAX1-RV64-NEXT:    vsll.vi v9, v9, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
@@ -359,32 +337,19 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV32-NEXT:    vmerge.vxm v14, v14, a5, v0
 ; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX2-RV32-NEXT:    vsrl.vi v16, v8, 8
-; LMULMAX2-RV32-NEXT:    vand.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT:    vor.vv v12, v14, v12
+; LMULMAX2-RV32-NEXT:    vand.vv v16, v16, v14
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v16, v12
 ; LMULMAX2-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV32-NEXT:    li a5, 255
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v12, a5
-; LMULMAX2-RV32-NEXT:    vmerge.vim v12, v12, 0, v0
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vsll.vi v14, v8, 8
-; LMULMAX2-RV32-NEXT:    vand.vv v12, v14, v12
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v14, a3
-; LMULMAX2-RV32-NEXT:    vmerge.vim v14, v14, 0, v0
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vsll.vi v16, v8, 24
-; LMULMAX2-RV32-NEXT:    vand.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT:    vor.vv v12, v14, v12
-; LMULMAX2-RV32-NEXT:    vsll.vx v14, v8, a2
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v16, a4
-; LMULMAX2-RV32-NEXT:    vmerge.vim v16, v16, 0, v0
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v14, v14, v16
-; LMULMAX2-RV32-NEXT:    vsll.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v14
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v12
+; LMULMAX2-RV32-NEXT:    vsll.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a3
+; LMULMAX2-RV32-NEXT:    vsll.vx v16, v16, a2
+; LMULMAX2-RV32-NEXT:    vor.vv v12, v12, v16
+; LMULMAX2-RV32-NEXT:    vand.vx v16, v8, a4
+; LMULMAX2-RV32-NEXT:    vsll.vi v16, v16, 24
+; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT:    vsll.vi v8, v8, 8
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v16, v8
+; LMULMAX2-RV32-NEXT:    vor.vv v8, v12, v8
 ; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
 ; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
 ; LMULMAX2-RV32-NEXT:    ret
@@ -402,25 +367,22 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a3
 ; LMULMAX2-RV64-NEXT:    vor.vv v10, v12, v10
 ; LMULMAX2-RV64-NEXT:    vsrl.vi v12, v8, 24
-; LMULMAX2-RV64-NEXT:    lui a3, 4080
-; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a3
+; LMULMAX2-RV64-NEXT:    lui a4, 4080
+; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a4
 ; LMULMAX2-RV64-NEXT:    vsrl.vi v14, v8, 8
-; LMULMAX2-RV64-NEXT:    li a3, 255
-; LMULMAX2-RV64-NEXT:    slli a4, a3, 24
-; LMULMAX2-RV64-NEXT:    vand.vx v14, v14, a4
+; LMULMAX2-RV64-NEXT:    li a5, 255
+; LMULMAX2-RV64-NEXT:    slli a5, a5, 24
+; LMULMAX2-RV64-NEXT:    vand.vx v14, v14, a5
 ; LMULMAX2-RV64-NEXT:    vor.vv v12, v14, v12
 ; LMULMAX2-RV64-NEXT:    vor.vv v10, v12, v10
-; LMULMAX2-RV64-NEXT:    vsll.vi v12, v8, 8
-; LMULMAX2-RV64-NEXT:    slli a4, a3, 32
-; LMULMAX2-RV64-NEXT:    vand.vx v12, v12, a4
-; LMULMAX2-RV64-NEXT:    vsll.vi v14, v8, 24
-; LMULMAX2-RV64-NEXT:    slli a4, a3, 40
-; LMULMAX2-RV64-NEXT:    vand.vx v14, v14, a4
+; LMULMAX2-RV64-NEXT:    vand.vx v12, v8, a5
+; LMULMAX2-RV64-NEXT:    vsll.vi v12, v12, 8
+; LMULMAX2-RV64-NEXT:    vand.vx v14, v8, a4
+; LMULMAX2-RV64-NEXT:    vsll.vi v14, v14, 24
 ; LMULMAX2-RV64-NEXT:    vor.vv v12, v14, v12
 ; LMULMAX2-RV64-NEXT:    vsll.vx v14, v8, a1
+; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a3
 ; LMULMAX2-RV64-NEXT:    vsll.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    slli a1, a3, 48
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
 ; LMULMAX2-RV64-NEXT:    vor.vv v8, v14, v8
 ; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v12
 ; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
@@ -431,17 +393,17 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32:       # %bb.0:
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v9, (a1)
-; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
+; LMULMAX1-RV32-NEXT:    vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT:    vle64.v v9, (a0)
 ; LMULMAX1-RV32-NEXT:    li a2, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v9, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v8, a2
 ; LMULMAX1-RV32-NEXT:    li a3, 40
-; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v9, a3
+; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v8, a3
 ; LMULMAX1-RV32-NEXT:    lui a4, 16
 ; LMULMAX1-RV32-NEXT:    addi a4, a4, -256
 ; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v9, 24
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 24
 ; LMULMAX1-RV32-NEXT:    lui a5, 4080
 ; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a5
 ; LMULMAX1-RV32-NEXT:    li a6, 5
@@ -451,57 +413,44 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV32-NEXT:    lui a6, 1044480
 ; LMULMAX1-RV32-NEXT:    vmerge.vxm v12, v12, a6, v0
 ; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v8, 8
+; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v12
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT:    vsll.vx v11, v8, a2
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v8, a4
+; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v13
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v8, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 24
+; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT:    vsll.vi v8, v8, 8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v13, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v11, v8
+; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v9, a2
+; LMULMAX1-RV32-NEXT:    vsrl.vx v11, v9, a3
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a4
+; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
+; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v9, 24
+; LMULMAX1-RV32-NEXT:    vand.vx v11, v11, a5
 ; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v9, 8
 ; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v12
 ; LMULMAX1-RV32-NEXT:    vor.vv v11, v13, v11
 ; LMULMAX1-RV32-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV32-NEXT:    li a6, 255
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v11, a6
-; LMULMAX1-RV32-NEXT:    vmerge.vim v11, v11, 0, v0
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vsll.vi v13, v9, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v13, v11
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v14, a4
-; LMULMAX1-RV32-NEXT:    vmerge.vim v14, v14, 0, v0
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vsll.vi v15, v9, 24
-; LMULMAX1-RV32-NEXT:    vand.vv v15, v15, v14
-; LMULMAX1-RV32-NEXT:    vor.vv v13, v15, v13
-; LMULMAX1-RV32-NEXT:    vsll.vx v15, v9, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v16, a5
-; LMULMAX1-RV32-NEXT:    vmerge.vim v16, v16, 0, v0
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v15, v15, v16
-; LMULMAX1-RV32-NEXT:    vsll.vx v9, v9, a2
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v13
+; LMULMAX1-RV32-NEXT:    vsll.vx v11, v9, a2
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v9, a4
+; LMULMAX1-RV32-NEXT:    vsll.vx v13, v13, a3
+; LMULMAX1-RV32-NEXT:    vor.vv v11, v11, v13
+; LMULMAX1-RV32-NEXT:    vand.vx v13, v9, a5
+; LMULMAX1-RV32-NEXT:    vsll.vi v13, v13, 24
+; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v12
+; LMULMAX1-RV32-NEXT:    vsll.vi v9, v9, 8
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v13, v9
+; LMULMAX1-RV32-NEXT:    vor.vv v9, v11, v9
 ; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v8, a2
-; LMULMAX1-RV32-NEXT:    vsrl.vx v13, v8, a3
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v13, a4
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v13, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v8, 24
-; LMULMAX1-RV32-NEXT:    vand.vx v13, v13, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v8, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v15, v12
-; LMULMAX1-RV32-NEXT:    vor.vv v12, v12, v13
-; LMULMAX1-RV32-NEXT:    vor.vv v10, v12, v10
-; LMULMAX1-RV32-NEXT:    vsll.vi v12, v8, 8
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v12, v11
-; LMULMAX1-RV32-NEXT:    vsll.vi v12, v8, 24
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v14
-; LMULMAX1-RV32-NEXT:    vor.vv v11, v12, v11
-; LMULMAX1-RV32-NEXT:    vsll.vx v12, v8, a3
-; LMULMAX1-RV32-NEXT:    vand.vv v12, v12, v16
-; LMULMAX1-RV32-NEXT:    vsll.vx v8, v8, a2
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v12
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v9, (a1)
+; LMULMAX1-RV32-NEXT:    vse64.v v9, (a0)
+; LMULMAX1-RV32-NEXT:    vse64.v v8, (a1)
 ; LMULMAX1-RV32-NEXT:    ret
 ;
 ; LMULMAX1-RV64-LABEL: bswap_v4i64:
@@ -523,21 +472,18 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, a5
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v12, v8, 8
 ; LMULMAX1-RV64-NEXT:    li a6, 255
-; LMULMAX1-RV64-NEXT:    slli a7, a6, 24
-; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, a7
+; LMULMAX1-RV64-NEXT:    slli a6, a6, 24
+; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, a6
 ; LMULMAX1-RV64-NEXT:    vor.vv v11, v12, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV64-NEXT:    vsll.vi v11, v8, 8
-; LMULMAX1-RV64-NEXT:    slli t0, a6, 32
-; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, t0
-; LMULMAX1-RV64-NEXT:    vsll.vi v12, v8, 24
-; LMULMAX1-RV64-NEXT:    slli t1, a6, 40
-; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, t1
+; LMULMAX1-RV64-NEXT:    vand.vx v11, v8, a6
+; LMULMAX1-RV64-NEXT:    vsll.vi v11, v11, 8
+; LMULMAX1-RV64-NEXT:    vand.vx v12, v8, a5
+; LMULMAX1-RV64-NEXT:    vsll.vi v12, v12, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v11, v12, v11
 ; LMULMAX1-RV64-NEXT:    vsll.vx v12, v8, a2
+; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a4
 ; LMULMAX1-RV64-NEXT:    vsll.vx v8, v8, a3
-; LMULMAX1-RV64-NEXT:    slli a6, a6, 48
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a6
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v12, v8
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
@@ -548,17 +494,17 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) {
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v11, v9, 24
 ; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, a5
 ; LMULMAX1-RV64-NEXT:    vsrl.vi v12, v9, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, a7
+; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, a6
 ; LMULMAX1-RV64-NEXT:    vor.vv v11, v12, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v10, v11, v10
-; LMULMAX1-RV64-NEXT:    vsll.vi v11, v9, 8
-; LMULMAX1-RV64-NEXT:    vand.vx v11, v11, t0
-; LMULMAX1-RV64-NEXT:    vsll.vi v12, v9, 24
-; LMULMAX1-RV64-NEXT:    vand.vx v12, v12, t1
+; LMULMAX1-RV64-NEXT:    vand.vx v11, v9, a6
+; LMULMAX1-RV64-NEXT:    vsll.vi v11, v11, 8
+; LMULMAX1-RV64-NEXT:    vand.vx v12, v9, a5
+; LMULMAX1-RV64-NEXT:    vsll.vi v12, v12, 24
 ; LMULMAX1-RV64-NEXT:    vor.vv v11, v12, v11
 ; LMULMAX1-RV64-NEXT:    vsll.vx v12, v9, a2
+; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a4
 ; LMULMAX1-RV64-NEXT:    vsll.vx v9, v9, a3
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a6
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v12, v9
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v11
 ; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
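
Throughout the updated check lines the shape is the same: a "vsll.* then vand.vx with a newly materialized mask" pair becomes "vand.vx with an already-live mask, then vsll.*", so constants such as 0xFF00 (lui 16 / addi -256), 0xFF0000 (lui 4080) and 255<<24 now serve both the right-shift and the left-shift halves. A minimal scalar sketch of that mask-then-shift form, in plain C -- illustrative only, not the actual SelectionDAG expansion, and the name bswap32_sketch is made up for this note:

  #include <stdint.h>
  #include <stdio.h>

  /* "Mask first, then shift left" byte swap: both halves reuse the single
     0x0000FF00 mask, so no extra mask constant has to be materialized. */
  static uint32_t bswap32_sketch(uint32_t x) {
    uint32_t hi = (x << 24) | ((x & 0x0000FF00u) << 8);  /* bytes 0,1 -> 3,2 */
    uint32_t lo = ((x >> 8) & 0x0000FF00u) | (x >> 24);  /* bytes 2,3 -> 1,0 */
    return hi | lo;
  }

  int main(void) {
    printf("%08x\n", bswap32_sketch(0x11223344u));  /* prints 44332211 */
    return 0;
  }

The 64-bit hunks above follow the same idea, reusing the 0xFF00, 0xFF0000 and 255<<24 masks ahead of the vsll.vx/vsll.vi steps instead of building new shifted masks afterwards.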