[llvm] 824251c - Revert "[RISCV] Generalize reduction tree matching to all integer reductions (#68014)"

Alex Bradbury via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 4 04:51:42 PDT 2023


Author: Alex Bradbury
Date: 2023-10-04T12:51:01+01:00
New Revision: 824251c9b349d859a9169196cd9533c619a715ce

URL: https://github.com/llvm/llvm-project/commit/824251c9b349d859a9169196cd9533c619a715ce
DIFF: https://github.com/llvm/llvm-project/commit/824251c9b349d859a9169196cd9533c619a715ce.diff

LOG: Revert "[RISCV] Generalize reduction tree matching to all integer reductions (#68014)"

This reverts commit 7a0b9daac9edde4293d2e9fdf30d8b35c04d16a6 and
63bbc250440141b1c51593904fba9bdaa6724280.

I'm seeing issues (e.g. on the GCC torture suite) where
combineBinOpOfExtractToReduceTree is called when the V extensions aren't
enabled and crashes because RISCVSubtarget::getELen asserts.

I'll aim to follow up with a minimal reproducer. Although it's fairly
obvious how to avoid this crash with some extra gating, there are a few
options as to where that gating should be inserted, so I think it's best
to revert and agree on the appropriate fix separately.
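
For context, here is a minimal sketch of one possible gating point, in
case it helps that discussion. The early return and its placement at the
top of the combine are assumptions on my part rather than the agreed fix,
and the matching logic from the reverted patch is elided:

  // RISCVISelLowering.cpp -- sketch only, not the reviewed fix.
  static SDValue
  combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
    // Bail out before any vector-specific queries such as
    // Subtarget.getELen(), which asserts when no vector extension is
    // enabled.
    if (!Subtarget.hasVInstructions())
      return SDValue();

    // ... existing extract/reduce matching logic from the reverted
    // patch would follow here ...
    return SDValue();
  }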

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 413af1ff4b9439a..84a5223f91f0158 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -11112,31 +11112,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
   }
 }
 
-/// Given an integer binary operator, return the generic ISD::VECREDUCE_OP
-/// which corresponds to it.
-static unsigned getVecReduceOpcode(unsigned Opc) {
-  switch (Opc) {
-  default:
-    llvm_unreachable("Unhandled binary to transfrom reduction");
-  case ISD::ADD:
-    return ISD::VECREDUCE_ADD;
-  case ISD::UMAX:
-    return ISD::VECREDUCE_UMAX;
-  case ISD::SMAX:
-    return ISD::VECREDUCE_SMAX;
-  case ISD::UMIN:
-    return ISD::VECREDUCE_UMIN;
-  case ISD::SMIN:
-    return ISD::VECREDUCE_SMIN;
-  case ISD::AND:
-    return ISD::VECREDUCE_AND;
-  case ISD::OR:
-    return ISD::VECREDUCE_OR;
-  case ISD::XOR:
-    return ISD::VECREDUCE_XOR;
-  }
-}
-
 /// Perform two related transforms whose purpose is to incrementally recognize
 /// an explode_vector followed by scalar reduction as a vector reduction node.
 /// This exists to recover from a deficiency in SLP which can't handle
@@ -11155,15 +11130,8 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
 
   const SDLoc DL(N);
   const EVT VT = N->getValueType(0);
-
-  // TODO: Handle floating point here.
-  if (!VT.isInteger())
-    return SDValue();
-
-  const unsigned Opc = N->getOpcode();
-  const unsigned ReduceOpc = getVecReduceOpcode(Opc);
-  assert(Opc == ISD::getVecReduceBaseOpcode(ReduceOpc) &&
-         "Inconsistent mappings");
+  [[maybe_unused]] const unsigned Opc = N->getOpcode();
+  assert(Opc == ISD::ADD && "extend this to other reduction types");
   const SDValue LHS = N->getOperand(0);
   const SDValue RHS = N->getOperand(1);
 
@@ -11193,13 +11161,13 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
     EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, 2);
     SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec,
                               DAG.getVectorIdxConstant(0, DL));
-    return DAG.getNode(ReduceOpc, DL, VT, Vec);
+    return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, Vec);
   }
 
   // Match (binop (reduce (extract_subvector V, 0),
   //                      (extract_vector_elt V, sizeof(SubVec))))
   // into a reduction of one more element from the original vector V.
-  if (LHS.getOpcode() != ReduceOpc)
+  if (LHS.getOpcode() != ISD::VECREDUCE_ADD)
     return SDValue();
 
   SDValue ReduceVec = LHS.getOperand(0);
@@ -11215,7 +11183,7 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
       EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, Idx + 1);
       SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec,
                                 DAG.getVectorIdxConstant(0, DL));
-      return DAG.getNode(ReduceOpc, DL, VT, Vec);
+      return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, Vec);
     }
   }
 
@@ -11723,8 +11691,6 @@ static SDValue performANDCombine(SDNode *N,
 
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
-  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
-    return V;
 
   if (DCI.isAfterLegalizeDAG())
     if (SDValue V = combineDeMorganOfBoolean(N, DAG))
@@ -11777,8 +11743,6 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
 
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
-  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
-    return V;
 
   if (DCI.isAfterLegalizeDAG())
     if (SDValue V = combineDeMorganOfBoolean(N, DAG))
@@ -11830,9 +11794,6 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
 
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
-  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
-    return V;
-
   // fold (xor (select cond, 0, y), x) ->
   //      (select cond, x, (xor x, y))
   return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
@@ -14038,13 +13999,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::FMAXNUM:
-  case ISD::FMINNUM: {
-    if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
-      return V;
-    if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
-      return V;
-    return SDValue();
-  }
+  case ISD::FMINNUM:
+    return combineBinOpToReduce(N, DAG, Subtarget);
   case ISD::SETCC:
     return performSETCCCombine(N, DAG, Subtarget);
   case ISD::SIGN_EXTEND_INREG:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index f3570495600f3c3..ab137b1ac818299 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -5,10 +5,11 @@
 define i8 @explode_2xi8(<2 x i8> %v) {
 ; CHECK-LABEL: explode_2xi8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmv.x.s a1, v8
+; CHECK-NEXT:    xor a0, a0, a1
 ; CHECK-NEXT:    ret
   %e0 = extractelement <2 x i8> %v, i32 0
   %e1 = extractelement <2 x i8> %v, i32 1
@@ -20,16 +21,16 @@ define i8 @explode_4xi8(<4 x i8> %v) {
 ; CHECK-LABEL: explode_4xi8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
 ; CHECK-NEXT:    vmv.x.s a1, v9
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
-; CHECK-NEXT:    vmv.x.s a2, v8
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    vmv.x.s a2, v9
+; CHECK-NEXT:    vslidedown.vi v8, v8, 3
+; CHECK-NEXT:    vmv.x.s a3, v8
+; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    ret
   %e0 = extractelement <4 x i8> %v, i32 0
   %e1 = extractelement <4 x i8> %v, i32 1
@@ -46,28 +47,28 @@ define i8 @explode_8xi8(<8 x i8> %v) {
 ; CHECK-LABEL: explode_8xi8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
 ; CHECK-NEXT:    vmv.x.s a1, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 4
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
 ; CHECK-NEXT:    vmv.x.s a2, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 5
+; CHECK-NEXT:    vslidedown.vi v9, v8, 3
 ; CHECK-NEXT:    vmv.x.s a3, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 6
+; CHECK-NEXT:    vslidedown.vi v9, v8, 4
 ; CHECK-NEXT:    vmv.x.s a4, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 7
+; CHECK-NEXT:    vslidedown.vi v9, v8, 5
 ; CHECK-NEXT:    vmv.x.s a5, v9
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
-; CHECK-NEXT:    vmv.x.s a6, v8
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, a6, a0
+; CHECK-NEXT:    vslidedown.vi v9, v8, 6
+; CHECK-NEXT:    vmv.x.s a6, v9
+; CHECK-NEXT:    vslidedown.vi v8, v8, 7
+; CHECK-NEXT:    vmv.x.s a7, v8
+; CHECK-NEXT:    xor a0, a0, a1
 ; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a0, a0, a5
+; CHECK-NEXT:    add a4, a4, a5
+; CHECK-NEXT:    add a4, a4, a6
+; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    add a0, a0, a7
 ; CHECK-NEXT:    ret
   %e0 = extractelement <8 x i8> %v, i32 0
   %e1 = extractelement <8 x i8> %v, i32 1
@@ -88,56 +89,119 @@ define i8 @explode_8xi8(<8 x i8> %v) {
 }
 
 define i8 @explode_16xi8(<16 x i8> %v) {
-; CHECK-LABEL: explode_16xi8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
-; CHECK-NEXT:    vmv.x.s a1, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 4
-; CHECK-NEXT:    vmv.x.s a2, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 5
-; CHECK-NEXT:    vmv.x.s a3, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 6
-; CHECK-NEXT:    vmv.x.s a4, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 7
-; CHECK-NEXT:    vmv.x.s a5, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 8
-; CHECK-NEXT:    vmv.x.s a6, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 9
-; CHECK-NEXT:    vmv.x.s a7, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 10
-; CHECK-NEXT:    vmv.x.s t0, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 11
-; CHECK-NEXT:    vmv.x.s t1, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 12
-; CHECK-NEXT:    vmv.x.s t2, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 13
-; CHECK-NEXT:    vmv.x.s t3, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 14
-; CHECK-NEXT:    vmv.x.s t4, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 15
-; CHECK-NEXT:    vmv.x.s t5, v9
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
-; CHECK-NEXT:    vmv.x.s t6, v8
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, t6, a0
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, a5, a7
-; CHECK-NEXT:    add a5, a5, t0
-; CHECK-NEXT:    add a0, a0, a5
-; CHECK-NEXT:    add t1, t1, t2
-; CHECK-NEXT:    add t1, t1, t3
-; CHECK-NEXT:    add t1, t1, t4
-; CHECK-NEXT:    add t1, t1, t5
-; CHECK-NEXT:    add a0, a0, t1
-; CHECK-NEXT:    ret
+; RV32-LABEL: explode_16xi8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v9, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 8
+; RV32-NEXT:    vmv.x.s t0, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 9
+; RV32-NEXT:    vmv.x.s t1, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 10
+; RV32-NEXT:    vmv.x.s t2, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 11
+; RV32-NEXT:    vmv.x.s t3, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 12
+; RV32-NEXT:    vmv.x.s t4, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 13
+; RV32-NEXT:    vmv.x.s t5, v9
+; RV32-NEXT:    vslidedown.vi v9, v8, 14
+; RV32-NEXT:    vmv.x.s t6, v9
+; RV32-NEXT:    vslidedown.vi v8, v8, 15
+; RV32-NEXT:    vmv.x.s s0, v8
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a7, a7, t1
+; RV32-NEXT:    add a7, a7, t2
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add t3, t3, t4
+; RV32-NEXT:    add t3, t3, t5
+; RV32-NEXT:    add t3, t3, t6
+; RV32-NEXT:    add t3, t3, s0
+; RV32-NEXT:    add a0, a0, t3
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: explode_16xi8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset s0, -8
+; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v9, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 8
+; RV64-NEXT:    vmv.x.s t0, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 9
+; RV64-NEXT:    vmv.x.s t1, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 10
+; RV64-NEXT:    vmv.x.s t2, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 11
+; RV64-NEXT:    vmv.x.s t3, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 12
+; RV64-NEXT:    vmv.x.s t4, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 13
+; RV64-NEXT:    vmv.x.s t5, v9
+; RV64-NEXT:    vslidedown.vi v9, v8, 14
+; RV64-NEXT:    vmv.x.s t6, v9
+; RV64-NEXT:    vslidedown.vi v8, v8, 15
+; RV64-NEXT:    vmv.x.s s0, v8
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a7, a7, t0
+; RV64-NEXT:    add a7, a7, t1
+; RV64-NEXT:    add a7, a7, t2
+; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    add t3, t3, t4
+; RV64-NEXT:    add t3, t3, t5
+; RV64-NEXT:    add t3, t3, t6
+; RV64-NEXT:    add t3, t3, s0
+; RV64-NEXT:    add a0, a0, t3
+; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
   %e0 = extractelement <16 x i8> %v, i32 0
   %e1 = extractelement <16 x i8> %v, i32 1
   %e2 = extractelement <16 x i8> %v, i32 2
@@ -175,10 +239,11 @@ define i8 @explode_16xi8(<16 x i8> %v) {
 define i16 @explode_2xi16(<2 x i16> %v) {
 ; CHECK-LABEL: explode_2xi16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmv.x.s a1, v8
+; CHECK-NEXT:    xor a0, a0, a1
 ; CHECK-NEXT:    ret
   %e0 = extractelement <2 x i16> %v, i32 0
   %e1 = extractelement <2 x i16> %v, i32 1
@@ -190,16 +255,16 @@ define i16 @explode_4xi16(<4 x i16> %v) {
 ; CHECK-LABEL: explode_4xi16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
 ; CHECK-NEXT:    vmv.x.s a1, v9
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
-; CHECK-NEXT:    vmv.x.s a2, v8
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    vmv.x.s a2, v9
+; CHECK-NEXT:    vslidedown.vi v8, v8, 3
+; CHECK-NEXT:    vmv.x.s a3, v8
+; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    ret
   %e0 = extractelement <4 x i16> %v, i32 0
   %e1 = extractelement <4 x i16> %v, i32 1
@@ -216,28 +281,28 @@ define i16 @explode_8xi16(<8 x i16> %v) {
 ; CHECK-LABEL: explode_8xi16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vmv.x.s a0, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
 ; CHECK-NEXT:    vmv.x.s a1, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 4
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
 ; CHECK-NEXT:    vmv.x.s a2, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 5
+; CHECK-NEXT:    vslidedown.vi v9, v8, 3
 ; CHECK-NEXT:    vmv.x.s a3, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 6
+; CHECK-NEXT:    vslidedown.vi v9, v8, 4
 ; CHECK-NEXT:    vmv.x.s a4, v9
-; CHECK-NEXT:    vslidedown.vi v9, v8, 7
+; CHECK-NEXT:    vslidedown.vi v9, v8, 5
 ; CHECK-NEXT:    vmv.x.s a5, v9
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
-; CHECK-NEXT:    vmv.x.s a6, v8
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, a6, a0
+; CHECK-NEXT:    vslidedown.vi v9, v8, 6
+; CHECK-NEXT:    vmv.x.s a6, v9
+; CHECK-NEXT:    vslidedown.vi v8, v8, 7
+; CHECK-NEXT:    vmv.x.s a7, v8
+; CHECK-NEXT:    xor a0, a0, a1
 ; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a0, a0, a5
+; CHECK-NEXT:    add a4, a4, a5
+; CHECK-NEXT:    add a4, a4, a6
+; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    add a0, a0, a7
 ; CHECK-NEXT:    ret
   %e0 = extractelement <8 x i16> %v, i32 0
   %e1 = extractelement <8 x i16> %v, i32 1
@@ -258,57 +323,121 @@ define i16 @explode_8xi16(<8 x i16> %v) {
 }
 
 define i16 @explode_16xi16(<16 x i16> %v) {
-; CHECK-LABEL: explode_16xi16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
-; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
-; CHECK-NEXT:    vmv.x.s a2, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 5
-; CHECK-NEXT:    vmv.x.s a3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
-; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 7
-; CHECK-NEXT:    vmv.x.s a5, v10
-; CHECK-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 8
-; CHECK-NEXT:    vmv.x.s a6, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 9
-; CHECK-NEXT:    vmv.x.s a7, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 10
-; CHECK-NEXT:    vmv.x.s t0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 11
-; CHECK-NEXT:    vmv.x.s t1, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 12
-; CHECK-NEXT:    vmv.x.s t2, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 13
-; CHECK-NEXT:    vmv.x.s t3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 14
-; CHECK-NEXT:    vmv.x.s t4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 15
-; CHECK-NEXT:    vmv.x.s t5, v10
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
-; CHECK-NEXT:    vmv.x.s t6, v8
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, t6, a0
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, a5, a7
-; CHECK-NEXT:    add a5, a5, t0
-; CHECK-NEXT:    add a0, a0, a5
-; CHECK-NEXT:    add t1, t1, t2
-; CHECK-NEXT:    add t1, t1, t3
-; CHECK-NEXT:    add t1, t1, t4
-; CHECK-NEXT:    add t1, t1, t5
-; CHECK-NEXT:    add a0, a0, t1
-; CHECK-NEXT:    ret
+; RV32-LABEL: explode_16xi16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 4
+; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v10
+; RV32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 8
+; RV32-NEXT:    vmv.x.s t0, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 9
+; RV32-NEXT:    vmv.x.s t1, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 10
+; RV32-NEXT:    vmv.x.s t2, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 11
+; RV32-NEXT:    vmv.x.s t3, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 12
+; RV32-NEXT:    vmv.x.s t4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 13
+; RV32-NEXT:    vmv.x.s t5, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 14
+; RV32-NEXT:    vmv.x.s t6, v10
+; RV32-NEXT:    vslidedown.vi v8, v8, 15
+; RV32-NEXT:    vmv.x.s s0, v8
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a7, a7, t1
+; RV32-NEXT:    add a7, a7, t2
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add t3, t3, t4
+; RV32-NEXT:    add t3, t3, t5
+; RV32-NEXT:    add t3, t3, t6
+; RV32-NEXT:    add t3, t3, s0
+; RV32-NEXT:    add a0, a0, t3
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: explode_16xi16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset s0, -8
+; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-NEXT:    vmv.x.s a4, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v10
+; RV64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 8
+; RV64-NEXT:    vmv.x.s t0, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 9
+; RV64-NEXT:    vmv.x.s t1, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 10
+; RV64-NEXT:    vmv.x.s t2, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 11
+; RV64-NEXT:    vmv.x.s t3, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 12
+; RV64-NEXT:    vmv.x.s t4, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 13
+; RV64-NEXT:    vmv.x.s t5, v10
+; RV64-NEXT:    vslidedown.vi v10, v8, 14
+; RV64-NEXT:    vmv.x.s t6, v10
+; RV64-NEXT:    vslidedown.vi v8, v8, 15
+; RV64-NEXT:    vmv.x.s s0, v8
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a7, a7, t0
+; RV64-NEXT:    add a7, a7, t1
+; RV64-NEXT:    add a7, a7, t2
+; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    add t3, t3, t4
+; RV64-NEXT:    add t3, t3, t5
+; RV64-NEXT:    add t3, t3, t6
+; RV64-NEXT:    add t3, t3, s0
+; RV64-NEXT:    add a0, a0, t3
+; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    ret
   %e0 = extractelement <16 x i16> %v, i32 0
   %e1 = extractelement <16 x i16> %v, i32 1
   %e2 = extractelement <16 x i16> %v, i32 2
@@ -346,10 +475,11 @@ define i16 @explode_16xi16(<16 x i16> %v) {
 define i32 @explode_2xi32(<2 x i32> %v) {
 ; CHECK-LABEL: explode_2xi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vmv.x.s a1, v8
+; CHECK-NEXT:    xor a0, a0, a1
 ; CHECK-NEXT:    ret
   %e0 = extractelement <2 x i32> %v, i32 0
   %e1 = extractelement <2 x i32> %v, i32 1
@@ -361,31 +491,31 @@ define i32 @explode_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: explode_4xi32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 3
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v9, v8, 1
 ; RV32-NEXT:    vmv.x.s a1, v9
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v9
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    vmv.x.s a3, v8
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_4xi32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    vslidedown.vi v9, v8, 3
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64-NEXT:    vmv.x.s a1, v9
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
-; RV64-NEXT:    vmv.x.s a2, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    addw a0, a2, a0
+; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vmv.x.s a2, v9
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v8
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addw a0, a0, a2
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
@@ -402,57 +532,57 @@ define i32 @explode_8xi32(<8 x i32> %v) {
 ; RV32-LABEL: explode_8xi32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    vmv.x.s a2, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vmv.x.s a3, v10
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 7
+; RV32-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32-NEXT:    vmv.x.s a5, v10
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
-; RV32-NEXT:    vmv.x.s a6, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a6, a0
+; RV32-NEXT:    vslidedown.vi v10, v8, 6
+; RV32-NEXT:    vmv.x.s a6, v10
+; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    vmv.x.s a7, v8
+; RV32-NEXT:    xor a0, a0, a1
 ; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a2, a2, a4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a0, a0, a7
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_8xi32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v10
+; RV64-NEXT:    vmv.x.s a2, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v10
+; RV64-NEXT:    vmv.x.s a3, v10
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vmv.x.s a2, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vmv.x.s a3, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 6
 ; RV64-NEXT:    vmv.x.s a4, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 7
+; RV64-NEXT:    vslidedown.vi v10, v8, 5
 ; RV64-NEXT:    vmv.x.s a5, v10
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
-; RV64-NEXT:    vmv.x.s a6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, a6, a0
+; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    vmv.x.s a6, v10
+; RV64-NEXT:    vslidedown.vi v8, v8, 7
+; RV64-NEXT:    vmv.x.s a7, v8
+; RV64-NEXT:    xor a0, a0, a1
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a2, a2, a4
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    addw a0, a0, a5
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    addw a0, a0, a7
 ; RV64-NEXT:    ret
   %e0 = extractelement <8 x i32> %v, i32 0
   %e1 = extractelement <8 x i32> %v, i32 1
@@ -479,57 +609,60 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV32-NEXT:    .cfi_def_cfa_offset 128
 ; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 116(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s2, -12
 ; RV32-NEXT:    addi s0, sp, 128
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vmv.x.s a1, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vmv.x.s a0, v12
+; RV32-NEXT:    vmv.x.s a2, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vmv.x.s a3, v12
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vmv.x.s a4, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vmv.x.s a5, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vmv.x.s a6, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 7
-; RV32-NEXT:    vmv.x.s a5, v12
-; RV32-NEXT:    mv a6, sp
+; RV32-NEXT:    vmv.x.s a7, v12
+; RV32-NEXT:    mv t0, sp
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (a6)
-; RV32-NEXT:    lw a6, 32(sp)
-; RV32-NEXT:    lw a7, 36(sp)
-; RV32-NEXT:    lw t0, 40(sp)
-; RV32-NEXT:    lw t1, 44(sp)
-; RV32-NEXT:    lw t2, 48(sp)
-; RV32-NEXT:    lw t3, 52(sp)
-; RV32-NEXT:    lw t4, 56(sp)
-; RV32-NEXT:    lw t5, 60(sp)
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
-; RV32-NEXT:    vmv.x.s t6, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, t6, a0
+; RV32-NEXT:    vse32.v v8, (t0)
+; RV32-NEXT:    lw t0, 32(sp)
+; RV32-NEXT:    lw t1, 36(sp)
+; RV32-NEXT:    lw t2, 40(sp)
+; RV32-NEXT:    lw t3, 44(sp)
+; RV32-NEXT:    lw t4, 48(sp)
+; RV32-NEXT:    lw t5, 52(sp)
+; RV32-NEXT:    lw t6, 56(sp)
+; RV32-NEXT:    lw s2, 60(sp)
+; RV32-NEXT:    xor a0, a0, a1
 ; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a2, a2, a4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a0, a0, a4
 ; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a7, a7, t1
 ; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    add t2, t2, t3
-; RV32-NEXT:    add t2, t2, t4
-; RV32-NEXT:    add t2, t2, t5
-; RV32-NEXT:    add a0, a0, t2
+; RV32-NEXT:    add t1, t1, t2
+; RV32-NEXT:    add t1, t1, t3
+; RV32-NEXT:    add a0, a0, t1
+; RV32-NEXT:    add t4, t4, t5
+; RV32-NEXT:    add t4, t4, t6
+; RV32-NEXT:    add t4, t4, s2
+; RV32-NEXT:    add a0, a0, t4
 ; RV32-NEXT:    addi sp, s0, -128
 ; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 116(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 128
 ; RV32-NEXT:    ret
 ;
@@ -539,57 +672,60 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV64-NEXT:    .cfi_def_cfa_offset 128
 ; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 104(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s2, -24
 ; RV64-NEXT:    addi s0, sp, 128
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v12
+; RV64-NEXT:    vmv.x.s a2, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v12
+; RV64-NEXT:    vmv.x.s a3, v12
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 4
-; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vmv.x.s a4, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vmv.x.s a5, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 6
-; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vmv.x.s a6, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 7
-; RV64-NEXT:    vmv.x.s a5, v12
-; RV64-NEXT:    mv a6, sp
+; RV64-NEXT:    vmv.x.s a7, v12
+; RV64-NEXT:    mv t0, sp
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v8, (a6)
-; RV64-NEXT:    lw a6, 32(sp)
-; RV64-NEXT:    lw a7, 36(sp)
-; RV64-NEXT:    lw t0, 40(sp)
-; RV64-NEXT:    lw t1, 44(sp)
-; RV64-NEXT:    lw t2, 48(sp)
-; RV64-NEXT:    lw t3, 52(sp)
-; RV64-NEXT:    lw t4, 56(sp)
-; RV64-NEXT:    lw t5, 60(sp)
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
-; RV64-NEXT:    vmv.x.s t6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, t6, a0
+; RV64-NEXT:    vse32.v v8, (t0)
+; RV64-NEXT:    lw t0, 32(sp)
+; RV64-NEXT:    lw t1, 36(sp)
+; RV64-NEXT:    lw t2, 40(sp)
+; RV64-NEXT:    lw t3, 44(sp)
+; RV64-NEXT:    lw t4, 48(sp)
+; RV64-NEXT:    lw t5, 52(sp)
+; RV64-NEXT:    lw t6, 56(sp)
+; RV64-NEXT:    lw s2, 60(sp)
+; RV64-NEXT:    xor a0, a0, a1
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a2, a2, a4
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a0, a0, a5
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a0, a0, a4
 ; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a7, a7, t1
 ; RV64-NEXT:    add a0, a0, a7
-; RV64-NEXT:    add t2, t2, t3
-; RV64-NEXT:    add t2, t2, t4
-; RV64-NEXT:    add t2, t2, t5
-; RV64-NEXT:    addw a0, a0, t2
+; RV64-NEXT:    add t1, t1, t2
+; RV64-NEXT:    add t1, t1, t3
+; RV64-NEXT:    add a0, a0, t1
+; RV64-NEXT:    add t4, t4, t5
+; RV64-NEXT:    add t4, t4, t6
+; RV64-NEXT:    add t4, t4, s2
+; RV64-NEXT:    addw a0, a0, t4
 ; RV64-NEXT:    addi sp, s0, -128
 ; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 104(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 128
 ; RV64-NEXT:    ret
   %e0 = extractelement <16 x i32> %v, i32 0
@@ -629,22 +765,26 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 define i64 @explode_2xi64(<2 x i64> %v) {
 ; RV32-LABEL: explode_2xi64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vredxor.vs v8, v8, v9
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a1
-; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    vsrl.vx v9, v8, a0
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
+; RV32-NEXT:    vsrl.vx v9, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vmv.x.s a3, v8
+; RV32-NEXT:    xor a1, a1, a0
+; RV32-NEXT:    xor a0, a2, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_2xi64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vredxor.vs v8, v8, v9
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v8
+; RV64-NEXT:    xor a0, a0, a1
 ; RV64-NEXT:    ret
   %e0 = extractelement <2 x i64> %v, i32 0
   %e1 = extractelement <2 x i64> %v, i32 1
@@ -655,46 +795,49 @@ define i64 @explode_2xi64(<2 x i64> %v) {
 define i64 @explode_4xi64(<4 x i64> %v) {
 ; RV32-LABEL: explode_4xi64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v12, v10, a0
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vx v10, v8, a0
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32-NEXT:    vsrl.vx v12, v10, a0
 ; RV32-NEXT:    vmv.x.s a3, v12
 ; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    vmv.x.s a5, v8
-; RV32-NEXT:    add a2, a5, a2
-; RV32-NEXT:    sltu a5, a2, a5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, a5
-; RV32-NEXT:    add a1, a0, a3
-; RV32-NEXT:    add a0, a2, a4
-; RV32-NEXT:    sltu a2, a0, a2
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vsrl.vx v12, v10, a0
+; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    vmv.x.s a6, v10
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    vsrl.vx v10, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v10
+; RV32-NEXT:    vmv.x.s a7, v8
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    xor a2, a2, a4
+; RV32-NEXT:    add a6, a2, a6
+; RV32-NEXT:    sltu a2, a6, a2
+; RV32-NEXT:    add a1, a1, a5
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, a1, a0
+; RV32-NEXT:    add a0, a6, a7
+; RV32-NEXT:    sltu a2, a0, a6
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_4xi64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v10
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
-; RV64-NEXT:    vmv.x.s a2, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, a2, a0
+; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    vmv.x.s a3, v8
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a0, a0, a2
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i64> %v, i32 0
   %e1 = extractelement <4 x i64> %v, i32 1
@@ -710,63 +853,71 @@ define i64 @explode_4xi64(<4 x i64> %v) {
 define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV32-LABEL: explode_8xi64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s a1, v16
-; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vx v12, v8, a0
+; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s a3, v16
 ; RV32-NEXT:    vmv.x.s a4, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s a5, v16
 ; RV32-NEXT:    vmv.x.s a6, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s a7, v16
 ; RV32-NEXT:    vmv.x.s t0, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 6
+; RV32-NEXT:    vslidedown.vi v12, v8, 4
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s t1, v16
 ; RV32-NEXT:    vmv.x.s t2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 7
+; RV32-NEXT:    vslidedown.vi v12, v8, 5
 ; RV32-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-NEXT:    vmv.x.s t3, v16
 ; RV32-NEXT:    vmv.x.s t4, v12
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    vmv.x.s t5, v8
-; RV32-NEXT:    add a2, t5, a2
-; RV32-NEXT:    sltu t5, a2, t5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, t5
-; RV32-NEXT:    add a0, a0, a3
-; RV32-NEXT:    add a4, a2, a4
-; RV32-NEXT:    sltu a1, a4, a2
+; RV32-NEXT:    vslidedown.vi v12, v8, 6
+; RV32-NEXT:    vsrl.vx v16, v12, a0
+; RV32-NEXT:    vmv.x.s t5, v16
+; RV32-NEXT:    vmv.x.s t6, v12
+; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    vsrl.vx v12, v8, a0
+; RV32-NEXT:    vmv.x.s a0, v12
+; RV32-NEXT:    vmv.x.s s0, v8
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    xor a2, a2, a4
+; RV32-NEXT:    add a6, a2, a6
+; RV32-NEXT:    sltu a2, a6, a2
 ; RV32-NEXT:    add a1, a1, a5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a6, a4, a6
-; RV32-NEXT:    sltu a1, a6, a4
+; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add a1, a1, a7
-; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add t0, a6, t0
-; RV32-NEXT:    sltu a1, t0, a6
-; RV32-NEXT:    add a1, a1, t1
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sltu a2, t0, a6
+; RV32-NEXT:    add a2, a2, t1
+; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add t2, t0, t2
-; RV32-NEXT:    sltu a1, t2, t0
-; RV32-NEXT:    add a1, a1, t3
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    add a0, t2, t4
-; RV32-NEXT:    sltu a2, a0, t2
+; RV32-NEXT:    sltu a2, t2, t0
+; RV32-NEXT:    add a2, a2, t3
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add t4, t2, t4
+; RV32-NEXT:    sltu a2, t4, t2
+; RV32-NEXT:    add a2, a2, t5
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add t6, t4, t6
+; RV32-NEXT:    sltu a2, t6, t4
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    add a1, a1, a0
+; RV32-NEXT:    add a0, t6, s0
+; RV32-NEXT:    sltu a2, a0, t6
 ; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_8xi64:
@@ -780,28 +931,29 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV64-NEXT:    addi s0, sp, 128
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v12
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v12
+; RV64-NEXT:    vmv.x.s a2, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    mv a4, sp
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    ld a2, 32(sp)
-; RV64-NEXT:    ld a3, 40(sp)
-; RV64-NEXT:    ld a4, 48(sp)
-; RV64-NEXT:    ld a5, 56(sp)
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
-; RV64-NEXT:    vmv.x.s a6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, a6, a0
+; RV64-NEXT:    vse64.v v8, (a4)
+; RV64-NEXT:    ld a4, 32(sp)
+; RV64-NEXT:    ld a5, 40(sp)
+; RV64-NEXT:    ld a6, 48(sp)
+; RV64-NEXT:    ld a7, 56(sp)
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    add a5, a5, a6
 ; RV64-NEXT:    add a0, a0, a5
+; RV64-NEXT:    add a0, a0, a7
 ; RV64-NEXT:    addi sp, s0, -128
 ; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
@@ -856,130 +1008,130 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    .cfi_offset s9, -44
 ; RV32-NEXT:    .cfi_offset s10, -48
 ; RV32-NEXT:    .cfi_offset s11, -52
-; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v16, v8, 2
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s t6, v24
+; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v16, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v16
 ; RV32-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    vslidedown.vi v16, v8, 3
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s a1, v24
-; RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 4
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s0, v24
-; RV32-NEXT:    vmv.x.s a3, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 5
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    vslidedown.vi v16, v8, 1
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s1, v24
+; RV32-NEXT:    vmv.x.s a3, v24
 ; RV32-NEXT:    vmv.x.s a4, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 6
+; RV32-NEXT:    vslidedown.vi v16, v8, 2
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s2, v24
-; RV32-NEXT:    vmv.x.s a5, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 7
+; RV32-NEXT:    vmv.x.s a5, v24
+; RV32-NEXT:    vmv.x.s a6, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 3
+; RV32-NEXT:    vsrl.vx v24, v16, a0
+; RV32-NEXT:    vmv.x.s a7, v24
+; RV32-NEXT:    vmv.x.s t0, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 4
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s3, v24
-; RV32-NEXT:    vmv.x.s a6, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 8
+; RV32-NEXT:    vmv.x.s t1, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 5
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s4, v24
-; RV32-NEXT:    vmv.x.s a7, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 9
+; RV32-NEXT:    vmv.x.s t2, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 6
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s5, v24
-; RV32-NEXT:    vmv.x.s t0, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 10
+; RV32-NEXT:    vmv.x.s t3, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 7
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s6, v24
-; RV32-NEXT:    vmv.x.s t1, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 11
+; RV32-NEXT:    vmv.x.s t4, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 8
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s7, v24
-; RV32-NEXT:    vmv.x.s t2, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 12
+; RV32-NEXT:    vmv.x.s t5, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 9
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s8, v24
-; RV32-NEXT:    vmv.x.s t3, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 13
+; RV32-NEXT:    vmv.x.s t6, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 10
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s9, v24
-; RV32-NEXT:    vmv.x.s t4, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 14
+; RV32-NEXT:    vmv.x.s s0, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 11
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s10, v24
-; RV32-NEXT:    vmv.x.s t5, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 15
+; RV32-NEXT:    vmv.x.s s1, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 12
 ; RV32-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-NEXT:    vmv.x.s s11, v24
-; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vmv.x.s s2, v16
+; RV32-NEXT:    vslidedown.vi v24, v8, 13
+; RV32-NEXT:    vsrl.vx v16, v24, a0
 ; RV32-NEXT:    vmv.x.s ra, v16
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    add a1, a0, t6
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    lw t6, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    add t6, a0, t6
-; RV32-NEXT:    sltu a0, t6, a0
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a2, t6, a2
-; RV32-NEXT:    sltu a1, a2, t6
-; RV32-NEXT:    add a1, a1, s0
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a3, a2, a3
-; RV32-NEXT:    sltu a1, a3, a2
-; RV32-NEXT:    add a1, a1, s1
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a4, a3, a4
-; RV32-NEXT:    sltu a1, a4, a3
-; RV32-NEXT:    add a1, a1, s2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a5, a4, a5
-; RV32-NEXT:    sltu a1, a5, a4
-; RV32-NEXT:    add a1, a1, s3
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a6, a5, a6
-; RV32-NEXT:    sltu a1, a6, a5
-; RV32-NEXT:    add a1, a1, s4
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a7, a6, a7
-; RV32-NEXT:    sltu a1, a7, a6
-; RV32-NEXT:    add a1, a1, s5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t0, a7, t0
-; RV32-NEXT:    sltu a1, t0, a7
-; RV32-NEXT:    add a1, a1, s6
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    vslidedown.vi v16, v8, 14
+; RV32-NEXT:    vsrl.vx v0, v16, a0
+; RV32-NEXT:    vslidedown.vi v8, v8, 15
+; RV32-NEXT:    vmv.x.s a1, v24
+; RV32-NEXT:    vsrl.vx v24, v8, a0
+; RV32-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    xor a0, a0, a3
+; RV32-NEXT:    xor a2, a2, a4
+; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add a6, a2, a6
+; RV32-NEXT:    sltu a2, a6, a2
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    add t0, a6, t0
+; RV32-NEXT:    sltu a2, t0, a6
+; RV32-NEXT:    add a2, a2, s3
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    add t1, t0, t1
-; RV32-NEXT:    sltu a1, t1, t0
-; RV32-NEXT:    add a1, a1, s7
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sltu a2, t1, t0
+; RV32-NEXT:    add a2, a2, s4
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    add t2, t1, t2
-; RV32-NEXT:    sltu a1, t2, t1
-; RV32-NEXT:    add a1, a1, s8
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sltu a2, t2, t1
+; RV32-NEXT:    add a2, a2, s5
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    add t3, t2, t3
-; RV32-NEXT:    sltu a1, t3, t2
-; RV32-NEXT:    add a1, a1, s9
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sltu a2, t3, t2
+; RV32-NEXT:    add a2, a2, s6
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    add t4, t3, t4
-; RV32-NEXT:    sltu a1, t4, t3
-; RV32-NEXT:    add a1, a1, s10
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sltu a2, t4, t3
+; RV32-NEXT:    add a2, a2, s7
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    add t5, t4, t5
-; RV32-NEXT:    sltu a1, t5, t4
-; RV32-NEXT:    add a1, a1, s11
+; RV32-NEXT:    sltu a2, t5, t4
+; RV32-NEXT:    add a2, a2, s8
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add t6, t5, t6
+; RV32-NEXT:    sltu a2, t6, t5
+; RV32-NEXT:    add a2, a2, s9
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add s0, t6, s0
+; RV32-NEXT:    sltu a2, s0, t6
+; RV32-NEXT:    add a2, a2, s10
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add s1, s0, s1
+; RV32-NEXT:    sltu a2, s1, s0
+; RV32-NEXT:    add a2, a2, s11
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add s2, s1, s2
+; RV32-NEXT:    sltu a2, s2, s1
+; RV32-NEXT:    add a2, a2, ra
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vmv.x.s a2, v0
+; RV32-NEXT:    add a1, s2, a1
+; RV32-NEXT:    sltu a3, a1, s2
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    vmv.x.s a3, v16
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vmv.x.s a2, v24
+; RV32-NEXT:    add a3, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
+; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    add a0, t5, ra
-; RV32-NEXT:    sltu a2, a0, t5
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    add a0, a3, a0
+; RV32-NEXT:    sltu a2, a0, a3
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
@@ -1003,52 +1155,56 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV64-NEXT:    .cfi_def_cfa_offset 256
 ; RV64-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 232(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s2, -24
 ; RV64-NEXT:    addi s0, sp, 256
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslidedown.vi v16, v8, 1
+; RV64-NEXT:    vmv.x.s a1, v16
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v16, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v16
+; RV64-NEXT:    vmv.x.s a2, v16
 ; RV64-NEXT:    vslidedown.vi v16, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v16
-; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    vmv.x.s a3, v16
+; RV64-NEXT:    mv a4, sp
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    ld a2, 32(sp)
-; RV64-NEXT:    ld a3, 40(sp)
-; RV64-NEXT:    ld a4, 48(sp)
-; RV64-NEXT:    ld a5, 56(sp)
-; RV64-NEXT:    ld a6, 64(sp)
-; RV64-NEXT:    ld a7, 72(sp)
-; RV64-NEXT:    ld t0, 80(sp)
-; RV64-NEXT:    ld t1, 88(sp)
-; RV64-NEXT:    ld t2, 96(sp)
-; RV64-NEXT:    ld t3, 104(sp)
-; RV64-NEXT:    ld t4, 112(sp)
-; RV64-NEXT:    ld t5, 120(sp)
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
-; RV64-NEXT:    vmv.x.s t6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, t6, a0
+; RV64-NEXT:    vse64.v v8, (a4)
+; RV64-NEXT:    ld a4, 32(sp)
+; RV64-NEXT:    ld a5, 40(sp)
+; RV64-NEXT:    ld a6, 48(sp)
+; RV64-NEXT:    ld a7, 56(sp)
+; RV64-NEXT:    ld t0, 64(sp)
+; RV64-NEXT:    ld t1, 72(sp)
+; RV64-NEXT:    ld t2, 80(sp)
+; RV64-NEXT:    ld t3, 88(sp)
+; RV64-NEXT:    ld t4, 96(sp)
+; RV64-NEXT:    ld t5, 104(sp)
+; RV64-NEXT:    ld t6, 112(sp)
+; RV64-NEXT:    ld s2, 120(sp)
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a0, a0, a4
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, a5, a7
 ; RV64-NEXT:    add a0, a0, a5
-; RV64-NEXT:    add t0, t0, t1
-; RV64-NEXT:    add t0, t0, t2
-; RV64-NEXT:    add t0, t0, t3
-; RV64-NEXT:    add a0, a0, t0
-; RV64-NEXT:    add t4, t4, t5
-; RV64-NEXT:    add a0, a0, t4
+; RV64-NEXT:    add a7, a7, t0
+; RV64-NEXT:    add a7, a7, t1
+; RV64-NEXT:    add a0, a0, a7
+; RV64-NEXT:    add t2, t2, t3
+; RV64-NEXT:    add t2, t2, t4
+; RV64-NEXT:    add t2, t2, t5
+; RV64-NEXT:    add a0, a0, t2
+; RV64-NEXT:    add t6, t6, s2
+; RV64-NEXT:    add a0, a0, t6
 ; RV64-NEXT:    addi sp, s0, -256
 ; RV64-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 232(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 256
 ; RV64-NEXT:    ret
   %e0 = extractelement <16 x i64> %v, i32 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index 8c96392f08a5dbe..173b70def03d4c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s
 
 define i32 @reduce_sum_2xi32(<2 x i32> %v) {
 ; CHECK-LABEL: reduce_sum_2xi32:
@@ -448,336 +448,3 @@ define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
   %add13 = add i32 %add12, %e14
   ret i32 %add13
 }
-
-;; Most of the cornercases are exercised above, the following just
-;; makes sure that other opcodes work as expected.
-
-define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_xor_16xi32_prefix2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v9, zero
-; CHECK-NEXT:    vredxor.vs v8, v8, v9
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %xor0 = xor i32 %e0, %e1
-  ret i32 %xor0
-}
-
-define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_xor_16xi32_prefix5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 224
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vmv.v.i v8, -1
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsext.vf4 v12, v8
-; CHECK-NEXT:    vand.vv v8, v10, v12
-; CHECK-NEXT:    vmv.s.x v10, zero
-; CHECK-NEXT:    vredxor.vs v8, v8, v10
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %e2 = extractelement <16 x i32> %v, i32 2
-  %e3 = extractelement <16 x i32> %v, i32 3
-  %e4 = extractelement <16 x i32> %v, i32 4
-  %xor0 = xor i32 %e0, %e1
-  %xor1 = xor i32 %xor0, %e2
-  %xor2 = xor i32 %xor1, %e3
-  %xor3 = xor i32 %xor2, %e4
-  ret i32 %xor3
-}
-
-define i32 @reduce_and_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_and_16xi32_prefix2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vredand.vs v8, v8, v8
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %and0 = and i32 %e0, %e1
-  ret i32 %and0
-}
-
-define i32 @reduce_and_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_and_16xi32_prefix5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, -1
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 5
-; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 6
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 7
-; CHECK-NEXT:    vredand.vs v8, v10, v10
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %e2 = extractelement <16 x i32> %v, i32 2
-  %e3 = extractelement <16 x i32> %v, i32 3
-  %e4 = extractelement <16 x i32> %v, i32 4
-  %and0 = and i32 %e0, %e1
-  %and1 = and i32 %and0, %e2
-  %and2 = and i32 %and1, %e3
-  %and3 = and i32 %and2, %e4
-  ret i32 %and3
-}
-
-define i32 @reduce_or_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_or_16xi32_prefix2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vredor.vs v8, v8, v8
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %or0 = or i32 %e0, %e1
-  ret i32 %or0
-}
-
-define i32 @reduce_or_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_or_16xi32_prefix5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 224
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vmv.v.i v8, -1
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsext.vf4 v12, v8
-; CHECK-NEXT:    vand.vv v8, v10, v12
-; CHECK-NEXT:    vredor.vs v8, v8, v8
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %e2 = extractelement <16 x i32> %v, i32 2
-  %e3 = extractelement <16 x i32> %v, i32 3
-  %e4 = extractelement <16 x i32> %v, i32 4
-  %or0 = or i32 %e0, %e1
-  %or1 = or i32 %or0, %e2
-  %or2 = or i32 %or1, %e3
-  %or3 = or i32 %or2, %e4
-  ret i32 %or3
-}
-
-declare i32 @llvm.smax.i32(i32 %a, i32 %b)
-declare i32 @llvm.smin.i32(i32 %a, i32 %b)
-declare i32 @llvm.umax.i32(i32 %a, i32 %b)
-declare i32 @llvm.umin.i32(i32 %a, i32 %b)
-
-define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_smax_16xi32_prefix2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vredmax.vs v8, v8, v8
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
-  ret i32 %smax0
-}
-
-define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_smax_16xi32_prefix5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, 524288
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vmv.s.x v10, a1
-; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 5
-; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 6
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 7
-; CHECK-NEXT:    vredmax.vs v8, v8, v8
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %e2 = extractelement <16 x i32> %v, i32 2
-  %e3 = extractelement <16 x i32> %v, i32 3
-  %e4 = extractelement <16 x i32> %v, i32 4
-  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
-  %smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
-  %smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
-  %smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
-  ret i32 %smax3
-}
-
-define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_smin_16xi32_prefix2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vredmin.vs v8, v8, v8
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
-  ret i32 %smin0
-}
-
-define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
-; RV32-LABEL: reduce_smin_16xi32_prefix5:
-; RV32:       # %bb.0:
-; RV32-NEXT:    lui a1, 524288
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vmv.s.x v10, a1
-; RV32-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v8, v10, 5
-; RV32-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v8, v10, 6
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vslideup.vi v8, v10, 7
-; RV32-NEXT:    vredmin.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: reduce_smin_16xi32_prefix5:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 524288
-; RV64-NEXT:    addiw a1, a1, -1
-; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    vmv.s.x v10, a1
-; RV64-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v8, v10, 5
-; RV64-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v8, v10, 6
-; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT:    vslideup.vi v8, v10, 7
-; RV64-NEXT:    vredmin.vs v8, v8, v8
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %e2 = extractelement <16 x i32> %v, i32 2
-  %e3 = extractelement <16 x i32> %v, i32 3
-  %e4 = extractelement <16 x i32> %v, i32 4
-  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
-  %smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
-  %smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
-  %smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
-  ret i32 %smin3
-}
-
-define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_umax_16xi32_prefix2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
-  ret i32 %umax0
-}
-
-define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_umax_16xi32_prefix5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a1, 224
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vmv.v.i v8, -1
-; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsext.vf4 v12, v8
-; CHECK-NEXT:    vand.vv v8, v10, v12
-; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %e2 = extractelement <16 x i32> %v, i32 2
-  %e3 = extractelement <16 x i32> %v, i32 3
-  %e4 = extractelement <16 x i32> %v, i32 4
-  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
-  %umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
-  %umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
-  %umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
-  ret i32 %umax3
-}
-
-define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_umin_16xi32_prefix2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vredminu.vs v8, v8, v8
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
-  ret i32 %umin0
-}
-
-define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_umin_16xi32_prefix5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, -1
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 5
-; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 6
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 7
-; CHECK-NEXT:    vredminu.vs v8, v10, v10
-; CHECK-NEXT:    vmv.x.s a0, v8
-; CHECK-NEXT:    ret
-  %v = load <16 x i32>, ptr %p, align 256
-  %e0 = extractelement <16 x i32> %v, i32 0
-  %e1 = extractelement <16 x i32> %v, i32 1
-  %e2 = extractelement <16 x i32> %v, i32 2
-  %e3 = extractelement <16 x i32> %v, i32 3
-  %e4 = extractelement <16 x i32> %v, i32 4
-  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
-  %umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
-  %umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
-  %umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
-  ret i32 %umin3
-}