[llvm] 824251c - Revert "[RISCV] Generaize reduction tree matching to all integer reductions (#68014)"

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 4 10:43:00 PDT 2023


Fixed in 199cbec9 and reapplied in 45a334d.

Philip

On 10/4/23 08:54, Alex Bradbury wrote:
> You're welcome! I left a comment on reproducers here
> <https://github.com/llvm/llvm-project/pull/68014#issuecomment-1746782691>.
>
> On 2023-10-04 15:47, Philip Reames wrote:
>> Thanks for the revert.
>>
>> Given this change was expanding an existing transform which had the same problem, we need to either revert the prior change as well, or fix forward quickly.  I'll revert the prior change by mid-morning if the gating isn't obvious.
>>
>> Philip
>>
>> On 10/4/23 04:51, Alex Bradbury via llvm-commits wrote:
>>> Author: Alex Bradbury
>>> Date: 2023-10-04T12:51:01+01:00
>>> New Revision: 824251c9b349d859a9169196cd9533c619a715ce
>>>
>>> URL: https://github.com/llvm/llvm-project/commit/824251c9b349d859a9169196cd9533c619a715ce
>>> DIFF: https://github.com/llvm/llvm-project/commit/824251c9b349d859a9169196cd9533c619a715ce.diff
>>>
>>> LOG: Revert "[RISCV] Generaize reduction tree matching to all integer reductions (#68014)"
>>>
>>> This reverts commit 7a0b9daac9edde4293d2e9fdf30d8b35c04d16a6 and
>>> 63bbc250440141b1c51593904fba9bdaa6724280.
>>>
>>> I'm seeing issues (e.g. on the GCC torture suite) where
>>> combineBinOpOfExtractToReduceTree is called when the V extensions aren't
>>> enabled and triggers a crash due to RISCVSubtarget::getElen asserting.
>>>
>>> I'll aim to follow up with a minimal reproducer. Although it's pretty
>>> obvious how to avoid this crash with some extra gating, there are a few
>>> options as to where that should be inserted so I think it's best to
>>> revert and agree the appropriate fix separately.
>>>
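For reference, one of the gating options alluded to above would be an early bail-out at the top of combineBinOpOfExtractToReduceTree itself, so the combine never reaches RISCVSubtarget::getElen on subtargets without vector support. A minimal sketch of that option only (assuming RISCVSubtarget::hasVInstructions() as the predicate; the fix that actually landed may gate this differently or elsewhere):

    static SDValue
    combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
                                      const RISCVSubtarget &Subtarget) {
      // Sketch only: without vector instructions, getElen asserts, so give up
      // before any of the reduction-tree matching below can run.
      if (!Subtarget.hasVInstructions())
        return SDValue();

      // ... existing matching logic unchanged ...
      return SDValue();
    }
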
>>> Added:
>>>
>>> Modified:
>>>     llvm/lib/Target/RISCV/RISCVISelLowering.cpp
>>>     llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
>>>     llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
>>>
>>> Removed:
>>>
>>> ################################################################################
>>> diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
>>> index 413af1ff4b9439a..84a5223f91f0158 100644
>>> --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
>>> +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
>>> @@ -11112,31 +11112,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
>>>      }
>>>    }
>>>    -/// Given an integer binary operator, return the generic ISD::VECREDUCE_OP
>>> -/// which corresponds to it.
>>> -static unsigned getVecReduceOpcode(unsigned Opc) {
>>> -  switch (Opc) {
>>> -  default:
>>> -    llvm_unreachable("Unhandled binary to transfrom reduction");
>>> -  case ISD::ADD:
>>> -    return ISD::VECREDUCE_ADD;
>>> -  case ISD::UMAX:
>>> -    return ISD::VECREDUCE_UMAX;
>>> -  case ISD::SMAX:
>>> -    return ISD::VECREDUCE_SMAX;
>>> -  case ISD::UMIN:
>>> -    return ISD::VECREDUCE_UMIN;
>>> -  case ISD::SMIN:
>>> -    return ISD::VECREDUCE_SMIN;
>>> -  case ISD::AND:
>>> -    return ISD::VECREDUCE_AND;
>>> -  case ISD::OR:
>>> -    return ISD::VECREDUCE_OR;
>>> -  case ISD::XOR:
>>> -    return ISD::VECREDUCE_XOR;
>>> -  }
>>> -}
>>> -
>>>    /// Perform two related transforms whose purpose is to incrementally recognize
>>>    /// an explode_vector followed by scalar reduction as a vector reduction node.
>>>    /// This exists to recover from a deficiency in SLP which can't handle
>>> @@ -11155,15 +11130,8 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
>>>        const SDLoc DL(N);
>>>      const EVT VT = N->getValueType(0);
>>> -
>>> -  // TODO: Handle floating point here.
>>> -  if (!VT.isInteger())
>>> -    return SDValue();
>>> -
>>> -  const unsigned Opc = N->getOpcode();
>>> -  const unsigned ReduceOpc = getVecReduceOpcode(Opc);
>>> -  assert(Opc == ISD::getVecReduceBaseOpcode(ReduceOpc) &&
>>> -         "Inconsistent mappings");
>>> +  [[maybe_unused]] const unsigned Opc = N->getOpcode();
>>> +  assert(Opc == ISD::ADD && "extend this to other reduction types");
>>>      const SDValue LHS = N->getOperand(0);
>>>      const SDValue RHS = N->getOperand(1);
>>>    @@ -11193,13 +11161,13 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
>>>        EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, 2);
>>>        SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec,
>>>                                  DAG.getVectorIdxConstant(0, DL));
>>> -    return DAG.getNode(ReduceOpc, DL, VT, Vec);
>>> +    return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, Vec);
>>>      }
>>>        // Match (binop (reduce (extract_subvector V, 0),
>>>      //                      (extract_vector_elt V, sizeof(SubVec))))
>>>      // into a reduction of one more element from the original vector V.
>>> -  if (LHS.getOpcode() != ReduceOpc)
>>> +  if (LHS.getOpcode() != ISD::VECREDUCE_ADD)
>>>        return SDValue();
>>>        SDValue ReduceVec = LHS.getOperand(0);
>>> @@ -11215,7 +11183,7 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
>>>          EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, Idx + 1);
>>>          SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec,
>>>                                    DAG.getVectorIdxConstant(0, DL));
>>> -      return DAG.getNode(ReduceOpc, DL, VT, Vec);
>>> +      return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, Vec);
>>>        }
>>>      }
>>>    @@ -11723,8 +11691,6 @@ static SDValue performANDCombine(SDNode *N,
>>>        if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
>>>        return V;
>>> -  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
>>> -    return V;
>>>        if (DCI.isAfterLegalizeDAG())
>>>        if (SDValue V = combineDeMorganOfBoolean(N, DAG))
>>> @@ -11777,8 +11743,6 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
>>>        if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
>>>        return V;
>>> -  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
>>> -    return V;
>>>        if (DCI.isAfterLegalizeDAG())
>>>        if (SDValue V = combineDeMorganOfBoolean(N, DAG))
>>> @@ -11830,9 +11794,6 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
>>>        if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
>>>        return V;
>>> -  if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
>>> -    return V;
>>> -
>>>      // fold (xor (select cond, 0, y), x) ->
>>>      //      (select cond, x, (xor x, y))
>>>      return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
>>> @@ -14038,13 +13999,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
>>>      case ISD::SMAX:
>>>      case ISD::SMIN:
>>>      case ISD::FMAXNUM:
>>> -  case ISD::FMINNUM: {
>>> -    if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
>>> -      return V;
>>> -    if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
>>> -      return V;
>>> -    return SDValue();
>>> -  }
>>> +  case ISD::FMINNUM:
>>> +    return combineBinOpToReduce(N, DAG, Subtarget);
>>>      case ISD::SETCC:
>>>        return performSETCCCombine(N, DAG, Subtarget);
>>>      case ISD::SIGN_EXTEND_INREG:
>>>
>>> diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
>>> index f3570495600f3c3..ab137b1ac818299 100644
>>> --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
>>> +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
>>> @@ -5,10 +5,11 @@
>>>    define i8 @explode_2xi8(<2 x i8> %v) {
>>>    ; CHECK-LABEL: explode_2xi8:
>>>    ; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> +; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
>>>    ; CHECK-NEXT:    vmv.x.s a0, v8
>>> +; CHECK-NEXT:    vslidedown.vi v8, v8, 1
>>> +; CHECK-NEXT:    vmv.x.s a1, v8
>>> +; CHECK-NEXT:    xor a0, a0, a1
>>>    ; CHECK-NEXT:    ret
>>>      %e0 = extractelement <2 x i8> %v, i32 0
>>>      %e1 = extractelement <2 x i8> %v, i32 1
>>> @@ -20,16 +21,16 @@ define i8 @explode_4xi8(<4 x i8> %v) {
>>>    ; CHECK-LABEL: explode_4xi8:
>>>    ; CHECK:       # %bb.0:
>>>    ; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 2
>>> -; CHECK-NEXT:    vmv.x.s a0, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 3
>>> +; CHECK-NEXT:    vmv.x.s a0, v8
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 1
>>>    ; CHECK-NEXT:    vmv.x.s a1, v9
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> -; CHECK-NEXT:    vmv.x.s a2, v8
>>> -; CHECK-NEXT:    add a0, a0, a1
>>> -; CHECK-NEXT:    add a0, a2, a0
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 2
>>> +; CHECK-NEXT:    vmv.x.s a2, v9
>>> +; CHECK-NEXT:    vslidedown.vi v8, v8, 3
>>> +; CHECK-NEXT:    vmv.x.s a3, v8
>>> +; CHECK-NEXT:    xor a0, a0, a1
>>> +; CHECK-NEXT:    add a2, a2, a3
>>> +; CHECK-NEXT:    add a0, a0, a2
>>>    ; CHECK-NEXT:    ret
>>>      %e0 = extractelement <4 x i8> %v, i32 0
>>>      %e1 = extractelement <4 x i8> %v, i32 1
>>> @@ -46,28 +47,28 @@ define i8 @explode_8xi8(<8 x i8> %v) {
>>>    ; CHECK-LABEL: explode_8xi8:
>>>    ; CHECK:       # %bb.0:
>>>    ; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 2
>>> -; CHECK-NEXT:    vmv.x.s a0, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 3
>>> +; CHECK-NEXT:    vmv.x.s a0, v8
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 1
>>>    ; CHECK-NEXT:    vmv.x.s a1, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 4
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 2
>>>    ; CHECK-NEXT:    vmv.x.s a2, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 5
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 3
>>>    ; CHECK-NEXT:    vmv.x.s a3, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 6
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 4
>>>    ; CHECK-NEXT:    vmv.x.s a4, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 7
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 5
>>>    ; CHECK-NEXT:    vmv.x.s a5, v9
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> -; CHECK-NEXT:    vmv.x.s a6, v8
>>> -; CHECK-NEXT:    add a0, a0, a1
>>> -; CHECK-NEXT:    add a0, a6, a0
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 6
>>> +; CHECK-NEXT:    vmv.x.s a6, v9
>>> +; CHECK-NEXT:    vslidedown.vi v8, v8, 7
>>> +; CHECK-NEXT:    vmv.x.s a7, v8
>>> +; CHECK-NEXT:    xor a0, a0, a1
>>>    ; CHECK-NEXT:    add a2, a2, a3
>>> -; CHECK-NEXT:    add a2, a2, a4
>>>    ; CHECK-NEXT:    add a0, a0, a2
>>> -; CHECK-NEXT:    add a0, a0, a5
>>> +; CHECK-NEXT:    add a4, a4, a5
>>> +; CHECK-NEXT:    add a4, a4, a6
>>> +; CHECK-NEXT:    add a0, a0, a4
>>> +; CHECK-NEXT:    add a0, a0, a7
>>>    ; CHECK-NEXT:    ret
>>>      %e0 = extractelement <8 x i8> %v, i32 0
>>>      %e1 = extractelement <8 x i8> %v, i32 1
>>> @@ -88,56 +89,119 @@ define i8 @explode_8xi8(<8 x i8> %v) {
>>>    }
>>>      define i8 @explode_16xi8(<16 x i8> %v) {
>>> -; CHECK-LABEL: explode_16xi8:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 2
>>> -; CHECK-NEXT:    vmv.x.s a0, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 3
>>> -; CHECK-NEXT:    vmv.x.s a1, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 4
>>> -; CHECK-NEXT:    vmv.x.s a2, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 5
>>> -; CHECK-NEXT:    vmv.x.s a3, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 6
>>> -; CHECK-NEXT:    vmv.x.s a4, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 7
>>> -; CHECK-NEXT:    vmv.x.s a5, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 8
>>> -; CHECK-NEXT:    vmv.x.s a6, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 9
>>> -; CHECK-NEXT:    vmv.x.s a7, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 10
>>> -; CHECK-NEXT:    vmv.x.s t0, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 11
>>> -; CHECK-NEXT:    vmv.x.s t1, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 12
>>> -; CHECK-NEXT:    vmv.x.s t2, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 13
>>> -; CHECK-NEXT:    vmv.x.s t3, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 14
>>> -; CHECK-NEXT:    vmv.x.s t4, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 15
>>> -; CHECK-NEXT:    vmv.x.s t5, v9
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> -; CHECK-NEXT:    vmv.x.s t6, v8
>>> -; CHECK-NEXT:    add a0, a0, a1
>>> -; CHECK-NEXT:    add a0, t6, a0
>>> -; CHECK-NEXT:    add a2, a2, a3
>>> -; CHECK-NEXT:    add a2, a2, a4
>>> -; CHECK-NEXT:    add a0, a0, a2
>>> -; CHECK-NEXT:    add a5, a5, a6
>>> -; CHECK-NEXT:    add a5, a5, a7
>>> -; CHECK-NEXT:    add a5, a5, t0
>>> -; CHECK-NEXT:    add a0, a0, a5
>>> -; CHECK-NEXT:    add t1, t1, t2
>>> -; CHECK-NEXT:    add t1, t1, t3
>>> -; CHECK-NEXT:    add t1, t1, t4
>>> -; CHECK-NEXT:    add t1, t1, t5
>>> -; CHECK-NEXT:    add a0, a0, t1
>>> -; CHECK-NEXT:    ret
>>> +; RV32-LABEL: explode_16xi8:
>>> +; RV32:       # %bb.0:
>>> +; RV32-NEXT:    addi sp, sp, -16
>>> +; RV32-NEXT:    .cfi_def_cfa_offset 16
>>> +; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
>>> +; RV32-NEXT:    .cfi_offset s0, -4
>>> +; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
>>> +; RV32-NEXT:    vmv.x.s a0, v8
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 1
>>> +; RV32-NEXT:    vmv.x.s a1, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 2
>>> +; RV32-NEXT:    vmv.x.s a2, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 3
>>> +; RV32-NEXT:    vmv.x.s a3, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 4
>>> +; RV32-NEXT:    vmv.x.s a4, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 5
>>> +; RV32-NEXT:    vmv.x.s a5, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 6
>>> +; RV32-NEXT:    vmv.x.s a6, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 7
>>> +; RV32-NEXT:    vmv.x.s a7, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 8
>>> +; RV32-NEXT:    vmv.x.s t0, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 9
>>> +; RV32-NEXT:    vmv.x.s t1, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 10
>>> +; RV32-NEXT:    vmv.x.s t2, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 11
>>> +; RV32-NEXT:    vmv.x.s t3, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 12
>>> +; RV32-NEXT:    vmv.x.s t4, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 13
>>> +; RV32-NEXT:    vmv.x.s t5, v9
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 14
>>> +; RV32-NEXT:    vmv.x.s t6, v9
>>> +; RV32-NEXT:    vslidedown.vi v8, v8, 15
>>> +; RV32-NEXT:    vmv.x.s s0, v8
>>> +; RV32-NEXT:    xor a0, a0, a1
>>> +; RV32-NEXT:    add a2, a2, a3
>>> +; RV32-NEXT:    add a0, a0, a2
>>> +; RV32-NEXT:    add a4, a4, a5
>>> +; RV32-NEXT:    add a4, a4, a6
>>> +; RV32-NEXT:    add a0, a0, a4
>>> +; RV32-NEXT:    add a7, a7, t0
>>> +; RV32-NEXT:    add a7, a7, t1
>>> +; RV32-NEXT:    add a7, a7, t2
>>> +; RV32-NEXT:    add a0, a0, a7
>>> +; RV32-NEXT:    add t3, t3, t4
>>> +; RV32-NEXT:    add t3, t3, t5
>>> +; RV32-NEXT:    add t3, t3, t6
>>> +; RV32-NEXT:    add t3, t3, s0
>>> +; RV32-NEXT:    add a0, a0, t3
>>> +; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
>>> +; RV32-NEXT:    addi sp, sp, 16
>>> +; RV32-NEXT:    ret
>>> +;
>>> +; RV64-LABEL: explode_16xi8:
>>> +; RV64:       # %bb.0:
>>> +; RV64-NEXT:    addi sp, sp, -16
>>> +; RV64-NEXT:    .cfi_def_cfa_offset 16
>>> +; RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
>>> +; RV64-NEXT:    .cfi_offset s0, -8
>>> +; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
>>> +; RV64-NEXT:    vmv.x.s a0, v8
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 1
>>> +; RV64-NEXT:    vmv.x.s a1, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 2
>>> +; RV64-NEXT:    vmv.x.s a2, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 3
>>> +; RV64-NEXT:    vmv.x.s a3, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 4
>>> +; RV64-NEXT:    vmv.x.s a4, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 5
>>> +; RV64-NEXT:    vmv.x.s a5, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 6
>>> +; RV64-NEXT:    vmv.x.s a6, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 7
>>> +; RV64-NEXT:    vmv.x.s a7, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 8
>>> +; RV64-NEXT:    vmv.x.s t0, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 9
>>> +; RV64-NEXT:    vmv.x.s t1, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 10
>>> +; RV64-NEXT:    vmv.x.s t2, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 11
>>> +; RV64-NEXT:    vmv.x.s t3, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 12
>>> +; RV64-NEXT:    vmv.x.s t4, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 13
>>> +; RV64-NEXT:    vmv.x.s t5, v9
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 14
>>> +; RV64-NEXT:    vmv.x.s t6, v9
>>> +; RV64-NEXT:    vslidedown.vi v8, v8, 15
>>> +; RV64-NEXT:    vmv.x.s s0, v8
>>> +; RV64-NEXT:    xor a0, a0, a1
>>> +; RV64-NEXT:    add a2, a2, a3
>>> +; RV64-NEXT:    add a0, a0, a2
>>> +; RV64-NEXT:    add a4, a4, a5
>>> +; RV64-NEXT:    add a4, a4, a6
>>> +; RV64-NEXT:    add a0, a0, a4
>>> +; RV64-NEXT:    add a7, a7, t0
>>> +; RV64-NEXT:    add a7, a7, t1
>>> +; RV64-NEXT:    add a7, a7, t2
>>> +; RV64-NEXT:    add a0, a0, a7
>>> +; RV64-NEXT:    add t3, t3, t4
>>> +; RV64-NEXT:    add t3, t3, t5
>>> +; RV64-NEXT:    add t3, t3, t6
>>> +; RV64-NEXT:    add t3, t3, s0
>>> +; RV64-NEXT:    add a0, a0, t3
>>> +; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
>>> +; RV64-NEXT:    addi sp, sp, 16
>>> +; RV64-NEXT:    ret
>>>      %e0 = extractelement <16 x i8> %v, i32 0
>>>      %e1 = extractelement <16 x i8> %v, i32 1
>>>      %e2 = extractelement <16 x i8> %v, i32 2
>>> @@ -175,10 +239,11 @@ define i8 @explode_16xi8(<16 x i8> %v) {
>>>    define i16 @explode_2xi16(<2 x i16> %v) {
>>>    ; CHECK-LABEL: explode_2xi16:
>>>    ; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> +; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
>>>    ; CHECK-NEXT:    vmv.x.s a0, v8
>>> +; CHECK-NEXT:    vslidedown.vi v8, v8, 1
>>> +; CHECK-NEXT:    vmv.x.s a1, v8
>>> +; CHECK-NEXT:    xor a0, a0, a1
>>>    ; CHECK-NEXT:    ret
>>>      %e0 = extractelement <2 x i16> %v, i32 0
>>>      %e1 = extractelement <2 x i16> %v, i32 1
>>> @@ -190,16 +255,16 @@ define i16 @explode_4xi16(<4 x i16> %v) {
>>>    ; CHECK-LABEL: explode_4xi16:
>>>    ; CHECK:       # %bb.0:
>>>    ; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 2
>>> -; CHECK-NEXT:    vmv.x.s a0, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 3
>>> +; CHECK-NEXT:    vmv.x.s a0, v8
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 1
>>>    ; CHECK-NEXT:    vmv.x.s a1, v9
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> -; CHECK-NEXT:    vmv.x.s a2, v8
>>> -; CHECK-NEXT:    add a0, a0, a1
>>> -; CHECK-NEXT:    add a0, a2, a0
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 2
>>> +; CHECK-NEXT:    vmv.x.s a2, v9
>>> +; CHECK-NEXT:    vslidedown.vi v8, v8, 3
>>> +; CHECK-NEXT:    vmv.x.s a3, v8
>>> +; CHECK-NEXT:    xor a0, a0, a1
>>> +; CHECK-NEXT:    add a2, a2, a3
>>> +; CHECK-NEXT:    add a0, a0, a2
>>>    ; CHECK-NEXT:    ret
>>>      %e0 = extractelement <4 x i16> %v, i32 0
>>>      %e1 = extractelement <4 x i16> %v, i32 1
>>> @@ -216,28 +281,28 @@ define i16 @explode_8xi16(<8 x i16> %v) {
>>>    ; CHECK-LABEL: explode_8xi16:
>>>    ; CHECK:       # %bb.0:
>>>    ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 2
>>> -; CHECK-NEXT:    vmv.x.s a0, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 3
>>> +; CHECK-NEXT:    vmv.x.s a0, v8
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 1
>>>    ; CHECK-NEXT:    vmv.x.s a1, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 4
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 2
>>>    ; CHECK-NEXT:    vmv.x.s a2, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 5
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 3
>>>    ; CHECK-NEXT:    vmv.x.s a3, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 6
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 4
>>>    ; CHECK-NEXT:    vmv.x.s a4, v9
>>> -; CHECK-NEXT:    vslidedown.vi v9, v8, 7
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 5
>>>    ; CHECK-NEXT:    vmv.x.s a5, v9
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> -; CHECK-NEXT:    vmv.x.s a6, v8
>>> -; CHECK-NEXT:    add a0, a0, a1
>>> -; CHECK-NEXT:    add a0, a6, a0
>>> +; CHECK-NEXT:    vslidedown.vi v9, v8, 6
>>> +; CHECK-NEXT:    vmv.x.s a6, v9
>>> +; CHECK-NEXT:    vslidedown.vi v8, v8, 7
>>> +; CHECK-NEXT:    vmv.x.s a7, v8
>>> +; CHECK-NEXT:    xor a0, a0, a1
>>>    ; CHECK-NEXT:    add a2, a2, a3
>>> -; CHECK-NEXT:    add a2, a2, a4
>>>    ; CHECK-NEXT:    add a0, a0, a2
>>> -; CHECK-NEXT:    add a0, a0, a5
>>> +; CHECK-NEXT:    add a4, a4, a5
>>> +; CHECK-NEXT:    add a4, a4, a6
>>> +; CHECK-NEXT:    add a0, a0, a4
>>> +; CHECK-NEXT:    add a0, a0, a7
>>>    ; CHECK-NEXT:    ret
>>>      %e0 = extractelement <8 x i16> %v, i32 0
>>>      %e1 = extractelement <8 x i16> %v, i32 1
>>> @@ -258,57 +323,121 @@ define i16 @explode_8xi16(<8 x i16> %v) {
>>>    }
>>>      define i16 @explode_16xi16(<16 x i16> %v) {
>>> -; CHECK-LABEL: explode_16xi16:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 2
>>> -; CHECK-NEXT:    vmv.x.s a0, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 3
>>> -; CHECK-NEXT:    vmv.x.s a1, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 4
>>> -; CHECK-NEXT:    vmv.x.s a2, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 5
>>> -; CHECK-NEXT:    vmv.x.s a3, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 6
>>> -; CHECK-NEXT:    vmv.x.s a4, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 7
>>> -; CHECK-NEXT:    vmv.x.s a5, v10
>>> -; CHECK-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 8
>>> -; CHECK-NEXT:    vmv.x.s a6, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 9
>>> -; CHECK-NEXT:    vmv.x.s a7, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 10
>>> -; CHECK-NEXT:    vmv.x.s t0, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 11
>>> -; CHECK-NEXT:    vmv.x.s t1, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 12
>>> -; CHECK-NEXT:    vmv.x.s t2, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 13
>>> -; CHECK-NEXT:    vmv.x.s t3, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 14
>>> -; CHECK-NEXT:    vmv.x.s t4, v10
>>> -; CHECK-NEXT:    vslidedown.vi v10, v8, 15
>>> -; CHECK-NEXT:    vmv.x.s t5, v10
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> -; CHECK-NEXT:    vmv.x.s t6, v8
>>> -; CHECK-NEXT:    add a0, a0, a1
>>> -; CHECK-NEXT:    add a0, t6, a0
>>> -; CHECK-NEXT:    add a2, a2, a3
>>> -; CHECK-NEXT:    add a2, a2, a4
>>> -; CHECK-NEXT:    add a0, a0, a2
>>> -; CHECK-NEXT:    add a5, a5, a6
>>> -; CHECK-NEXT:    add a5, a5, a7
>>> -; CHECK-NEXT:    add a5, a5, t0
>>> -; CHECK-NEXT:    add a0, a0, a5
>>> -; CHECK-NEXT:    add t1, t1, t2
>>> -; CHECK-NEXT:    add t1, t1, t3
>>> -; CHECK-NEXT:    add t1, t1, t4
>>> -; CHECK-NEXT:    add t1, t1, t5
>>> -; CHECK-NEXT:    add a0, a0, t1
>>> -; CHECK-NEXT:    ret
>>> +; RV32-LABEL: explode_16xi16:
>>> +; RV32:       # %bb.0:
>>> +; RV32-NEXT:    addi sp, sp, -16
>>> +; RV32-NEXT:    .cfi_def_cfa_offset 16
>>> +; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
>>> +; RV32-NEXT:    .cfi_offset s0, -4
>>> +; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
>>> +; RV32-NEXT:    vmv.x.s a0, v8
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 1
>>> +; RV32-NEXT:    vmv.x.s a1, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 2
>>> +; RV32-NEXT:    vmv.x.s a2, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 3
>>> +; RV32-NEXT:    vmv.x.s a3, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 4
>>> +; RV32-NEXT:    vmv.x.s a4, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 5
>>> +; RV32-NEXT:    vmv.x.s a5, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 6
>>> +; RV32-NEXT:    vmv.x.s a6, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 7
>>> +; RV32-NEXT:    vmv.x.s a7, v10
>>> +; RV32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 8
>>> +; RV32-NEXT:    vmv.x.s t0, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 9
>>> +; RV32-NEXT:    vmv.x.s t1, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 10
>>> +; RV32-NEXT:    vmv.x.s t2, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 11
>>> +; RV32-NEXT:    vmv.x.s t3, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 12
>>> +; RV32-NEXT:    vmv.x.s t4, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 13
>>> +; RV32-NEXT:    vmv.x.s t5, v10
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 14
>>> +; RV32-NEXT:    vmv.x.s t6, v10
>>> +; RV32-NEXT:    vslidedown.vi v8, v8, 15
>>> +; RV32-NEXT:    vmv.x.s s0, v8
>>> +; RV32-NEXT:    xor a0, a0, a1
>>> +; RV32-NEXT:    add a2, a2, a3
>>> +; RV32-NEXT:    add a0, a0, a2
>>> +; RV32-NEXT:    add a4, a4, a5
>>> +; RV32-NEXT:    add a4, a4, a6
>>> +; RV32-NEXT:    add a0, a0, a4
>>> +; RV32-NEXT:    add a7, a7, t0
>>> +; RV32-NEXT:    add a7, a7, t1
>>> +; RV32-NEXT:    add a7, a7, t2
>>> +; RV32-NEXT:    add a0, a0, a7
>>> +; RV32-NEXT:    add t3, t3, t4
>>> +; RV32-NEXT:    add t3, t3, t5
>>> +; RV32-NEXT:    add t3, t3, t6
>>> +; RV32-NEXT:    add t3, t3, s0
>>> +; RV32-NEXT:    add a0, a0, t3
>>> +; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
>>> +; RV32-NEXT:    addi sp, sp, 16
>>> +; RV32-NEXT:    ret
>>> +;
>>> +; RV64-LABEL: explode_16xi16:
>>> +; RV64:       # %bb.0:
>>> +; RV64-NEXT:    addi sp, sp, -16
>>> +; RV64-NEXT:    .cfi_def_cfa_offset 16
>>> +; RV64-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
>>> +; RV64-NEXT:    .cfi_offset s0, -8
>>> +; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
>>> +; RV64-NEXT:    vmv.x.s a0, v8
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 1
>>> +; RV64-NEXT:    vmv.x.s a1, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 2
>>> +; RV64-NEXT:    vmv.x.s a2, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 3
>>> +; RV64-NEXT:    vmv.x.s a3, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 4
>>> +; RV64-NEXT:    vmv.x.s a4, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 5
>>> +; RV64-NEXT:    vmv.x.s a5, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 6
>>> +; RV64-NEXT:    vmv.x.s a6, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 7
>>> +; RV64-NEXT:    vmv.x.s a7, v10
>>> +; RV64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 8
>>> +; RV64-NEXT:    vmv.x.s t0, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 9
>>> +; RV64-NEXT:    vmv.x.s t1, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 10
>>> +; RV64-NEXT:    vmv.x.s t2, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 11
>>> +; RV64-NEXT:    vmv.x.s t3, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 12
>>> +; RV64-NEXT:    vmv.x.s t4, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 13
>>> +; RV64-NEXT:    vmv.x.s t5, v10
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 14
>>> +; RV64-NEXT:    vmv.x.s t6, v10
>>> +; RV64-NEXT:    vslidedown.vi v8, v8, 15
>>> +; RV64-NEXT:    vmv.x.s s0, v8
>>> +; RV64-NEXT:    xor a0, a0, a1
>>> +; RV64-NEXT:    add a2, a2, a3
>>> +; RV64-NEXT:    add a0, a0, a2
>>> +; RV64-NEXT:    add a4, a4, a5
>>> +; RV64-NEXT:    add a4, a4, a6
>>> +; RV64-NEXT:    add a0, a0, a4
>>> +; RV64-NEXT:    add a7, a7, t0
>>> +; RV64-NEXT:    add a7, a7, t1
>>> +; RV64-NEXT:    add a7, a7, t2
>>> +; RV64-NEXT:    add a0, a0, a7
>>> +; RV64-NEXT:    add t3, t3, t4
>>> +; RV64-NEXT:    add t3, t3, t5
>>> +; RV64-NEXT:    add t3, t3, t6
>>> +; RV64-NEXT:    add t3, t3, s0
>>> +; RV64-NEXT:    add a0, a0, t3
>>> +; RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
>>> +; RV64-NEXT:    addi sp, sp, 16
>>> +; RV64-NEXT:    ret
>>>      %e0 = extractelement <16 x i16> %v, i32 0
>>>      %e1 = extractelement <16 x i16> %v, i32 1
>>>      %e2 = extractelement <16 x i16> %v, i32 2
>>> @@ -346,10 +475,11 @@ define i16 @explode_16xi16(<16 x i16> %v) {
>>>    define i32 @explode_2xi32(<2 x i32> %v) {
>>>    ; CHECK-LABEL: explode_2xi32:
>>>    ; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> +; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
>>>    ; CHECK-NEXT:    vmv.x.s a0, v8
>>> +; CHECK-NEXT:    vslidedown.vi v8, v8, 1
>>> +; CHECK-NEXT:    vmv.x.s a1, v8
>>> +; CHECK-NEXT:    xor a0, a0, a1
>>>    ; CHECK-NEXT:    ret
>>>      %e0 = extractelement <2 x i32> %v, i32 0
>>>      %e1 = extractelement <2 x i32> %v, i32 1
>>> @@ -361,31 +491,31 @@ define i32 @explode_4xi32(<4 x i32> %v) {
>>>    ; RV32-LABEL: explode_4xi32:
>>>    ; RV32:       # %bb.0:
>>>    ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
>>> -; RV32-NEXT:    vslidedown.vi v9, v8, 2
>>> -; RV32-NEXT:    vmv.x.s a0, v9
>>> -; RV32-NEXT:    vslidedown.vi v9, v8, 3
>>> +; RV32-NEXT:    vmv.x.s a0, v8
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 1
>>>    ; RV32-NEXT:    vmv.x.s a1, v9
>>> -; RV32-NEXT:    vmv.s.x v9, zero
>>> -; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; RV32-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV32-NEXT:    vmv.x.s a2, v8
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a0, a2, a0
>>> +; RV32-NEXT:    vslidedown.vi v9, v8, 2
>>> +; RV32-NEXT:    vmv.x.s a2, v9
>>> +; RV32-NEXT:    vslidedown.vi v8, v8, 3
>>> +; RV32-NEXT:    vmv.x.s a3, v8
>>> +; RV32-NEXT:    xor a0, a0, a1
>>> +; RV32-NEXT:    add a2, a2, a3
>>> +; RV32-NEXT:    add a0, a0, a2
>>>    ; RV32-NEXT:    ret
>>>    ;
>>>    ; RV64-LABEL: explode_4xi32:
>>>    ; RV64:       # %bb.0:
>>>    ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
>>> -; RV64-NEXT:    vslidedown.vi v9, v8, 2
>>> -; RV64-NEXT:    vmv.x.s a0, v9
>>> -; RV64-NEXT:    vslidedown.vi v9, v8, 3
>>> +; RV64-NEXT:    vmv.x.s a0, v8
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 1
>>>    ; RV64-NEXT:    vmv.x.s a1, v9
>>> -; RV64-NEXT:    vmv.s.x v9, zero
>>> -; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; RV64-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV64-NEXT:    vmv.x.s a2, v8
>>> -; RV64-NEXT:    add a0, a0, a1
>>> -; RV64-NEXT:    addw a0, a2, a0
>>> +; RV64-NEXT:    vslidedown.vi v9, v8, 2
>>> +; RV64-NEXT:    vmv.x.s a2, v9
>>> +; RV64-NEXT:    vslidedown.vi v8, v8, 3
>>> +; RV64-NEXT:    vmv.x.s a3, v8
>>> +; RV64-NEXT:    xor a0, a0, a1
>>> +; RV64-NEXT:    add a2, a2, a3
>>> +; RV64-NEXT:    addw a0, a0, a2
>>>    ; RV64-NEXT:    ret
>>>      %e0 = extractelement <4 x i32> %v, i32 0
>>>      %e1 = extractelement <4 x i32> %v, i32 1
>>> @@ -402,57 +532,57 @@ define i32 @explode_8xi32(<8 x i32> %v) {
>>>    ; RV32-LABEL: explode_8xi32:
>>>    ; RV32:       # %bb.0:
>>>    ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
>>> +; RV32-NEXT:    vmv.x.s a0, v8
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 1
>>> +; RV32-NEXT:    vmv.x.s a1, v10
>>>    ; RV32-NEXT:    vslidedown.vi v10, v8, 2
>>> -; RV32-NEXT:    vmv.x.s a0, v10
>>> +; RV32-NEXT:    vmv.x.s a2, v10
>>>    ; RV32-NEXT:    vslidedown.vi v10, v8, 3
>>> -; RV32-NEXT:    vmv.x.s a1, v10
>>> +; RV32-NEXT:    vmv.x.s a3, v10
>>>    ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
>>>    ; RV32-NEXT:    vslidedown.vi v10, v8, 4
>>> -; RV32-NEXT:    vmv.x.s a2, v10
>>> -; RV32-NEXT:    vslidedown.vi v10, v8, 5
>>> -; RV32-NEXT:    vmv.x.s a3, v10
>>> -; RV32-NEXT:    vslidedown.vi v10, v8, 6
>>>    ; RV32-NEXT:    vmv.x.s a4, v10
>>> -; RV32-NEXT:    vslidedown.vi v10, v8, 7
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 5
>>>    ; RV32-NEXT:    vmv.x.s a5, v10
>>> -; RV32-NEXT:    vmv.s.x v9, zero
>>> -; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; RV32-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV32-NEXT:    vmv.x.s a6, v8
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a0, a6, a0
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 6
>>> +; RV32-NEXT:    vmv.x.s a6, v10
>>> +; RV32-NEXT:    vslidedown.vi v8, v8, 7
>>> +; RV32-NEXT:    vmv.x.s a7, v8
>>> +; RV32-NEXT:    xor a0, a0, a1
>>>    ; RV32-NEXT:    add a2, a2, a3
>>> -; RV32-NEXT:    add a2, a2, a4
>>>    ; RV32-NEXT:    add a0, a0, a2
>>> -; RV32-NEXT:    add a0, a0, a5
>>> +; RV32-NEXT:    add a4, a4, a5
>>> +; RV32-NEXT:    add a4, a4, a6
>>> +; RV32-NEXT:    add a0, a0, a4
>>> +; RV32-NEXT:    add a0, a0, a7
>>>    ; RV32-NEXT:    ret
>>>    ;
>>>    ; RV64-LABEL: explode_8xi32:
>>>    ; RV64:       # %bb.0:
>>>    ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
>>> +; RV64-NEXT:    vmv.x.s a0, v8
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 1
>>> +; RV64-NEXT:    vmv.x.s a1, v10
>>>    ; RV64-NEXT:    vslidedown.vi v10, v8, 2
>>> -; RV64-NEXT:    vmv.x.s a0, v10
>>> +; RV64-NEXT:    vmv.x.s a2, v10
>>>    ; RV64-NEXT:    vslidedown.vi v10, v8, 3
>>> -; RV64-NEXT:    vmv.x.s a1, v10
>>> +; RV64-NEXT:    vmv.x.s a3, v10
>>>    ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
>>>    ; RV64-NEXT:    vslidedown.vi v10, v8, 4
>>> -; RV64-NEXT:    vmv.x.s a2, v10
>>> -; RV64-NEXT:    vslidedown.vi v10, v8, 5
>>> -; RV64-NEXT:    vmv.x.s a3, v10
>>> -; RV64-NEXT:    vslidedown.vi v10, v8, 6
>>>    ; RV64-NEXT:    vmv.x.s a4, v10
>>> -; RV64-NEXT:    vslidedown.vi v10, v8, 7
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 5
>>>    ; RV64-NEXT:    vmv.x.s a5, v10
>>> -; RV64-NEXT:    vmv.s.x v9, zero
>>> -; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; RV64-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV64-NEXT:    vmv.x.s a6, v8
>>> -; RV64-NEXT:    add a0, a0, a1
>>> -; RV64-NEXT:    add a0, a6, a0
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 6
>>> +; RV64-NEXT:    vmv.x.s a6, v10
>>> +; RV64-NEXT:    vslidedown.vi v8, v8, 7
>>> +; RV64-NEXT:    vmv.x.s a7, v8
>>> +; RV64-NEXT:    xor a0, a0, a1
>>>    ; RV64-NEXT:    add a2, a2, a3
>>> -; RV64-NEXT:    add a2, a2, a4
>>>    ; RV64-NEXT:    add a0, a0, a2
>>> -; RV64-NEXT:    addw a0, a0, a5
>>> +; RV64-NEXT:    add a4, a4, a5
>>> +; RV64-NEXT:    add a4, a4, a6
>>> +; RV64-NEXT:    add a0, a0, a4
>>> +; RV64-NEXT:    addw a0, a0, a7
>>>    ; RV64-NEXT:    ret
>>>      %e0 = extractelement <8 x i32> %v, i32 0
>>>      %e1 = extractelement <8 x i32> %v, i32 1
>>> @@ -479,57 +609,60 @@ define i32 @explode_16xi32(<16 x i32> %v) {
>>>    ; RV32-NEXT:    .cfi_def_cfa_offset 128
>>>    ; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
>>>    ; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
>>> +; RV32-NEXT:    sw s2, 116(sp) # 4-byte Folded Spill
>>>    ; RV32-NEXT:    .cfi_offset ra, -4
>>>    ; RV32-NEXT:    .cfi_offset s0, -8
>>> +; RV32-NEXT:    .cfi_offset s2, -12
>>>    ; RV32-NEXT:    addi s0, sp, 128
>>>    ; RV32-NEXT:    .cfi_def_cfa s0, 0
>>>    ; RV32-NEXT:    andi sp, sp, -64
>>>    ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
>>> +; RV32-NEXT:    vmv.x.s a0, v8
>>> +; RV32-NEXT:    vslidedown.vi v12, v8, 1
>>> +; RV32-NEXT:    vmv.x.s a1, v12
>>>    ; RV32-NEXT:    vslidedown.vi v12, v8, 2
>>> -; RV32-NEXT:    vmv.x.s a0, v12
>>> +; RV32-NEXT:    vmv.x.s a2, v12
>>>    ; RV32-NEXT:    vslidedown.vi v12, v8, 3
>>> -; RV32-NEXT:    vmv.x.s a1, v12
>>> +; RV32-NEXT:    vmv.x.s a3, v12
>>>    ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
>>>    ; RV32-NEXT:    vslidedown.vi v12, v8, 4
>>> -; RV32-NEXT:    vmv.x.s a2, v12
>>> +; RV32-NEXT:    vmv.x.s a4, v12
>>>    ; RV32-NEXT:    vslidedown.vi v12, v8, 5
>>> -; RV32-NEXT:    vmv.x.s a3, v12
>>> +; RV32-NEXT:    vmv.x.s a5, v12
>>>    ; RV32-NEXT:    vslidedown.vi v12, v8, 6
>>> -; RV32-NEXT:    vmv.x.s a4, v12
>>> +; RV32-NEXT:    vmv.x.s a6, v12
>>>    ; RV32-NEXT:    vslidedown.vi v12, v8, 7
>>> -; RV32-NEXT:    vmv.x.s a5, v12
>>> -; RV32-NEXT:    mv a6, sp
>>> +; RV32-NEXT:    vmv.x.s a7, v12
>>> +; RV32-NEXT:    mv t0, sp
>>>    ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
>>> -; RV32-NEXT:    vse32.v v8, (a6)
>>> -; RV32-NEXT:    lw a6, 32(sp)
>>> -; RV32-NEXT:    lw a7, 36(sp)
>>> -; RV32-NEXT:    lw t0, 40(sp)
>>> -; RV32-NEXT:    lw t1, 44(sp)
>>> -; RV32-NEXT:    lw t2, 48(sp)
>>> -; RV32-NEXT:    lw t3, 52(sp)
>>> -; RV32-NEXT:    lw t4, 56(sp)
>>> -; RV32-NEXT:    lw t5, 60(sp)
>>> -; RV32-NEXT:    vmv.s.x v9, zero
>>> -; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; RV32-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV32-NEXT:    vmv.x.s t6, v8
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a0, t6, a0
>>> +; RV32-NEXT:    vse32.v v8, (t0)
>>> +; RV32-NEXT:    lw t0, 32(sp)
>>> +; RV32-NEXT:    lw t1, 36(sp)
>>> +; RV32-NEXT:    lw t2, 40(sp)
>>> +; RV32-NEXT:    lw t3, 44(sp)
>>> +; RV32-NEXT:    lw t4, 48(sp)
>>> +; RV32-NEXT:    lw t5, 52(sp)
>>> +; RV32-NEXT:    lw t6, 56(sp)
>>> +; RV32-NEXT:    lw s2, 60(sp)
>>> +; RV32-NEXT:    xor a0, a0, a1
>>>    ; RV32-NEXT:    add a2, a2, a3
>>> -; RV32-NEXT:    add a2, a2, a4
>>>    ; RV32-NEXT:    add a0, a0, a2
>>> -; RV32-NEXT:    add a5, a5, a6
>>> -; RV32-NEXT:    add a0, a0, a5
>>> +; RV32-NEXT:    add a4, a4, a5
>>> +; RV32-NEXT:    add a4, a4, a6
>>> +; RV32-NEXT:    add a0, a0, a4
>>>    ; RV32-NEXT:    add a7, a7, t0
>>> -; RV32-NEXT:    add a7, a7, t1
>>>    ; RV32-NEXT:    add a0, a0, a7
>>> -; RV32-NEXT:    add t2, t2, t3
>>> -; RV32-NEXT:    add t2, t2, t4
>>> -; RV32-NEXT:    add t2, t2, t5
>>> -; RV32-NEXT:    add a0, a0, t2
>>> +; RV32-NEXT:    add t1, t1, t2
>>> +; RV32-NEXT:    add t1, t1, t3
>>> +; RV32-NEXT:    add a0, a0, t1
>>> +; RV32-NEXT:    add t4, t4, t5
>>> +; RV32-NEXT:    add t4, t4, t6
>>> +; RV32-NEXT:    add t4, t4, s2
>>> +; RV32-NEXT:    add a0, a0, t4
>>>    ; RV32-NEXT:    addi sp, s0, -128
>>>    ; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
>>>    ; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
>>> +; RV32-NEXT:    lw s2, 116(sp) # 4-byte Folded Reload
>>>    ; RV32-NEXT:    addi sp, sp, 128
>>>    ; RV32-NEXT:    ret
>>>    ;
>>> @@ -539,57 +672,60 @@ define i32 @explode_16xi32(<16 x i32> %v) {
>>>    ; RV64-NEXT:    .cfi_def_cfa_offset 128
>>>    ; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
>>>    ; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
>>> +; RV64-NEXT:    sd s2, 104(sp) # 8-byte Folded Spill
>>>    ; RV64-NEXT:    .cfi_offset ra, -8
>>>    ; RV64-NEXT:    .cfi_offset s0, -16
>>> +; RV64-NEXT:    .cfi_offset s2, -24
>>>    ; RV64-NEXT:    addi s0, sp, 128
>>>    ; RV64-NEXT:    .cfi_def_cfa s0, 0
>>>    ; RV64-NEXT:    andi sp, sp, -64
>>>    ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
>>> +; RV64-NEXT:    vmv.x.s a0, v8
>>> +; RV64-NEXT:    vslidedown.vi v12, v8, 1
>>> +; RV64-NEXT:    vmv.x.s a1, v12
>>>    ; RV64-NEXT:    vslidedown.vi v12, v8, 2
>>> -; RV64-NEXT:    vmv.x.s a0, v12
>>> +; RV64-NEXT:    vmv.x.s a2, v12
>>>    ; RV64-NEXT:    vslidedown.vi v12, v8, 3
>>> -; RV64-NEXT:    vmv.x.s a1, v12
>>> +; RV64-NEXT:    vmv.x.s a3, v12
>>>    ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
>>>    ; RV64-NEXT:    vslidedown.vi v12, v8, 4
>>> -; RV64-NEXT:    vmv.x.s a2, v12
>>> +; RV64-NEXT:    vmv.x.s a4, v12
>>>    ; RV64-NEXT:    vslidedown.vi v12, v8, 5
>>> -; RV64-NEXT:    vmv.x.s a3, v12
>>> +; RV64-NEXT:    vmv.x.s a5, v12
>>>    ; RV64-NEXT:    vslidedown.vi v12, v8, 6
>>> -; RV64-NEXT:    vmv.x.s a4, v12
>>> +; RV64-NEXT:    vmv.x.s a6, v12
>>>    ; RV64-NEXT:    vslidedown.vi v12, v8, 7
>>> -; RV64-NEXT:    vmv.x.s a5, v12
>>> -; RV64-NEXT:    mv a6, sp
>>> +; RV64-NEXT:    vmv.x.s a7, v12
>>> +; RV64-NEXT:    mv t0, sp
>>>    ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
>>> -; RV64-NEXT:    vse32.v v8, (a6)
>>> -; RV64-NEXT:    lw a6, 32(sp)
>>> -; RV64-NEXT:    lw a7, 36(sp)
>>> -; RV64-NEXT:    lw t0, 40(sp)
>>> -; RV64-NEXT:    lw t1, 44(sp)
>>> -; RV64-NEXT:    lw t2, 48(sp)
>>> -; RV64-NEXT:    lw t3, 52(sp)
>>> -; RV64-NEXT:    lw t4, 56(sp)
>>> -; RV64-NEXT:    lw t5, 60(sp)
>>> -; RV64-NEXT:    vmv.s.x v9, zero
>>> -; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; RV64-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV64-NEXT:    vmv.x.s t6, v8
>>> -; RV64-NEXT:    add a0, a0, a1
>>> -; RV64-NEXT:    add a0, t6, a0
>>> +; RV64-NEXT:    vse32.v v8, (t0)
>>> +; RV64-NEXT:    lw t0, 32(sp)
>>> +; RV64-NEXT:    lw t1, 36(sp)
>>> +; RV64-NEXT:    lw t2, 40(sp)
>>> +; RV64-NEXT:    lw t3, 44(sp)
>>> +; RV64-NEXT:    lw t4, 48(sp)
>>> +; RV64-NEXT:    lw t5, 52(sp)
>>> +; RV64-NEXT:    lw t6, 56(sp)
>>> +; RV64-NEXT:    lw s2, 60(sp)
>>> +; RV64-NEXT:    xor a0, a0, a1
>>>    ; RV64-NEXT:    add a2, a2, a3
>>> -; RV64-NEXT:    add a2, a2, a4
>>>    ; RV64-NEXT:    add a0, a0, a2
>>> -; RV64-NEXT:    add a5, a5, a6
>>> -; RV64-NEXT:    add a0, a0, a5
>>> +; RV64-NEXT:    add a4, a4, a5
>>> +; RV64-NEXT:    add a4, a4, a6
>>> +; RV64-NEXT:    add a0, a0, a4
>>>    ; RV64-NEXT:    add a7, a7, t0
>>> -; RV64-NEXT:    add a7, a7, t1
>>>    ; RV64-NEXT:    add a0, a0, a7
>>> -; RV64-NEXT:    add t2, t2, t3
>>> -; RV64-NEXT:    add t2, t2, t4
>>> -; RV64-NEXT:    add t2, t2, t5
>>> -; RV64-NEXT:    addw a0, a0, t2
>>> +; RV64-NEXT:    add t1, t1, t2
>>> +; RV64-NEXT:    add t1, t1, t3
>>> +; RV64-NEXT:    add a0, a0, t1
>>> +; RV64-NEXT:    add t4, t4, t5
>>> +; RV64-NEXT:    add t4, t4, t6
>>> +; RV64-NEXT:    add t4, t4, s2
>>> +; RV64-NEXT:    addw a0, a0, t4
>>>    ; RV64-NEXT:    addi sp, s0, -128
>>>    ; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
>>>    ; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
>>> +; RV64-NEXT:    ld s2, 104(sp) # 8-byte Folded Reload
>>>    ; RV64-NEXT:    addi sp, sp, 128
>>>    ; RV64-NEXT:    ret
>>>      %e0 = extractelement <16 x i32> %v, i32 0
>>> @@ -629,22 +765,26 @@ define i32 @explode_16xi32(<16 x i32> %v) {
>>>    define i64 @explode_2xi64(<2 x i64> %v) {
>>>    ; RV32-LABEL: explode_2xi64:
>>>    ; RV32:       # %bb.0:
>>> -; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
>>> -; RV32-NEXT:    vmv.s.x v9, zero
>>> -; RV32-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV32-NEXT:    vmv.x.s a0, v8
>>> -; RV32-NEXT:    li a1, 32
>>> +; RV32-NEXT:    li a0, 32
>>>    ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
>>> -; RV32-NEXT:    vsrl.vx v8, v8, a1
>>> -; RV32-NEXT:    vmv.x.s a1, v8
>>> +; RV32-NEXT:    vsrl.vx v9, v8, a0
>>> +; RV32-NEXT:    vmv.x.s a1, v9
>>> +; RV32-NEXT:    vmv.x.s a2, v8
>>> +; RV32-NEXT:    vslidedown.vi v8, v8, 1
>>> +; RV32-NEXT:    vsrl.vx v9, v8, a0
>>> +; RV32-NEXT:    vmv.x.s a0, v9
>>> +; RV32-NEXT:    vmv.x.s a3, v8
>>> +; RV32-NEXT:    xor a1, a1, a0
>>> +; RV32-NEXT:    xor a0, a2, a3
>>>    ; RV32-NEXT:    ret
>>>    ;
>>>    ; RV64-LABEL: explode_2xi64:
>>>    ; RV64:       # %bb.0:
>>> -; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
>>> -; RV64-NEXT:    vmv.s.x v9, zero
>>> -; RV64-NEXT:    vredxor.vs v8, v8, v9
>>> +; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
>>>    ; RV64-NEXT:    vmv.x.s a0, v8
>>> +; RV64-NEXT:    vslidedown.vi v8, v8, 1
>>> +; RV64-NEXT:    vmv.x.s a1, v8
>>> +; RV64-NEXT:    xor a0, a0, a1
>>>    ; RV64-NEXT:    ret
>>>      %e0 = extractelement <2 x i64> %v, i32 0
>>>      %e1 = extractelement <2 x i64> %v, i32 1
>>> @@ -655,46 +795,49 @@ define i64 @explode_2xi64(<2 x i64> %v) {
>>>    define i64 @explode_4xi64(<4 x i64> %v) {
>>>    ; RV32-LABEL: explode_4xi64:
>>>    ; RV32:       # %bb.0:
>>> -; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
>>> -; RV32-NEXT:    vslidedown.vi v10, v8, 2
>>>    ; RV32-NEXT:    li a0, 32
>>> -; RV32-NEXT:    vsrl.vx v12, v10, a0
>>> -; RV32-NEXT:    vmv.x.s a1, v12
>>> -; RV32-NEXT:    vmv.x.s a2, v10
>>> -; RV32-NEXT:    vslidedown.vi v10, v8, 3
>>> +; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
>>> +; RV32-NEXT:    vsrl.vx v10, v8, a0
>>> +; RV32-NEXT:    vmv.x.s a1, v10
>>> +; RV32-NEXT:    vmv.x.s a2, v8
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 1
>>>    ; RV32-NEXT:    vsrl.vx v12, v10, a0
>>>    ; RV32-NEXT:    vmv.x.s a3, v12
>>>    ; RV32-NEXT:    vmv.x.s a4, v10
>>> -; RV32-NEXT:    vmv.s.x v9, zero
>>> -; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
>>> -; RV32-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
>>> -; RV32-NEXT:    vsrl.vx v9, v8, a0
>>> -; RV32-NEXT:    vmv.x.s a0, v9
>>> -; RV32-NEXT:    vmv.x.s a5, v8
>>> -; RV32-NEXT:    add a2, a5, a2
>>> -; RV32-NEXT:    sltu a5, a2, a5
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a0, a0, a5
>>> -; RV32-NEXT:    add a1, a0, a3
>>> -; RV32-NEXT:    add a0, a2, a4
>>> -; RV32-NEXT:    sltu a2, a0, a2
>>> +; RV32-NEXT:    vslidedown.vi v10, v8, 2
>>> +; RV32-NEXT:    vsrl.vx v12, v10, a0
>>> +; RV32-NEXT:    vmv.x.s a5, v12
>>> +; RV32-NEXT:    vmv.x.s a6, v10
>>> +; RV32-NEXT:    vslidedown.vi v8, v8, 3
>>> +; RV32-NEXT:    vsrl.vx v10, v8, a0
>>> +; RV32-NEXT:    vmv.x.s a0, v10
>>> +; RV32-NEXT:    vmv.x.s a7, v8
>>> +; RV32-NEXT:    xor a1, a1, a3
>>> +; RV32-NEXT:    xor a2, a2, a4
>>> +; RV32-NEXT:    add a6, a2, a6
>>> +; RV32-NEXT:    sltu a2, a6, a2
>>> +; RV32-NEXT:    add a1, a1, a5
>>> +; RV32-NEXT:    add a1, a1, a2
>>> +; RV32-NEXT:    add a1, a1, a0
>>> +; RV32-NEXT:    add a0, a6, a7
>>> +; RV32-NEXT:    sltu a2, a0, a6
>>>    ; RV32-NEXT:    add a1, a1, a2
>>>    ; RV32-NEXT:    ret
>>>    ;
>>>    ; RV64-LABEL: explode_4xi64:
>>>    ; RV64:       # %bb.0:
>>> +; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
>>> +; RV64-NEXT:    vmv.x.s a0, v8
>>> +; RV64-NEXT:    vslidedown.vi v10, v8, 1
>>> +; RV64-NEXT:    vmv.x.s a1, v10
>>>    ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
>>>    ; RV64-NEXT:    vslidedown.vi v10, v8, 2
>>> -; RV64-NEXT:    vmv.x.s a0, v10
>>> -; RV64-NEXT:    vslidedown.vi v10, v8, 3
>>> -; RV64-NEXT:    vmv.x.s a1, v10
>>> -; RV64-NEXT:    vmv.s.x v9, zero
>>> -; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
>>> -; RV64-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV64-NEXT:    vmv.x.s a2, v8
>>> -; RV64-NEXT:    add a0, a0, a1
>>> -; RV64-NEXT:    add a0, a2, a0
>>> +; RV64-NEXT:    vmv.x.s a2, v10
>>> +; RV64-NEXT:    vslidedown.vi v8, v8, 3
>>> +; RV64-NEXT:    vmv.x.s a3, v8
>>> +; RV64-NEXT:    xor a0, a0, a1
>>> +; RV64-NEXT:    add a2, a2, a3
>>> +; RV64-NEXT:    add a0, a0, a2
>>>    ; RV64-NEXT:    ret
>>>      %e0 = extractelement <4 x i64> %v, i32 0
>>>      %e1 = extractelement <4 x i64> %v, i32 1
>>> @@ -710,63 +853,71 @@ define i64 @explode_4xi64(<4 x i64> %v) {
>>>    define i64 @explode_8xi64(<8 x i64> %v) {
>>>    ; RV32-LABEL: explode_8xi64:
>>>    ; RV32:       # %bb.0:
>>> -; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
>>> -; RV32-NEXT:    vslidedown.vi v12, v8, 2
>>> +; RV32-NEXT:    addi sp, sp, -16
>>> +; RV32-NEXT:    .cfi_def_cfa_offset 16
>>> +; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
>>> +; RV32-NEXT:    .cfi_offset s0, -4
>>>    ; RV32-NEXT:    li a0, 32
>>> -; RV32-NEXT:    vsrl.vx v16, v12, a0
>>> -; RV32-NEXT:    vmv.x.s a1, v16
>>> -; RV32-NEXT:    vmv.x.s a2, v12
>>> -; RV32-NEXT:    vslidedown.vi v12, v8, 3
>>> +; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
>>> +; RV32-NEXT:    vsrl.vx v12, v8, a0
>>> +; RV32-NEXT:    vmv.x.s a1, v12
>>> +; RV32-NEXT:    vmv.x.s a2, v8
>>> +; RV32-NEXT:    vslidedown.vi v12, v8, 1
>>>    ; RV32-NEXT:    vsrl.vx v16, v12, a0
>>>    ; RV32-NEXT:    vmv.x.s a3, v16
>>>    ; RV32-NEXT:    vmv.x.s a4, v12
>>> -; RV32-NEXT:    vslidedown.vi v12, v8, 4
>>> +; RV32-NEXT:    vslidedown.vi v12, v8, 2
>>>    ; RV32-NEXT:    vsrl.vx v16, v12, a0
>>>    ; RV32-NEXT:    vmv.x.s a5, v16
>>>    ; RV32-NEXT:    vmv.x.s a6, v12
>>> -; RV32-NEXT:    vslidedown.vi v12, v8, 5
>>> +; RV32-NEXT:    vslidedown.vi v12, v8, 3
>>>    ; RV32-NEXT:    vsrl.vx v16, v12, a0
>>>    ; RV32-NEXT:    vmv.x.s a7, v16
>>>    ; RV32-NEXT:    vmv.x.s t0, v12
>>> -; RV32-NEXT:    vslidedown.vi v12, v8, 6
>>> +; RV32-NEXT:    vslidedown.vi v12, v8, 4
>>>    ; RV32-NEXT:    vsrl.vx v16, v12, a0
>>>    ; RV32-NEXT:    vmv.x.s t1, v16
>>>    ; RV32-NEXT:    vmv.x.s t2, v12
>>> -; RV32-NEXT:    vslidedown.vi v12, v8, 7
>>> +; RV32-NEXT:    vslidedown.vi v12, v8, 5
>>>    ; RV32-NEXT:    vsrl.vx v16, v12, a0
>>>    ; RV32-NEXT:    vmv.x.s t3, v16
>>>    ; RV32-NEXT:    vmv.x.s t4, v12
>>> -; RV32-NEXT:    vmv.s.x v9, zero
>>> -; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
>>> -; RV32-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
>>> -; RV32-NEXT:    vsrl.vx v9, v8, a0
>>> -; RV32-NEXT:    vmv.x.s a0, v9
>>> -; RV32-NEXT:    vmv.x.s t5, v8
>>> -; RV32-NEXT:    add a2, t5, a2
>>> -; RV32-NEXT:    sltu t5, a2, t5
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a0, a0, t5
>>> -; RV32-NEXT:    add a0, a0, a3
>>> -; RV32-NEXT:    add a4, a2, a4
>>> -; RV32-NEXT:    sltu a1, a4, a2
>>> +; RV32-NEXT:    vslidedown.vi v12, v8, 6
>>> +; RV32-NEXT:    vsrl.vx v16, v12, a0
>>> +; RV32-NEXT:    vmv.x.s t5, v16
>>> +; RV32-NEXT:    vmv.x.s t6, v12
>>> +; RV32-NEXT:    vslidedown.vi v8, v8, 7
>>> +; RV32-NEXT:    vsrl.vx v12, v8, a0
>>> +; RV32-NEXT:    vmv.x.s a0, v12
>>> +; RV32-NEXT:    vmv.x.s s0, v8
>>> +; RV32-NEXT:    xor a1, a1, a3
>>> +; RV32-NEXT:    xor a2, a2, a4
>>> +; RV32-NEXT:    add a6, a2, a6
>>> +; RV32-NEXT:    sltu a2, a6, a2
>>>    ; RV32-NEXT:    add a1, a1, a5
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a6, a4, a6
>>> -; RV32-NEXT:    sltu a1, a6, a4
>>> +; RV32-NEXT:    add a1, a1, a2
>>>    ; RV32-NEXT:    add a1, a1, a7
>>> -; RV32-NEXT:    add a0, a0, a1
>>>    ; RV32-NEXT:    add t0, a6, t0
>>> -; RV32-NEXT:    sltu a1, t0, a6
>>> -; RV32-NEXT:    add a1, a1, t1
>>> -; RV32-NEXT:    add a0, a0, a1
>>> +; RV32-NEXT:    sltu a2, t0, a6
>>> +; RV32-NEXT:    add a2, a2, t1
>>> +; RV32-NEXT:    add a1, a1, a2
>>>    ; RV32-NEXT:    add t2, t0, t2
>>> -; RV32-NEXT:    sltu a1, t2, t0
>>> -; RV32-NEXT:    add a1, a1, t3
>>> -; RV32-NEXT:    add a1, a0, a1
>>> -; RV32-NEXT:    add a0, t2, t4
>>> -; RV32-NEXT:    sltu a2, a0, t2
>>> +; RV32-NEXT:    sltu a2, t2, t0
>>> +; RV32-NEXT:    add a2, a2, t3
>>> +; RV32-NEXT:    add a1, a1, a2
>>> +; RV32-NEXT:    add t4, t2, t4
>>> +; RV32-NEXT:    sltu a2, t4, t2
>>> +; RV32-NEXT:    add a2, a2, t5
>>> +; RV32-NEXT:    add a1, a1, a2
>>> +; RV32-NEXT:    add t6, t4, t6
>>> +; RV32-NEXT:    sltu a2, t6, t4
>>> +; RV32-NEXT:    add a0, a2, a0
>>> +; RV32-NEXT:    add a1, a1, a0
>>> +; RV32-NEXT:    add a0, t6, s0
>>> +; RV32-NEXT:    sltu a2, a0, t6
>>>    ; RV32-NEXT:    add a1, a1, a2
>>> +; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
>>> +; RV32-NEXT:    addi sp, sp, 16
>>>    ; RV32-NEXT:    ret
>>>    ;
>>>    ; RV64-LABEL: explode_8xi64:
>>> @@ -780,28 +931,29 @@ define i64 @explode_8xi64(<8 x i64> %v) {
>>>    ; RV64-NEXT:    addi s0, sp, 128
>>>    ; RV64-NEXT:    .cfi_def_cfa s0, 0
>>>    ; RV64-NEXT:    andi sp, sp, -64
>>> +; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
>>> +; RV64-NEXT:    vmv.x.s a0, v8
>>> +; RV64-NEXT:    vslidedown.vi v12, v8, 1
>>> +; RV64-NEXT:    vmv.x.s a1, v12
>>>    ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
>>>    ; RV64-NEXT:    vslidedown.vi v12, v8, 2
>>> -; RV64-NEXT:    vmv.x.s a0, v12
>>> +; RV64-NEXT:    vmv.x.s a2, v12
>>>    ; RV64-NEXT:    vslidedown.vi v12, v8, 3
>>> -; RV64-NEXT:    vmv.x.s a1, v12
>>> -; RV64-NEXT:    mv a2, sp
>>> +; RV64-NEXT:    vmv.x.s a3, v12
>>> +; RV64-NEXT:    mv a4, sp
>>>    ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
>>> -; RV64-NEXT:    vse64.v v8, (a2)
>>> -; RV64-NEXT:    ld a2, 32(sp)
>>> -; RV64-NEXT:    ld a3, 40(sp)
>>> -; RV64-NEXT:    ld a4, 48(sp)
>>> -; RV64-NEXT:    ld a5, 56(sp)
>>> -; RV64-NEXT:    vmv.s.x v9, zero
>>> -; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
>>> -; RV64-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV64-NEXT:    vmv.x.s a6, v8
>>> -; RV64-NEXT:    add a0, a0, a1
>>> -; RV64-NEXT:    add a0, a6, a0
>>> +; RV64-NEXT:    vse64.v v8, (a4)
>>> +; RV64-NEXT:    ld a4, 32(sp)
>>> +; RV64-NEXT:    ld a5, 40(sp)
>>> +; RV64-NEXT:    ld a6, 48(sp)
>>> +; RV64-NEXT:    ld a7, 56(sp)
>>> +; RV64-NEXT:    xor a0, a0, a1
>>> +; RV64-NEXT:    add a2, a2, a3
>>>    ; RV64-NEXT:    add a0, a0, a2
>>> -; RV64-NEXT:    add a3, a3, a4
>>> -; RV64-NEXT:    add a0, a0, a3
>>> +; RV64-NEXT:    add a0, a0, a4
>>> +; RV64-NEXT:    add a5, a5, a6
>>>    ; RV64-NEXT:    add a0, a0, a5
>>> +; RV64-NEXT:    add a0, a0, a7
>>>    ; RV64-NEXT:    addi sp, s0, -128
>>>    ; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
>>>    ; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
>>> @@ -856,130 +1008,130 @@ define i64 @explode_16xi64(<16 x i64> %v) {
>>>    ; RV32-NEXT:    .cfi_offset s9, -44
>>>    ; RV32-NEXT:    .cfi_offset s10, -48
>>>    ; RV32-NEXT:    .cfi_offset s11, -52
>>> -; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 2
>>>    ; RV32-NEXT:    li a0, 32
>>> -; RV32-NEXT:    vsrl.vx v24, v16, a0
>>> -; RV32-NEXT:    vmv.x.s t6, v24
>>> +; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
>>> +; RV32-NEXT:    vsrl.vx v16, v8, a0
>>>    ; RV32-NEXT:    vmv.x.s a1, v16
>>>    ; RV32-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 3
>>> -; RV32-NEXT:    vsrl.vx v24, v16, a0
>>> -; RV32-NEXT:    vmv.x.s a1, v24
>>> -; RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
>>> -; RV32-NEXT:    vmv.x.s a2, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 4
>>> -; RV32-NEXT:    vsrl.vx v24, v16, a0
>>> -; RV32-NEXT:    vmv.x.s s0, v24
>>> -; RV32-NEXT:    vmv.x.s a3, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 5
>>> +; RV32-NEXT:    vmv.x.s a2, v8
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 1
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>> -; RV32-NEXT:    vmv.x.s s1, v24
>>> +; RV32-NEXT:    vmv.x.s a3, v24
>>>    ; RV32-NEXT:    vmv.x.s a4, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 6
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 2
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>> -; RV32-NEXT:    vmv.x.s s2, v24
>>> -; RV32-NEXT:    vmv.x.s a5, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 7
>>> +; RV32-NEXT:    vmv.x.s a5, v24
>>> +; RV32-NEXT:    vmv.x.s a6, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 3
>>> +; RV32-NEXT:    vsrl.vx v24, v16, a0
>>> +; RV32-NEXT:    vmv.x.s a7, v24
>>> +; RV32-NEXT:    vmv.x.s t0, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 4
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>>    ; RV32-NEXT:    vmv.x.s s3, v24
>>> -; RV32-NEXT:    vmv.x.s a6, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 8
>>> +; RV32-NEXT:    vmv.x.s t1, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 5
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>>    ; RV32-NEXT:    vmv.x.s s4, v24
>>> -; RV32-NEXT:    vmv.x.s a7, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 9
>>> +; RV32-NEXT:    vmv.x.s t2, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 6
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>>    ; RV32-NEXT:    vmv.x.s s5, v24
>>> -; RV32-NEXT:    vmv.x.s t0, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 10
>>> +; RV32-NEXT:    vmv.x.s t3, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 7
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>>    ; RV32-NEXT:    vmv.x.s s6, v24
>>> -; RV32-NEXT:    vmv.x.s t1, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 11
>>> +; RV32-NEXT:    vmv.x.s t4, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 8
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>>    ; RV32-NEXT:    vmv.x.s s7, v24
>>> -; RV32-NEXT:    vmv.x.s t2, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 12
>>> +; RV32-NEXT:    vmv.x.s t5, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 9
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>>    ; RV32-NEXT:    vmv.x.s s8, v24
>>> -; RV32-NEXT:    vmv.x.s t3, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 13
>>> +; RV32-NEXT:    vmv.x.s t6, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 10
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>>    ; RV32-NEXT:    vmv.x.s s9, v24
>>> -; RV32-NEXT:    vmv.x.s t4, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 14
>>> +; RV32-NEXT:    vmv.x.s s0, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 11
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>>    ; RV32-NEXT:    vmv.x.s s10, v24
>>> -; RV32-NEXT:    vmv.x.s t5, v16
>>> -; RV32-NEXT:    vslidedown.vi v16, v8, 15
>>> +; RV32-NEXT:    vmv.x.s s1, v16
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 12
>>>    ; RV32-NEXT:    vsrl.vx v24, v16, a0
>>>    ; RV32-NEXT:    vmv.x.s s11, v24
>>> -; RV32-NEXT:    vmv.s.x v9, zero
>>> +; RV32-NEXT:    vmv.x.s s2, v16
>>> +; RV32-NEXT:    vslidedown.vi v24, v8, 13
>>> +; RV32-NEXT:    vsrl.vx v16, v24, a0
>>>    ; RV32-NEXT:    vmv.x.s ra, v16
>>> -; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
>>> -; RV32-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
>>> -; RV32-NEXT:    vsrl.vx v9, v8, a0
>>> -; RV32-NEXT:    vmv.x.s a0, v9
>>> -; RV32-NEXT:    add a1, a0, t6
>>> -; RV32-NEXT:    vmv.x.s a0, v8
>>> -; RV32-NEXT:    lw t6, 8(sp) # 4-byte Folded Reload
>>> -; RV32-NEXT:    add t6, a0, t6
>>> -; RV32-NEXT:    sltu a0, t6, a0
>>> -; RV32-NEXT:    add a0, a1, a0
>>> -; RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a2, t6, a2
>>> -; RV32-NEXT:    sltu a1, a2, t6
>>> -; RV32-NEXT:    add a1, a1, s0
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a3, a2, a3
>>> -; RV32-NEXT:    sltu a1, a3, a2
>>> -; RV32-NEXT:    add a1, a1, s1
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a4, a3, a4
>>> -; RV32-NEXT:    sltu a1, a4, a3
>>> -; RV32-NEXT:    add a1, a1, s2
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a5, a4, a5
>>> -; RV32-NEXT:    sltu a1, a5, a4
>>> -; RV32-NEXT:    add a1, a1, s3
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a6, a5, a6
>>> -; RV32-NEXT:    sltu a1, a6, a5
>>> -; RV32-NEXT:    add a1, a1, s4
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add a7, a6, a7
>>> -; RV32-NEXT:    sltu a1, a7, a6
>>> -; RV32-NEXT:    add a1, a1, s5
>>> -; RV32-NEXT:    add a0, a0, a1
>>> -; RV32-NEXT:    add t0, a7, t0
>>> -; RV32-NEXT:    sltu a1, t0, a7
>>> -; RV32-NEXT:    add a1, a1, s6
>>> -; RV32-NEXT:    add a0, a0, a1
>>> +; RV32-NEXT:    vslidedown.vi v16, v8, 14
>>> +; RV32-NEXT:    vsrl.vx v0, v16, a0
>>> +; RV32-NEXT:    vslidedown.vi v8, v8, 15
>>> +; RV32-NEXT:    vmv.x.s a1, v24
>>> +; RV32-NEXT:    vsrl.vx v24, v8, a0
>>> +; RV32-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
>>> +; RV32-NEXT:    xor a0, a0, a3
>>> +; RV32-NEXT:    xor a2, a2, a4
>>> +; RV32-NEXT:    add a0, a0, a5
>>> +; RV32-NEXT:    add a6, a2, a6
>>> +; RV32-NEXT:    sltu a2, a6, a2
>>> +; RV32-NEXT:    add a0, a0, a2
>>> +; RV32-NEXT:    add a0, a0, a7
>>> +; RV32-NEXT:    add t0, a6, t0
>>> +; RV32-NEXT:    sltu a2, t0, a6
>>> +; RV32-NEXT:    add a2, a2, s3
>>> +; RV32-NEXT:    add a0, a0, a2
>>>    ; RV32-NEXT:    add t1, t0, t1
>>> -; RV32-NEXT:    sltu a1, t1, t0
>>> -; RV32-NEXT:    add a1, a1, s7
>>> -; RV32-NEXT:    add a0, a0, a1
>>> +; RV32-NEXT:    sltu a2, t1, t0
>>> +; RV32-NEXT:    add a2, a2, s4
>>> +; RV32-NEXT:    add a0, a0, a2
>>>    ; RV32-NEXT:    add t2, t1, t2
>>> -; RV32-NEXT:    sltu a1, t2, t1
>>> -; RV32-NEXT:    add a1, a1, s8
>>> -; RV32-NEXT:    add a0, a0, a1
>>> +; RV32-NEXT:    sltu a2, t2, t1
>>> +; RV32-NEXT:    add a2, a2, s5
>>> +; RV32-NEXT:    add a0, a0, a2
>>>    ; RV32-NEXT:    add t3, t2, t3
>>> -; RV32-NEXT:    sltu a1, t3, t2
>>> -; RV32-NEXT:    add a1, a1, s9
>>> -; RV32-NEXT:    add a0, a0, a1
>>> +; RV32-NEXT:    sltu a2, t3, t2
>>> +; RV32-NEXT:    add a2, a2, s6
>>> +; RV32-NEXT:    add a0, a0, a2
>>>    ; RV32-NEXT:    add t4, t3, t4
>>> -; RV32-NEXT:    sltu a1, t4, t3
>>> -; RV32-NEXT:    add a1, a1, s10
>>> -; RV32-NEXT:    add a0, a0, a1
>>> +; RV32-NEXT:    sltu a2, t4, t3
>>> +; RV32-NEXT:    add a2, a2, s7
>>> +; RV32-NEXT:    add a0, a0, a2
>>>    ; RV32-NEXT:    add t5, t4, t5
>>> -; RV32-NEXT:    sltu a1, t5, t4
>>> -; RV32-NEXT:    add a1, a1, s11
>>> +; RV32-NEXT:    sltu a2, t5, t4
>>> +; RV32-NEXT:    add a2, a2, s8
>>> +; RV32-NEXT:    add a0, a0, a2
>>> +; RV32-NEXT:    add t6, t5, t6
>>> +; RV32-NEXT:    sltu a2, t6, t5
>>> +; RV32-NEXT:    add a2, a2, s9
>>> +; RV32-NEXT:    add a0, a0, a2
>>> +; RV32-NEXT:    add s0, t6, s0
>>> +; RV32-NEXT:    sltu a2, s0, t6
>>> +; RV32-NEXT:    add a2, a2, s10
>>> +; RV32-NEXT:    add a0, a0, a2
>>> +; RV32-NEXT:    add s1, s0, s1
>>> +; RV32-NEXT:    sltu a2, s1, s0
>>> +; RV32-NEXT:    add a2, a2, s11
>>> +; RV32-NEXT:    add a0, a0, a2
>>> +; RV32-NEXT:    add s2, s1, s2
>>> +; RV32-NEXT:    sltu a2, s2, s1
>>> +; RV32-NEXT:    add a2, a2, ra
>>> +; RV32-NEXT:    add a0, a0, a2
>>> +; RV32-NEXT:    vmv.x.s a2, v0
>>> +; RV32-NEXT:    add a1, s2, a1
>>> +; RV32-NEXT:    sltu a3, a1, s2
>>> +; RV32-NEXT:    add a2, a3, a2
>>> +; RV32-NEXT:    vmv.x.s a3, v16
>>> +; RV32-NEXT:    add a0, a0, a2
>>> +; RV32-NEXT:    vmv.x.s a2, v24
>>> +; RV32-NEXT:    add a3, a1, a3
>>> +; RV32-NEXT:    sltu a1, a3, a1
>>> +; RV32-NEXT:    add a1, a1, a2
>>>    ; RV32-NEXT:    add a1, a0, a1
>>> -; RV32-NEXT:    add a0, t5, ra
>>> -; RV32-NEXT:    sltu a2, a0, t5
>>> +; RV32-NEXT:    vmv.x.s a0, v8
>>> +; RV32-NEXT:    add a0, a3, a0
>>> +; RV32-NEXT:    sltu a2, a0, a3
>>>    ; RV32-NEXT:    add a1, a1, a2
>>>    ; RV32-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
>>>    ; RV32-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
>>> @@ -1003,52 +1155,56 @@ define i64 @explode_16xi64(<16 x i64> %v) {
>>>    ; RV64-NEXT:    .cfi_def_cfa_offset 256
>>>    ; RV64-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
>>>    ; RV64-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
>>> +; RV64-NEXT:    sd s2, 232(sp) # 8-byte Folded Spill
>>>    ; RV64-NEXT:    .cfi_offset ra, -8
>>>    ; RV64-NEXT:    .cfi_offset s0, -16
>>> +; RV64-NEXT:    .cfi_offset s2, -24
>>>    ; RV64-NEXT:    addi s0, sp, 256
>>>    ; RV64-NEXT:    .cfi_def_cfa s0, 0
>>>    ; RV64-NEXT:    andi sp, sp, -128
>>> +; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
>>> +; RV64-NEXT:    vmv.x.s a0, v8
>>> +; RV64-NEXT:    vslidedown.vi v16, v8, 1
>>> +; RV64-NEXT:    vmv.x.s a1, v16
>>>    ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
>>>    ; RV64-NEXT:    vslidedown.vi v16, v8, 2
>>> -; RV64-NEXT:    vmv.x.s a0, v16
>>> +; RV64-NEXT:    vmv.x.s a2, v16
>>>    ; RV64-NEXT:    vslidedown.vi v16, v8, 3
>>> -; RV64-NEXT:    vmv.x.s a1, v16
>>> -; RV64-NEXT:    mv a2, sp
>>> +; RV64-NEXT:    vmv.x.s a3, v16
>>> +; RV64-NEXT:    mv a4, sp
>>>    ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
>>> -; RV64-NEXT:    vse64.v v8, (a2)
>>> -; RV64-NEXT:    ld a2, 32(sp)
>>> -; RV64-NEXT:    ld a3, 40(sp)
>>> -; RV64-NEXT:    ld a4, 48(sp)
>>> -; RV64-NEXT:    ld a5, 56(sp)
>>> -; RV64-NEXT:    ld a6, 64(sp)
>>> -; RV64-NEXT:    ld a7, 72(sp)
>>> -; RV64-NEXT:    ld t0, 80(sp)
>>> -; RV64-NEXT:    ld t1, 88(sp)
>>> -; RV64-NEXT:    ld t2, 96(sp)
>>> -; RV64-NEXT:    ld t3, 104(sp)
>>> -; RV64-NEXT:    ld t4, 112(sp)
>>> -; RV64-NEXT:    ld t5, 120(sp)
>>> -; RV64-NEXT:    vmv.s.x v9, zero
>>> -; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
>>> -; RV64-NEXT:    vredxor.vs v8, v8, v9
>>> -; RV64-NEXT:    vmv.x.s t6, v8
>>> -; RV64-NEXT:    add a0, a0, a1
>>> -; RV64-NEXT:    add a0, t6, a0
>>> +; RV64-NEXT:    vse64.v v8, (a4)
>>> +; RV64-NEXT:    ld a4, 32(sp)
>>> +; RV64-NEXT:    ld a5, 40(sp)
>>> +; RV64-NEXT:    ld a6, 48(sp)
>>> +; RV64-NEXT:    ld a7, 56(sp)
>>> +; RV64-NEXT:    ld t0, 64(sp)
>>> +; RV64-NEXT:    ld t1, 72(sp)
>>> +; RV64-NEXT:    ld t2, 80(sp)
>>> +; RV64-NEXT:    ld t3, 88(sp)
>>> +; RV64-NEXT:    ld t4, 96(sp)
>>> +; RV64-NEXT:    ld t5, 104(sp)
>>> +; RV64-NEXT:    ld t6, 112(sp)
>>> +; RV64-NEXT:    ld s2, 120(sp)
>>> +; RV64-NEXT:    xor a0, a0, a1
>>> +; RV64-NEXT:    add a2, a2, a3
>>>    ; RV64-NEXT:    add a0, a0, a2
>>> -; RV64-NEXT:    add a3, a3, a4
>>> -; RV64-NEXT:    add a0, a0, a3
>>> +; RV64-NEXT:    add a0, a0, a4
>>>    ; RV64-NEXT:    add a5, a5, a6
>>> -; RV64-NEXT:    add a5, a5, a7
>>>    ; RV64-NEXT:    add a0, a0, a5
>>> -; RV64-NEXT:    add t0, t0, t1
>>> -; RV64-NEXT:    add t0, t0, t2
>>> -; RV64-NEXT:    add t0, t0, t3
>>> -; RV64-NEXT:    add a0, a0, t0
>>> -; RV64-NEXT:    add t4, t4, t5
>>> -; RV64-NEXT:    add a0, a0, t4
>>> +; RV64-NEXT:    add a7, a7, t0
>>> +; RV64-NEXT:    add a7, a7, t1
>>> +; RV64-NEXT:    add a0, a0, a7
>>> +; RV64-NEXT:    add t2, t2, t3
>>> +; RV64-NEXT:    add t2, t2, t4
>>> +; RV64-NEXT:    add t2, t2, t5
>>> +; RV64-NEXT:    add a0, a0, t2
>>> +; RV64-NEXT:    add t6, t6, s2
>>> +; RV64-NEXT:    add a0, a0, t6
>>>    ; RV64-NEXT:    addi sp, s0, -256
>>>    ; RV64-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
>>>    ; RV64-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
>>> +; RV64-NEXT:    ld s2, 232(sp) # 8-byte Folded Reload
>>>    ; RV64-NEXT:    addi sp, sp, 256
>>>    ; RV64-NEXT:    ret
>>>      %e0 = extractelement <16 x i64> %v, i32 0
>>>
>>> diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
>>> index 8c96392f08a5dbe..173b70def03d4c5 100644
>>> --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
>>> +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
>>> @@ -1,6 +1,6 @@
>>>    ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
>>> -; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
>>> -; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
>>> +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s
>>> +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s
>>>      define i32 @reduce_sum_2xi32(<2 x i32> %v) {
>>>    ; CHECK-LABEL: reduce_sum_2xi32:
>>> @@ -448,336 +448,3 @@ define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
>>>      %add13 = add i32 %add12, %e14
>>>      ret i32 %add13
>>>    }
>>> -
>>> -;; Most of the cornercases are exercised above, the following just
>>> -;; makes sure that other opcodes work as expected.
>>> -
>>> -define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
>>> -; CHECK-LABEL: reduce_xor_16xi32_prefix2:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v8, (a0)
>>> -; CHECK-NEXT:    vmv.s.x v9, zero
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v9
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %xor0 = xor i32 %e0, %e1
>>> -  ret i32 %xor0
>>> -}
>>> -
>>> -define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
>>> -; CHECK-LABEL: reduce_xor_16xi32_prefix5:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    li a1, 224
>>> -; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
>>> -; CHECK-NEXT:    vmv.s.x v0, a1
>>> -; CHECK-NEXT:    vmv.v.i v8, -1
>>> -; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
>>> -; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v10, (a0)
>>> -; CHECK-NEXT:    vsext.vf4 v12, v8
>>> -; CHECK-NEXT:    vand.vv v8, v10, v12
>>> -; CHECK-NEXT:    vmv.s.x v10, zero
>>> -; CHECK-NEXT:    vredxor.vs v8, v8, v10
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %e2 = extractelement <16 x i32> %v, i32 2
>>> -  %e3 = extractelement <16 x i32> %v, i32 3
>>> -  %e4 = extractelement <16 x i32> %v, i32 4
>>> -  %xor0 = xor i32 %e0, %e1
>>> -  %xor1 = xor i32 %xor0, %e2
>>> -  %xor2 = xor i32 %xor1, %e3
>>> -  %xor3 = xor i32 %xor2, %e4
>>> -  ret i32 %xor3
>>> -}
>>> -
>>> -define i32 @reduce_and_16xi32_prefix2(ptr %p) {
>>> -; CHECK-LABEL: reduce_and_16xi32_prefix2:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v8, (a0)
>>> -; CHECK-NEXT:    vredand.vs v8, v8, v8
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %and0 = and i32 %e0, %e1
>>> -  ret i32 %and0
>>> -}
>>> -
>>> -define i32 @reduce_and_16xi32_prefix5(ptr %p) {
>>> -; CHECK-LABEL: reduce_and_16xi32_prefix5:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
>>> -; CHECK-NEXT:    vmv.v.i v8, -1
>>> -; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v10, (a0)
>>> -; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
>>> -; CHECK-NEXT:    vslideup.vi v10, v8, 5
>>> -; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
>>> -; CHECK-NEXT:    vslideup.vi v10, v8, 6
>>> -; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; CHECK-NEXT:    vslideup.vi v10, v8, 7
>>> -; CHECK-NEXT:    vredand.vs v8, v10, v10
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %e2 = extractelement <16 x i32> %v, i32 2
>>> -  %e3 = extractelement <16 x i32> %v, i32 3
>>> -  %e4 = extractelement <16 x i32> %v, i32 4
>>> -  %and0 = and i32 %e0, %e1
>>> -  %and1 = and i32 %and0, %e2
>>> -  %and2 = and i32 %and1, %e3
>>> -  %and3 = and i32 %and2, %e4
>>> -  ret i32 %and3
>>> -}
>>> -
>>> -define i32 @reduce_or_16xi32_prefix2(ptr %p) {
>>> -; CHECK-LABEL: reduce_or_16xi32_prefix2:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v8, (a0)
>>> -; CHECK-NEXT:    vredor.vs v8, v8, v8
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %or0 = or i32 %e0, %e1
>>> -  ret i32 %or0
>>> -}
>>> -
>>> -define i32 @reduce_or_16xi32_prefix5(ptr %p) {
>>> -; CHECK-LABEL: reduce_or_16xi32_prefix5:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    li a1, 224
>>> -; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
>>> -; CHECK-NEXT:    vmv.s.x v0, a1
>>> -; CHECK-NEXT:    vmv.v.i v8, -1
>>> -; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
>>> -; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v10, (a0)
>>> -; CHECK-NEXT:    vsext.vf4 v12, v8
>>> -; CHECK-NEXT:    vand.vv v8, v10, v12
>>> -; CHECK-NEXT:    vredor.vs v8, v8, v8
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %e2 = extractelement <16 x i32> %v, i32 2
>>> -  %e3 = extractelement <16 x i32> %v, i32 3
>>> -  %e4 = extractelement <16 x i32> %v, i32 4
>>> -  %or0 = or i32 %e0, %e1
>>> -  %or1 = or i32 %or0, %e2
>>> -  %or2 = or i32 %or1, %e3
>>> -  %or3 = or i32 %or2, %e4
>>> -  ret i32 %or3
>>> -}
>>> -
>>> -declare i32 @llvm.smax.i32(i32 %a, i32 %b)
>>> -declare i32 @llvm.smin.i32(i32 %a, i32 %b)
>>> -declare i32 @llvm.umax.i32(i32 %a, i32 %b)
>>> -declare i32 @llvm.umin.i32(i32 %a, i32 %b)
>>> -
>>> -define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
>>> -; CHECK-LABEL: reduce_smax_16xi32_prefix2:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v8, (a0)
>>> -; CHECK-NEXT:    vredmax.vs v8, v8, v8
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
>>> -  ret i32 %smax0
>>> -}
>>> -
>>> -define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
>>> -; CHECK-LABEL: reduce_smax_16xi32_prefix5:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    lui a1, 524288
>>> -; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v8, (a0)
>>> -; CHECK-NEXT:    vmv.s.x v10, a1
>>> -; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
>>> -; CHECK-NEXT:    vslideup.vi v8, v10, 5
>>> -; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
>>> -; CHECK-NEXT:    vslideup.vi v8, v10, 6
>>> -; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; CHECK-NEXT:    vslideup.vi v8, v10, 7
>>> -; CHECK-NEXT:    vredmax.vs v8, v8, v8
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %e2 = extractelement <16 x i32> %v, i32 2
>>> -  %e3 = extractelement <16 x i32> %v, i32 3
>>> -  %e4 = extractelement <16 x i32> %v, i32 4
>>> -  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
>>> -  %smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
>>> -  %smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
>>> -  %smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
>>> -  ret i32 %smax3
>>> -}
>>> -
>>> -define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
>>> -; CHECK-LABEL: reduce_smin_16xi32_prefix2:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v8, (a0)
>>> -; CHECK-NEXT:    vredmin.vs v8, v8, v8
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
>>> -  ret i32 %smin0
>>> -}
>>> -
>>> -define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
>>> -; RV32-LABEL: reduce_smin_16xi32_prefix5:
>>> -; RV32:       # %bb.0:
>>> -; RV32-NEXT:    lui a1, 524288
>>> -; RV32-NEXT:    addi a1, a1, -1
>>> -; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; RV32-NEXT:    vle32.v v8, (a0)
>>> -; RV32-NEXT:    vmv.s.x v10, a1
>>> -; RV32-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
>>> -; RV32-NEXT:    vslideup.vi v8, v10, 5
>>> -; RV32-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
>>> -; RV32-NEXT:    vslideup.vi v8, v10, 6
>>> -; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; RV32-NEXT:    vslideup.vi v8, v10, 7
>>> -; RV32-NEXT:    vredmin.vs v8, v8, v8
>>> -; RV32-NEXT:    vmv.x.s a0, v8
>>> -; RV32-NEXT:    ret
>>> -;
>>> -; RV64-LABEL: reduce_smin_16xi32_prefix5:
>>> -; RV64:       # %bb.0:
>>> -; RV64-NEXT:    lui a1, 524288
>>> -; RV64-NEXT:    addiw a1, a1, -1
>>> -; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; RV64-NEXT:    vle32.v v8, (a0)
>>> -; RV64-NEXT:    vmv.s.x v10, a1
>>> -; RV64-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
>>> -; RV64-NEXT:    vslideup.vi v8, v10, 5
>>> -; RV64-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
>>> -; RV64-NEXT:    vslideup.vi v8, v10, 6
>>> -; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; RV64-NEXT:    vslideup.vi v8, v10, 7
>>> -; RV64-NEXT:    vredmin.vs v8, v8, v8
>>> -; RV64-NEXT:    vmv.x.s a0, v8
>>> -; RV64-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %e2 = extractelement <16 x i32> %v, i32 2
>>> -  %e3 = extractelement <16 x i32> %v, i32 3
>>> -  %e4 = extractelement <16 x i32> %v, i32 4
>>> -  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
>>> -  %smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
>>> -  %smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
>>> -  %smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
>>> -  ret i32 %smin3
>>> -}
>>> -
>>> -define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
>>> -; CHECK-LABEL: reduce_umax_16xi32_prefix2:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v8, (a0)
>>> -; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
>>> -  ret i32 %umax0
>>> -}
>>> -
>>> -define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
>>> -; CHECK-LABEL: reduce_umax_16xi32_prefix5:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    li a1, 224
>>> -; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
>>> -; CHECK-NEXT:    vmv.s.x v0, a1
>>> -; CHECK-NEXT:    vmv.v.i v8, -1
>>> -; CHECK-NEXT:    vmerge.vim v8, v8, 0, v0
>>> -; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v10, (a0)
>>> -; CHECK-NEXT:    vsext.vf4 v12, v8
>>> -; CHECK-NEXT:    vand.vv v8, v10, v12
>>> -; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %e2 = extractelement <16 x i32> %v, i32 2
>>> -  %e3 = extractelement <16 x i32> %v, i32 3
>>> -  %e4 = extractelement <16 x i32> %v, i32 4
>>> -  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
>>> -  %umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
>>> -  %umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
>>> -  %umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
>>> -  ret i32 %umax3
>>> -}
>>> -
>>> -define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
>>> -; CHECK-LABEL: reduce_umin_16xi32_prefix2:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v8, (a0)
>>> -; CHECK-NEXT:    vredminu.vs v8, v8, v8
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
>>> -  ret i32 %umin0
>>> -}
>>> -
>>> -define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
>>> -; CHECK-LABEL: reduce_umin_16xi32_prefix5:
>>> -; CHECK:       # %bb.0:
>>> -; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
>>> -; CHECK-NEXT:    vmv.v.i v8, -1
>>> -; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; CHECK-NEXT:    vle32.v v10, (a0)
>>> -; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
>>> -; CHECK-NEXT:    vslideup.vi v10, v8, 5
>>> -; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
>>> -; CHECK-NEXT:    vslideup.vi v10, v8, 6
>>> -; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
>>> -; CHECK-NEXT:    vslideup.vi v10, v8, 7
>>> -; CHECK-NEXT:    vredminu.vs v8, v10, v10
>>> -; CHECK-NEXT:    vmv.x.s a0, v8
>>> -; CHECK-NEXT:    ret
>>> -  %v = load <16 x i32>, ptr %p, align 256
>>> -  %e0 = extractelement <16 x i32> %v, i32 0
>>> -  %e1 = extractelement <16 x i32> %v, i32 1
>>> -  %e2 = extractelement <16 x i32> %v, i32 2
>>> -  %e3 = extractelement <16 x i32> %v, i32 3
>>> -  %e4 = extractelement <16 x i32> %v, i32 4
>>> -  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
>>> -  %umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
>>> -  %umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
>>> -  %umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
>>> -  ret i32 %umin3
>>> -}
>>>
>>>
>>>           _______________________________________________
>>> llvm-commits mailing list
>>> llvm-commits at lists.llvm.org
>>> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits