[llvm] 98dff5e - [RISCV] Move SHFLI matching to DAG combine. Add 32-bit support for RV64

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 19 10:07:20 PST 2021


Author: Craig Topper
Date: 2021-02-19T10:07:12-08:00
New Revision: 98dff5e804229d1d2fc139e44e7a04fc06bb6f92

URL: https://github.com/llvm/llvm-project/commit/98dff5e804229d1d2fc139e44e7a04fc06bb6f92
DIFF: https://github.com/llvm/llvm-project/commit/98dff5e804229d1d2fc139e44e7a04fc06bb6f92.diff

LOG: [RISCV] Move SHFLI matching to DAG combine. Add 32-bit support for RV64

We previously used isel patterns for this, but they took quite
a bit of space in the isel table because OR is associative and
commutative. They also couldn't handle the shift and AND
appearing in reversed order.
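
For instance, the combine has to recognize
  (shl (and x, C1), C2)  and  (and (shl x, C2), (C1 << C2))
as the same computation. A quick standalone C++ check of that
identity (illustrative only, not part of this patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t x = 0x12345678u;
    // Mask first, then shift ...
    uint32_t a = (x & 0x55555555u) << 1;
    // ... or shift first, then apply the shifted mask.
    uint32_t b = (x << 1) & (0x55555555u << 1);
    assert(a == b); // same result for any x
    return 0;
  }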

This generalizes the shift/and matching from GREVI to
take the expected mask table as input so we can reuse it for
SHFLI.
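
For reference, this is the kind of source pattern that now
folds to a single SHFLI; it mirrors the shfl1_i32 test updated
below (hand-written C++ for illustration, not code from the
patch):

  #include <cstdint>

  // Swaps the middle two bits of every nibble. With Zbp this
  // whole expression should now select to one shfli (zip.n).
  uint32_t shfl1(uint32_t x) {
    return ((x << 1) & 0x44444444u)   // SHFL_SHL term
         | ((x >> 1) & 0x22222222u)   // SHFL_SHR term
         | (x & 0x99999999u);         // stationary bits
  }

The three OR terms are the SHFL_SHL, SHFL_SHR and SHFL_AND
operands the new combineORToSHFL looks for, in any order.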

There is no SHFLIW instruction, but we can promote a 32-bit
SHFLI to i64 on RV64. As long as bit 4 of the control word isn't
set, a 64-bit SHFLI preserves at least 33 sign bits if the input
had at least 33 sign bits. ComputeNumSignBits has been updated
to account for that so the tests avoid an unnecessary sext.w.
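
The reason bit 4 of the control matters: the shuffle stages
selected by control bits 0-3 only permute bits within each
32-bit half and never move bit 31 or bit 63, so a value with at
least 33 sign bits keeps at least 33 sign bits. A rough C++
emulation of that argument (the helper names are made up; the
per-stage masks are the same ones used by the isel patterns
removed below):

  #include <cassert>
  #include <cstdint>

  // One shuffle stage: move the two inner bit groups by N and
  // keep every bit outside the two masks in place.
  static uint64_t shuffleStage64(uint64_t src, uint64_t maskL,
                                 uint64_t maskR, unsigned N) {
    uint64_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
  }

  // SHFLI emulation for control values with bit 4 clear.
  static uint64_t shfli64(uint64_t x, unsigned ctrl) {
    assert(ctrl < 16 && "bit 4 of the control must be clear");
    if (ctrl & 8) x = shuffleStage64(x, 0x00FF000000FF0000ULL,
                                     0x0000FF000000FF00ULL, 8);
    if (ctrl & 4) x = shuffleStage64(x, 0x0F000F000F000F00ULL,
                                     0x00F000F000F000F0ULL, 4);
    if (ctrl & 2) x = shuffleStage64(x, 0x3030303030303030ULL,
                                     0x0C0C0C0C0C0C0C0CULL, 2);
    if (ctrl & 1) x = shuffleStage64(x, 0x4444444444444444ULL,
                                     0x2222222222222222ULL, 1);
    return x;
  }

  int main() {
    // A sign-extended i32 has at least 33 sign bits: bits 63..31
    // are all copies of bit 31.
    uint64_t v = (uint64_t)(int64_t)(int32_t)0x80000123u;
    for (unsigned ctrl = 0; ctrl < 16; ++ctrl) {
      uint64_t r = shfli64(v, ctrl);
      // The result still sign-extends from bit 31.
      assert((uint64_t)(int64_t)(int32_t)r == r);
    }
    return 0;
  }

An i32 SHFLI only ever uses control values below 16, so the
promoted i64 SHFLI keeps enough sign bits for the sext.w to be
dropped.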

Reviewed By: frasercrmck

Differential Revision: https://reviews.llvm.org/D96661

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/RISCV/RISCVInstrInfoB.td
    llvm/test/CodeGen/RISCV/rv64Zbp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9576d3cbd6ed..8c07f4b1a28a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2545,6 +2545,20 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
     break;
   }
+  case RISCVISD::SHFLI: {
+    // There is no SHFLIW instruction, but we can just promote the operation.
+    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+           "Unexpected custom legalisation");
+    SDLoc DL(N);
+    SDValue NewOp0 =
+        DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+    SDValue NewRes =
+        DAG.getNode(RISCVISD::SHFLI, DL, MVT::i64, NewOp0, N->getOperand(1));
+    // ReplaceNodeResults requires we maintain the same type for the return
+    // value.
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
+    break;
+  }
   case ISD::BSWAP:
   case ISD::BITREVERSE: {
     assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
@@ -2674,19 +2688,21 @@ struct RISCVBitmanipPat {
   }
 };
 
-// Matches any of the following bit-manipulation patterns:
-//   (and (shl x, 1), (0x55555555 << 1))
-//   (and (srl x, 1), 0x55555555)
-//   (shl (and x, 0x55555555), 1)
-//   (srl (and x, (0x55555555 << 1)), 1)
-// where the shift amount and mask may vary thus:
-//   [1]  = 0x55555555 / 0xAAAAAAAA
-//   [2]  = 0x33333333 / 0xCCCCCCCC
-//   [4]  = 0x0F0F0F0F / 0xF0F0F0F0
-//   [8]  = 0x00FF00FF / 0xFF00FF00
-//   [16] = 0x0000FFFF / 0xFFFFFFFF
-//   [32] = 0x00000000FFFFFFFF / 0xFFFFFFFF00000000 (for RV64)
-static Optional<RISCVBitmanipPat> matchRISCVBitmanipPat(SDValue Op) {
+// Matches patterns of the form
+//   (and (shl x, C2), (C1 << C2))
+//   (and (srl x, C2), C1)
+//   (shl (and x, C1), C2)
+//   (srl (and x, (C1 << C2)), C2)
+// Where C2 is a power of 2 and C1 has at least that many leading zeroes.
+// The expected masks for each shift amount are specified in BitmanipMasks where
+// BitmanipMasks[log2(C2)] specifies the expected C1 value.
+// The max allowed shift amount is either XLen/2 or XLen/4 determined by whether
+// BitmanipMasks contains 6 or 5 entries assuming that the maximum possible
+// XLen is 64.
+static Optional<RISCVBitmanipPat>
+matchRISCVBitmanipPat(SDValue Op, ArrayRef<uint64_t> BitmanipMasks) {
+  assert((BitmanipMasks.size() == 5 || BitmanipMasks.size() == 6) &&
+         "Unexpected number of masks");
   Optional<uint64_t> Mask;
   // Optionally consume a mask around the shift operation.
   if (Op.getOpcode() == ISD::AND && isa<ConstantSDNode>(Op.getOperand(1))) {
@@ -2699,26 +2715,17 @@ static Optional<RISCVBitmanipPat> matchRISCVBitmanipPat(SDValue Op) {
 
   if (!isa<ConstantSDNode>(Op.getOperand(1)))
     return None;
-  auto ShAmt = Op.getConstantOperandVal(1);
+  uint64_t ShAmt = Op.getConstantOperandVal(1);
 
-  if (!isPowerOf2_64(ShAmt))
+  unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
+  if (ShAmt >= Width || !isPowerOf2_64(ShAmt))
     return None;
-
-  // These are the unshifted masks which we use to match bit-manipulation
-  // patterns. They may be shifted left in certain circumstances.
-  static const uint64_t BitmanipMasks[] = {
-      0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
-      0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
-  };
-
-  unsigned MaskIdx = Log2_64(ShAmt);
-  if (MaskIdx >= array_lengthof(BitmanipMasks))
+  // If we don't have enough masks for 64 bit, then we must be trying to
+  // match SHFL so we're only allowed to shift 1/4 of the width.
+  if (BitmanipMasks.size() == 5 && ShAmt >= (Width / 2))
     return None;
 
-  auto Src = Op.getOperand(0);
-
-  unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
-  auto ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+  SDValue Src = Op.getOperand(0);
 
   // The expected mask is shifted left when the AND is found around SHL
   // patterns.
@@ -2745,6 +2752,9 @@ static Optional<RISCVBitmanipPat> matchRISCVBitmanipPat(SDValue Op) {
     }
   }
 
+  unsigned MaskIdx = Log2_32(ShAmt);
+  uint64_t ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+
   if (SHLExpMask)
     ExpMask <<= ShAmt;
 
@@ -2754,15 +2764,38 @@ static Optional<RISCVBitmanipPat> matchRISCVBitmanipPat(SDValue Op) {
   return RISCVBitmanipPat{Src, (unsigned)ShAmt, IsSHL};
 }
 
+// Matches any of the following bit-manipulation patterns:
+//   (and (shl x, 1), (0x55555555 << 1))
+//   (and (srl x, 1), 0x55555555)
+//   (shl (and x, 0x55555555), 1)
+//   (srl (and x, (0x55555555 << 1)), 1)
+// where the shift amount and mask may vary thus:
+//   [1]  = 0x55555555 / 0xAAAAAAAA
+//   [2]  = 0x33333333 / 0xCCCCCCCC
+//   [4]  = 0x0F0F0F0F / 0xF0F0F0F0
+//   [8]  = 0x00FF00FF / 0xFF00FF00
+//   [16] = 0x0000FFFF / 0xFFFF0000
+//   [32] = 0x00000000FFFFFFFF / 0xFFFFFFFF00000000 (for RV64)
+static Optional<RISCVBitmanipPat> matchGREVIPat(SDValue Op) {
+  // These are the unshifted masks which we use to match bit-manipulation
+  // patterns. They may be shifted left in certain circumstances.
+  static const uint64_t BitmanipMasks[] = {
+      0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
+      0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
+
+  return matchRISCVBitmanipPat(Op, BitmanipMasks);
+}
+
 // Match the following pattern as a GREVI(W) operation
 //   (or (BITMANIP_SHL x), (BITMANIP_SRL x))
 static SDValue combineORToGREV(SDValue Op, SelectionDAG &DAG,
                                const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
   EVT VT = Op.getValueType();
 
   if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
-    auto LHS = matchRISCVBitmanipPat(Op.getOperand(0));
-    auto RHS = matchRISCVBitmanipPat(Op.getOperand(1));
+    auto LHS = matchGREVIPat(Op.getOperand(0));
+    auto RHS = matchGREVIPat(Op.getOperand(1));
     if (LHS && RHS && LHS->formsPairWith(*RHS)) {
       SDLoc DL(Op);
       return DAG.getNode(
@@ -2784,6 +2817,7 @@ static SDValue combineORToGREV(SDValue Op, SelectionDAG &DAG,
 // 4.  (or (rotl/rotr x, bitwidth/2), x)
 static SDValue combineORToGORC(SDValue Op, SelectionDAG &DAG,
                                const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
   EVT VT = Op.getValueType();
 
   if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
@@ -2822,14 +2856,14 @@ static SDValue combineORToGORC(SDValue Op, SelectionDAG &DAG,
       return SDValue();
     SDValue OrOp0 = Op0.getOperand(0);
     SDValue OrOp1 = Op0.getOperand(1);
-    auto LHS = matchRISCVBitmanipPat(OrOp0);
+    auto LHS = matchGREVIPat(OrOp0);
     // OR is commutable so swap the operands and try again: x might have been
     // on the left
     if (!LHS) {
       std::swap(OrOp0, OrOp1);
-      LHS = matchRISCVBitmanipPat(OrOp0);
+      LHS = matchGREVIPat(OrOp0);
     }
-    auto RHS = matchRISCVBitmanipPat(Op1);
+    auto RHS = matchGREVIPat(Op1);
     if (LHS && RHS && LHS->formsPairWith(*RHS) && LHS->Op == OrOp1) {
       return DAG.getNode(
           RISCVISD::GORCI, DL, VT, LHS->Op,
@@ -2839,6 +2873,102 @@ static SDValue combineORToGORC(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Matches any of the following bit-manipulation patterns:
+//   (and (shl x, 1), (0x22222222 << 1))
+//   (and (srl x, 1), 0x22222222)
+//   (shl (and x, 0x22222222), 1)
+//   (srl (and x, (0x22222222 << 1)), 1)
+// where the shift amount and mask may vary thus:
+//   [1]  = 0x22222222 / 0x44444444
+//   [2]  = 0x0C0C0C0C / 0x30303030
+//   [4]  = 0x00F000F0 / 0x0F000F00
+//   [8]  = 0x0000FF00 / 0x00FF0000
+//   [16] = 0x00000000FFFF0000 / 0x0000FFFF00000000 (for RV64)
+static Optional<RISCVBitmanipPat> matchSHFLPat(SDValue Op) {
+  // These are the unshifted masks which we use to match bit-manipulation
+  // patterns. They may be shifted left in certain circumstances.
+  static const uint64_t BitmanipMasks[] = {
+      0x2222222222222222ULL, 0x0C0C0C0C0C0C0C0CULL, 0x00F000F000F000F0ULL,
+      0x0000FF000000FF00ULL, 0x00000000FFFF0000ULL};
+
+  return matchRISCVBitmanipPat(Op, BitmanipMasks);
+}
+
+// Match (or (or (SHFL_SHL x), (SHFL_SHR x)), (SHFL_AND x))
+static SDValue combineORToSHFL(SDValue Op, SelectionDAG &DAG,
+                               const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
+  EVT VT = Op.getValueType();
+
+  if (VT != MVT::i32 && VT != Subtarget.getXLenVT())
+    return SDValue();
+
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+
+  // Or is commutable so canonicalize the second OR to the LHS.
+  if (Op0.getOpcode() != ISD::OR)
+    std::swap(Op0, Op1);
+  if (Op0.getOpcode() != ISD::OR)
+    return SDValue();
+
+  // We found an inner OR, so our operands are the operands of the inner OR
+  // and the other operand of the outer OR.
+  SDValue A = Op0.getOperand(0);
+  SDValue B = Op0.getOperand(1);
+  SDValue C = Op1;
+
+  auto Match1 = matchSHFLPat(A);
+  auto Match2 = matchSHFLPat(B);
+
+  // If neither matched, we failed.
+  if (!Match1 && !Match2)
+    return SDValue();
+
+  // We had at least one match. If one failed, try the remaining C operand.
+  if (!Match1) {
+    std::swap(A, C);
+    Match1 = matchSHFLPat(A);
+    if (!Match1)
+      return SDValue();
+  } else if (!Match2) {
+    std::swap(B, C);
+    Match2 = matchSHFLPat(B);
+    if (!Match2)
+      return SDValue();
+  }
+  assert(Match1 && Match2);
+
+  // Make sure our matches pair up.
+  if (!Match1->formsPairWith(*Match2))
+    return SDValue();
+
+  // All that remains is to make sure C is an AND with the same input that
+  // masks out the bits that are being shuffled.
+  if (C.getOpcode() != ISD::AND || !isa<ConstantSDNode>(C.getOperand(1)) ||
+      C.getOperand(0) != Match1->Op)
+    return SDValue();
+
+  uint64_t Mask = C.getConstantOperandVal(1);
+
+  static const uint64_t BitmanipMasks[] = {
+      0x9999999999999999ULL, 0xC3C3C3C3C3C3C3C3ULL, 0xF00FF00FF00FF00FULL,
+      0xFF0000FFFF0000FFULL, 0xFFFF00000000FFFFULL,
+  };
+
+  unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
+  unsigned MaskIdx = Log2_32(Match1->ShAmt);
+  uint64_t ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+
+  if (Mask != ExpMask)
+    return SDValue();
+
+  SDLoc DL(Op);
+  return DAG.getNode(
+      RISCVISD::SHFLI, DL, VT, Match1->Op,
+      DAG.getTargetConstant(Match1->ShAmt, DL, Subtarget.getXLenVT()));
+}
+
 // Combine (GREVI (GREVI x, C2), C1) -> (GREVI x, C1^C2) when C1^C2 is
 // non-zero, and to x when it is. Any repeated GREVI stage undoes itself.
 // Combine (GORCI (GORCI x, C2), C1) -> (GORCI x, C1|C2). Repeated stage does
@@ -3018,6 +3148,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return GREV;
     if (auto GORC = combineORToGORC(SDValue(N, 0), DCI.DAG, Subtarget))
       return GORC;
+    if (auto SHFL = combineORToSHFL(SDValue(N, 0), DCI.DAG, Subtarget))
+      return SHFL;
     break;
   case RISCVISD::SELECT_CC: {
     // Transform
@@ -3265,6 +3397,19 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
     // more precise answer could be calculated for SRAW depending on known
     // bits in the shift amount.
     return 33;
+  case RISCVISD::SHFLI: {
+    // There is no SHFLIW, but an i64 SHFLI with bit 4 of the control word
+    // cleared doesn't affect bit 31. The upper 32 bits will be shuffled, but
+    // will stay within the upper 32 bits. If there were more than 32 sign bits
+    // before, there will be at least 33 sign bits after.
+    if (Op.getValueType() == MVT::i64 &&
+        (Op.getConstantOperandVal(1) & 0x10) == 0) {
+      unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+      if (Tmp > 32)
+        return 33;
+    }
+    break;
+  }
   case RISCVISD::VMV_X_S:
     // The number of sign bits of the scalar result is computed by obtaining the
     // element type of the input vector operand, subtracting its width from the
@@ -4928,6 +5073,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(GREVIW)
   NODE_NAME_CASE(GORCI)
   NODE_NAME_CASE(GORCIW)
+  NODE_NAME_CASE(SHFLI)
   NODE_NAME_CASE(VMV_V_X_VL)
   NODE_NAME_CASE(VFMV_V_F_VL)
   NODE_NAME_CASE(VMV_X_S)

diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 8d761d26e06e..65c25e3e9719 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -88,6 +88,7 @@ enum NodeType : unsigned {
   GREVIW,
   GORCI,
   GORCIW,
+  SHFLI,
   // Vector Extension
   // VMV_V_X_VL matches the semantics of vmv.v.x but includes an extra operand
   // for the VL value to be used for the operation.

diff  --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
index c0ced3ae82ed..1b287409eecd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
@@ -671,8 +671,10 @@ def riscv_grevi    : SDNode<"RISCVISD::GREVI", SDTIntBinOp, []>;
 def riscv_greviw   : SDNode<"RISCVISD::GREVIW", SDTIntBinOp, []>;
 def riscv_gorci    : SDNode<"RISCVISD::GORCI", SDTIntBinOp, []>;
 def riscv_gorciw   : SDNode<"RISCVISD::GORCIW", SDTIntBinOp, []>;
+def riscv_shfli    : SDNode<"RISCVISD::SHFLI", SDTIntBinOp, []>;
 
 let Predicates = [HasStdExtZbp] in {
+def : Pat<(riscv_shfli GPR:$rs1, timm:$shamt), (SHFLI GPR:$rs1, timm:$shamt)>;
 def : Pat<(riscv_grevi GPR:$rs1, timm:$shamt), (GREVI GPR:$rs1, timm:$shamt)>;
 def : Pat<(riscv_gorci GPR:$rs1, timm:$shamt), (GORCI GPR:$rs1, timm:$shamt)>;
 
@@ -789,48 +791,6 @@ let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
 def : Pat<(i64 (and GPR:$rs, 0xFFFF)), (ZEXTH_RV64 GPR:$rs)>;
 }
 
-let Predicates = [HasStdExtZbp, IsRV32] in {
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)),
-                  (and GPR:$rs1, (i32 0xFF0000FF))),
-              (and (srl GPR:$rs1, (i32 8)), (i32 0x0000FF00))),
-          (SHFLI GPR:$rs1, (i32 8))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 4)), (i32 0x0F000F00)),
-                  (and GPR:$rs1, (i32 0xF00FF00F))),
-              (and (srl GPR:$rs1, (i32 4)), (i32 0x00F000F0))),
-          (SHFLI GPR:$rs1, (i32 4))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 2)), (i32 0x30303030)),
-                  (and GPR:$rs1, (i32 0xC3C3C3C3))),
-              (and (srl GPR:$rs1, (i32 2)), (i32 0x0C0C0C0C))),
-          (SHFLI GPR:$rs1, (i32 2))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 1)), (i32 0x44444444)),
-                  (and GPR:$rs1, (i32 0x99999999))),
-              (and (srl GPR:$rs1, (i32 1)), (i32 0x22222222))),
-          (SHFLI GPR:$rs1, (i32 1))>;
-} // Predicates = [HasStdExtZbp, IsRV32]
-
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 16)), (i64 0x0000FFFF00000000)),
-                  (and GPR:$rs1, (i64 0xFFFF00000000FFFF))),
-              (and (srl GPR:$rs1, (i64 16)), (i64 0x00000000FFFF0000))),
-          (SHFLI GPR:$rs1, (i64 16))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 8)), (i64 0x00FF000000FF0000)),
-                  (and GPR:$rs1, (i64 0xFF0000FFFF0000FF))),
-              (and (srl GPR:$rs1, (i64 8)), (i64 0x0000FF000000FF00))),
-          (SHFLI GPR:$rs1, (i64 8))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 4)), (i64 0x0F000F000F000F00)),
-                  (and GPR:$rs1, (i64 0xF00FF00FF00FF00F))),
-              (and (srl GPR:$rs1, (i64 4)), (i64 0x00F000F000F000F0))),
-          (SHFLI GPR:$rs1, (i64 4))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 2)), (i64 0x3030303030303030)),
-                  (and GPR:$rs1, (i64 0xC3C3C3C3C3C3C3C3))),
-              (and (srl GPR:$rs1, (i64 2)), (i64 0x0C0C0C0C0C0C0C0C))),
-          (SHFLI GPR:$rs1, (i64 2))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)),
-                  (and GPR:$rs1, (i64 0x9999999999999999))),
-              (and (srl GPR:$rs1, (i64 1)), (i64 0x2222222222222222))),
-          (SHFLI GPR:$rs1, (i64 1))>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
 let Predicates = [HasStdExtZba] in {
 def : Pat<(add (shl GPR:$rs1, (XLenVT 1)), GPR:$rs2),
           (SH1ADD GPR:$rs1, GPR:$rs2)>;

diff  --git a/llvm/test/CodeGen/RISCV/rv64Zbp.ll b/llvm/test/CodeGen/RISCV/rv64Zbp.ll
index 2cb2fd957d8c..a7af0e023d72 100644
--- a/llvm/test/CodeGen/RISCV/rv64Zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv64Zbp.ll
@@ -3430,36 +3430,12 @@ define signext i32 @shfl1_i32(i32 signext %a, i32 signext %b) nounwind {
 ;
 ; RV64IB-LABEL: shfl1_i32:
 ; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    lui a1, 629146
-; RV64IB-NEXT:    addiw a1, a1, -1639
-; RV64IB-NEXT:    and a1, a0, a1
-; RV64IB-NEXT:    slli a2, a0, 1
-; RV64IB-NEXT:    lui a3, 279620
-; RV64IB-NEXT:    addiw a3, a3, 1092
-; RV64IB-NEXT:    and a2, a2, a3
-; RV64IB-NEXT:    or a1, a2, a1
-; RV64IB-NEXT:    srli a0, a0, 1
-; RV64IB-NEXT:    lui a2, 139810
-; RV64IB-NEXT:    addiw a2, a2, 546
-; RV64IB-NEXT:    and a0, a0, a2
-; RV64IB-NEXT:    or a0, a1, a0
+; RV64IB-NEXT:    zip.n a0, a0
 ; RV64IB-NEXT:    ret
 ;
 ; RV64IBP-LABEL: shfl1_i32:
 ; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    lui a1, 629146
-; RV64IBP-NEXT:    addiw a1, a1, -1639
-; RV64IBP-NEXT:    and a1, a0, a1
-; RV64IBP-NEXT:    slli a2, a0, 1
-; RV64IBP-NEXT:    lui a3, 279620
-; RV64IBP-NEXT:    addiw a3, a3, 1092
-; RV64IBP-NEXT:    and a2, a2, a3
-; RV64IBP-NEXT:    or a1, a2, a1
-; RV64IBP-NEXT:    srli a0, a0, 1
-; RV64IBP-NEXT:    lui a2, 139810
-; RV64IBP-NEXT:    addiw a2, a2, 546
-; RV64IBP-NEXT:    and a0, a0, a2
-; RV64IBP-NEXT:    or a0, a1, a0
+; RV64IBP-NEXT:    zip.n a0, a0
 ; RV64IBP-NEXT:    ret
   %and = and i32 %a, -1717986919
   %shl = shl i32 %a, 1
@@ -3540,36 +3516,12 @@ define signext i32 @shfl2_i32(i32 signext %a, i32 signext %b) nounwind {
 ;
 ; RV64IB-LABEL: shfl2_i32:
 ; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    lui a1, 801852
-; RV64IB-NEXT:    addiw a1, a1, 963
-; RV64IB-NEXT:    and a1, a0, a1
-; RV64IB-NEXT:    slli a2, a0, 2
-; RV64IB-NEXT:    lui a3, 197379
-; RV64IB-NEXT:    addiw a3, a3, 48
-; RV64IB-NEXT:    and a2, a2, a3
-; RV64IB-NEXT:    or a1, a2, a1
-; RV64IB-NEXT:    srli a0, a0, 2
-; RV64IB-NEXT:    lui a2, 49345
-; RV64IB-NEXT:    addiw a2, a2, -1012
-; RV64IB-NEXT:    and a0, a0, a2
-; RV64IB-NEXT:    or a0, a0, a1
+; RV64IB-NEXT:    zip2.b a0, a0
 ; RV64IB-NEXT:    ret
 ;
 ; RV64IBP-LABEL: shfl2_i32:
 ; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    lui a1, 801852
-; RV64IBP-NEXT:    addiw a1, a1, 963
-; RV64IBP-NEXT:    and a1, a0, a1
-; RV64IBP-NEXT:    slli a2, a0, 2
-; RV64IBP-NEXT:    lui a3, 197379
-; RV64IBP-NEXT:    addiw a3, a3, 48
-; RV64IBP-NEXT:    and a2, a2, a3
-; RV64IBP-NEXT:    or a1, a2, a1
-; RV64IBP-NEXT:    srli a0, a0, 2
-; RV64IBP-NEXT:    lui a2, 49345
-; RV64IBP-NEXT:    addiw a2, a2, -1012
-; RV64IBP-NEXT:    and a0, a0, a2
-; RV64IBP-NEXT:    or a0, a0, a1
+; RV64IBP-NEXT:    zip2.b a0, a0
 ; RV64IBP-NEXT:    ret
   %and = and i32 %a, -1010580541
   %shl = shl i32 %a, 2
@@ -3652,36 +3604,12 @@ define signext i32 @shfl4_i32(i32 signext %a, i32 signext %b) nounwind {
 ;
 ; RV64IB-LABEL: shfl4_i32:
 ; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    lui a1, 983295
-; RV64IB-NEXT:    addiw a1, a1, 15
-; RV64IB-NEXT:    and a1, a0, a1
-; RV64IB-NEXT:    slli a2, a0, 4
-; RV64IB-NEXT:    lui a3, 61441
-; RV64IB-NEXT:    addiw a3, a3, -256
-; RV64IB-NEXT:    and a2, a2, a3
-; RV64IB-NEXT:    srli a0, a0, 4
-; RV64IB-NEXT:    lui a3, 3840
-; RV64IB-NEXT:    addiw a3, a3, 240
-; RV64IB-NEXT:    and a0, a0, a3
-; RV64IB-NEXT:    or a0, a0, a1
-; RV64IB-NEXT:    or a0, a0, a2
+; RV64IB-NEXT:    zip4.h a0, a0
 ; RV64IB-NEXT:    ret
 ;
 ; RV64IBP-LABEL: shfl4_i32:
 ; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    lui a1, 983295
-; RV64IBP-NEXT:    addiw a1, a1, 15
-; RV64IBP-NEXT:    and a1, a0, a1
-; RV64IBP-NEXT:    slli a2, a0, 4
-; RV64IBP-NEXT:    lui a3, 61441
-; RV64IBP-NEXT:    addiw a3, a3, -256
-; RV64IBP-NEXT:    and a2, a2, a3
-; RV64IBP-NEXT:    srli a0, a0, 4
-; RV64IBP-NEXT:    lui a3, 3840
-; RV64IBP-NEXT:    addiw a3, a3, 240
-; RV64IBP-NEXT:    and a0, a0, a3
-; RV64IBP-NEXT:    or a0, a0, a1
-; RV64IBP-NEXT:    or a0, a0, a2
+; RV64IBP-NEXT:    zip4.h a0, a0
 ; RV64IBP-NEXT:    ret
   %and = and i32 %a, -267390961
   %shl = shl i32 %a, 4
@@ -3761,34 +3689,12 @@ define signext i32 @shfl8_i32(i32 signext %a, i32 signext %b) nounwind {
 ;
 ; RV64IB-LABEL: shfl8_i32:
 ; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    lui a1, 1044480
-; RV64IB-NEXT:    addiw a1, a1, 255
-; RV64IB-NEXT:    and a1, a0, a1
-; RV64IB-NEXT:    slli a2, a0, 8
-; RV64IB-NEXT:    lui a3, 4080
-; RV64IB-NEXT:    and a2, a2, a3
-; RV64IB-NEXT:    srli a0, a0, 8
-; RV64IB-NEXT:    lui a3, 16
-; RV64IB-NEXT:    addiw a3, a3, -256
-; RV64IB-NEXT:    and a0, a0, a3
-; RV64IB-NEXT:    or a0, a1, a0
-; RV64IB-NEXT:    or a0, a0, a2
+; RV64IB-NEXT:    zip8.w a0, a0
 ; RV64IB-NEXT:    ret
 ;
 ; RV64IBP-LABEL: shfl8_i32:
 ; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    lui a1, 1044480
-; RV64IBP-NEXT:    addiw a1, a1, 255
-; RV64IBP-NEXT:    and a1, a0, a1
-; RV64IBP-NEXT:    slli a2, a0, 8
-; RV64IBP-NEXT:    lui a3, 4080
-; RV64IBP-NEXT:    and a2, a2, a3
-; RV64IBP-NEXT:    srli a0, a0, 8
-; RV64IBP-NEXT:    lui a3, 16
-; RV64IBP-NEXT:    addiw a3, a3, -256
-; RV64IBP-NEXT:    and a0, a0, a3
-; RV64IBP-NEXT:    or a0, a1, a0
-; RV64IBP-NEXT:    or a0, a0, a2
+; RV64IBP-NEXT:    zip8.w a0, a0
 ; RV64IBP-NEXT:    ret
   %and = and i32 %a, -16776961
   %shl = shl i32 %a, 8

More information about the llvm-commits mailing list