[llvm-branch-commits] [RISCV] Support memcmp expansion for vectors (PR #114517)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Nov 1 00:03:06 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes:



---

Patch is 404.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114517.diff


4 Files Affected:

- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+100-3) 
- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (+5) 
- (modified) llvm/test/CodeGen/RISCV/memcmp-optsize.ll (+920-530) 
- (modified) llvm/test/CodeGen/RISCV/memcmp.ll (+4570-1843) 


``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3b3f8772a08940..89b4f22a1260db 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -14474,17 +14475,116 @@ static bool narrowIndex(SDValue &N, ISD::MemIndexType IndexType, SelectionDAG &D
   return true;
 }
 
+/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
+/// recognizable memcmp expansion.
+static bool isOrXorXorTree(SDValue X, bool Root = true) {
+  if (X.getOpcode() == ISD::OR)
+    return isOrXorXorTree(X.getOperand(0), false) &&
+           isOrXorXorTree(X.getOperand(1), false);
+  if (Root)
+    return false;
+  return X.getOpcode() == ISD::XOR;
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
+/// expansion.
+static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
+                                EVT VecVT, EVT CmpVT) {
+  SDValue Op0 = X.getOperand(0);
+  SDValue Op1 = X.getOperand(1);
+  if (X.getOpcode() == ISD::OR) {
+    SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT);
+    SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT);
+    if (VecVT != CmpVT)
+      return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
+    return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
+  }
+  if (X.getOpcode() == ISD::XOR) {
+    SDValue A = DAG.getBitcast(VecVT, Op0);
+    SDValue B = DAG.getBitcast(VecVT, Op1);
+    if (VecVT != CmpVT)
+      return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
+    return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+  }
+  llvm_unreachable("Impossible");
+}
+
+/// Try to map a 128-bit or larger integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue
+combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
+                                const SDLoc &DL, SelectionDAG &DAG,
+                                const RISCVSubtarget &Subtarget) {
+  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
+
+  EVT OpVT = X.getValueType();
+  MVT XLenVT = Subtarget.getXLenVT();
+  unsigned OpSize = OpVT.getSizeInBits();
+
+  // We're looking for an oversized integer equality comparison.
+  if (!Subtarget.hasVInstructions() || !OpVT.isScalarInteger() ||
+      OpSize < Subtarget.getRealMinVLen() ||
+      OpSize > Subtarget.getRealMinVLen() * 8)
+    return SDValue();
+
+  bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
+  if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
+    return SDValue();
+
+  // Don't perform this combine if constructing the vector will be expensive.
+  auto IsVectorBitCastCheap = [](SDValue X) {
+    X = peekThroughBitcasts(X);
+    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+           X.getOpcode() == ISD::LOAD;
+  };
+  if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
+      !IsOrXorXorTreeCCZero)
+    return SDValue();
+
+  bool NoImplicitFloatOps =
+      DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat);
+  if (!NoImplicitFloatOps && Subtarget.hasVInstructions()) {
+    unsigned VecSize = OpSize / 8;
+    EVT VecVT = MVT::getVectorVT(MVT::i8, VecSize);
+    EVT CmpVT = MVT::getVectorVT(MVT::i1, VecSize);
+
+    SDValue Cmp;
+    if (IsOrXorXorTreeCCZero) {
+      Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT);
+    } else {
+      SDValue VecX = DAG.getBitcast(VecVT, X);
+      SDValue VecY = DAG.getBitcast(VecVT, Y);
+      Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+    }
+    return DAG.getSetCC(DL, VT,
+                        DAG.getNode(ISD::VECREDUCE_AND, DL, XLenVT, Cmp),
+                        DAG.getConstant(0, DL, XLenVT), CC);
+  }
+
+  return SDValue();
+}
+
 // Replace (seteq (i64 (and X, 0xffffffff)), C1) with
 // (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
 // bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
 // can become a sext.w instead of a shift pair.
 static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
+  SDLoc dl(N);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
+  // Looking for an equality compare.
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (Cond == ISD::SETNE || Cond == ISD::SETEQ) {
+    if (SDValue V = combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG,
+                                                    Subtarget))
+      return V;
+  }
+
   if (OpVT != MVT::i64 || !Subtarget.is64Bit())
     return SDValue();
 
@@ -14499,8 +14599,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
       N0.getConstantOperandVal(1) != UINT64_C(0xffffffff))
     return SDValue();
 
-  // Looking for an equality compare.
-  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
   if (!isIntEqualitySetCC(Cond))
     return SDValue();
 
@@ -14512,7 +14610,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
 
   const APInt &C1 = N1C->getAPIntValue();
 
-  SDLoc dl(N);
   // If the constant is larger than 2^32 - 1 it is impossible for both sides
   // to be equal.
   if (C1.getActiveBits() > 32)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5f5a18e2868730..d7b05001185f32 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2504,5 +2504,10 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
     Options.LoadSizes = {8, 4, 2, 1};
   else
     Options.LoadSizes = {4, 2, 1};
+  if (IsZeroCmp && ST->hasVInstructions()) {
+    unsigned RealMinVLen = ST->getRealMinVLen() / 8;
+    for (int LMUL = 1; LMUL <= 8; LMUL *= 2)
+      Options.LoadSizes.insert(Options.LoadSizes.begin(), RealMinVLen * LMUL);
+  }
   return Options;
 }
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index 06fb88b02ea4a6..ba702b4921f098 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -2910,190 +2910,24 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_16:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a4, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 3(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a5, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a6, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 4(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 5(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 6(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 7(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a6, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 4(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 5(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a3, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 6(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 7(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a7, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a4, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 8(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 9(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a3, a3, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 10(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 11(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a7, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 8(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 9(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a4, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 10(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu t0, 11(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a6, a7, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli t0, t0, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, t0, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, a5, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 12(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 13(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a4, a4, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 14(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 15(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a6, a7, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 12(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 13(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 14(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 15(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, a7, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a4, a0
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a2, a0
-; CHECK-ALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-ALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-ALIGNED-RV32-V-NEXT:    vmnot.m v8, v8
+; CHECK-ALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_16:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 4(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 5(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a4, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 6(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 7(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a5, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a6, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 32
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a3, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 3(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a6, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 4(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 5(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a3, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 6(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a7, 7(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a7, a7, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a7, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a4, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 32
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 8(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 9(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 10(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 11(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a6, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 12(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 13(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a3, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 14(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a0, 15(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 32
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 8(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 9(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 10(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 11(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a6, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 12(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 13(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a3, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 14(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 15(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 32
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a2, a0
-; CHECK-ALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-ALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-ALIGNED-RV64-V-NEXT:    vmnot.m v8, v8
+; CHECK-ALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
+; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_16:
@@ -3194,34 +3028,24 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmnot.m v8, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmnot.m v8, v8
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 16)
@@ -3229,15 +3053,15 @@ entry:
 }
 
 define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_31:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 31
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -3249,6 +3073,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK...
[truncated]

``````````
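For context when skimming the truncated diff: the sketch below is not part of the patch. It is a hypothetical C++ snippet (the function name `blocks_equal_16` is invented here) showing the kind of equality-only comparison that the new `enableMemCmpExpansion` load sizes plus the `combineVectorSizedSetCCEquality` DAG combine lower to the `vle8.v`/`vmseq.vv`/`vcpop.m` sequence visible in the updated `bcmp_size_16` checks, assuming the V extension with VLEN >= 128.

```cpp
// Hypothetical example, not taken from the PR: a fixed-size, equality-only
// comparison. On an rv64gcv-style target (VLEN >= 128), ExpandMemCmp can now
// pick a 16-byte load size for the zero-compare case, and the resulting wide
// integer SETCC is mapped to vector instructions by the new combine instead
// of being split into a chain of scalar loads, xors and ors.
#include <cstring>

bool blocks_equal_16(const void *a, const void *b) {
  return std::memcmp(a, b, 16) == 0; // only the ==/!= result is used
}
```

With VLEN = 128, the LMUL loop in the TTI hook advertises 16-, 32-, 64- and 128-byte load sizes, so larger constant-size zero-compares should get the same treatment using grouped registers, up to the `OpSize > getRealMinVLen() * 8` limit in the combine.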



https://github.com/llvm/llvm-project/pull/114517


More information about the llvm-branch-commits mailing list