[llvm] [LoongArch] Optimize vector bitreverse using scalar bitrev and vshuf4i (PR #118054)

via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 9 03:54:06 PST 2024


https://github.com/zhaoqi5 updated https://github.com/llvm/llvm-project/pull/118054

>From 465803bf93120b968340a2d3b5e61d2e4983eba6 Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Fri, 29 Nov 2024 13:15:51 +0800
Subject: [PATCH 1/2] [LoongArch] Pre-commit tests for vector type
 llvm.bitreverse. NFC

A later commit will optimize the code generated for these vector
bitreverse operations.
---
 .../test/CodeGen/LoongArch/lasx/bitreverse.ll | 107 ++++++++++++++++++
 llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll | 101 +++++++++++++++++
 2 files changed, 208 insertions(+)
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll

diff --git a/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
new file mode 100644
index 00000000000000..3d0d232fcca687
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+lasx --verify-machineinstrs < %s \
+; RUN:   | FileCheck %s
+
+declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
+
+define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvslli.b $xr1, $xr0, 4
+; CHECK-NEXT:    xvsrli.b $xr0, $xr0, 4
+; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvandi.b $xr1, $xr0, 51
+; CHECK-NEXT:    xvslli.b $xr1, $xr1, 2
+; CHECK-NEXT:    xvsrli.b $xr0, $xr0, 2
+; CHECK-NEXT:    xvandi.b $xr0, $xr0, 51
+; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvandi.b $xr1, $xr0, 85
+; CHECK-NEXT:    xvslli.b $xr1, $xr1, 1
+; CHECK-NEXT:    xvsrli.b $xr0, $xr0, 1
+; CHECK-NEXT:    xvandi.b $xr0, $xr0, 85
+; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    ret
+  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+  ret <32 x i8> %b
+}
+
+declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
+
+define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 177
+; CHECK-NEXT:    xvsrli.h $xr1, $xr0, 4
+; CHECK-NEXT:    xvrepli.b $xr2, 15
+; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
+; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvslli.h $xr0, $xr0, 4
+; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvsrli.h $xr1, $xr0, 2
+; CHECK-NEXT:    xvrepli.b $xr2, 51
+; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
+; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvslli.h $xr0, $xr0, 2
+; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvsrli.h $xr1, $xr0, 1
+; CHECK-NEXT:    xvrepli.b $xr2, 85
+; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
+; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvslli.h $xr0, $xr0, 1
+; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+  ret <16 x i16> %b
+}
+
+declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
+
+define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 27
+; CHECK-NEXT:    xvsrli.w $xr1, $xr0, 4
+; CHECK-NEXT:    xvrepli.b $xr2, 15
+; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
+; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvslli.w $xr0, $xr0, 4
+; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvsrli.w $xr1, $xr0, 2
+; CHECK-NEXT:    xvrepli.b $xr2, 51
+; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
+; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvslli.w $xr0, $xr0, 2
+; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvsrli.w $xr1, $xr0, 1
+; CHECK-NEXT:    xvrepli.b $xr2, 85
+; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
+; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvslli.w $xr0, $xr0, 1
+; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+  ret <8 x i32> %b
+}
+
+declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>)
+
+define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 0
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 1
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 2
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 2
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
+; CHECK-NEXT:    ret
+  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+  ret <4 x i64> %b
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll b/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll
new file mode 100644
index 00000000000000..93624c8dd6a965
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+lsx --verify-machineinstrs < %s \
+; RUN:   | FileCheck %s
+
+declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
+
+define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vslli.b $vr1, $vr0, 4
+; CHECK-NEXT:    vsrli.b $vr0, $vr0, 4
+; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vandi.b $vr1, $vr0, 51
+; CHECK-NEXT:    vslli.b $vr1, $vr1, 2
+; CHECK-NEXT:    vsrli.b $vr0, $vr0, 2
+; CHECK-NEXT:    vandi.b $vr0, $vr0, 51
+; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vandi.b $vr1, $vr0, 85
+; CHECK-NEXT:    vslli.b $vr1, $vr1, 1
+; CHECK-NEXT:    vsrli.b $vr0, $vr0, 1
+; CHECK-NEXT:    vandi.b $vr0, $vr0, 85
+; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    ret
+  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+  ret <16 x i8> %b
+}
+
+declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)
+
+define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 177
+; CHECK-NEXT:    vsrli.h $vr1, $vr0, 4
+; CHECK-NEXT:    vrepli.b $vr2, 15
+; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
+; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vslli.h $vr0, $vr0, 4
+; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
+; CHECK-NEXT:    vsrli.h $vr1, $vr0, 2
+; CHECK-NEXT:    vrepli.b $vr2, 51
+; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
+; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vslli.h $vr0, $vr0, 2
+; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
+; CHECK-NEXT:    vsrli.h $vr1, $vr0, 1
+; CHECK-NEXT:    vrepli.b $vr2, 85
+; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
+; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vslli.h $vr0, $vr0, 1
+; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+  ret <8 x i16> %b
+}
+
+declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
+
+define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 27
+; CHECK-NEXT:    vsrli.w $vr1, $vr0, 4
+; CHECK-NEXT:    vrepli.b $vr2, 15
+; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
+; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 4
+; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
+; CHECK-NEXT:    vsrli.w $vr1, $vr0, 2
+; CHECK-NEXT:    vrepli.b $vr2, 51
+; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
+; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 2
+; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
+; CHECK-NEXT:    vsrli.w $vr1, $vr0, 1
+; CHECK-NEXT:    vrepli.b $vr2, 85
+; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
+; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 1
+; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+  ret <4 x i32> %b
+}
+
+declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>)
+
+define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 0
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    ret
+  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+  ret <2 x i64> %b
+}

>From f3065bddf7a325f1bbaefd9c1895a24669e3beda Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Fri, 29 Nov 2024 13:19:41 +0800
Subject: [PATCH 2/2] [LoongArch] Optimize vector bitreverse using scalar
 bitrev and vshuf4i

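Custom-lower vector `bitreverse` by bit-reversing each 64-bit chunk with
the scalar `bitrev.8b` (i8 elements) or `bitrev.d` (i16/i32 elements)
instruction; for i16/i32 elements a `vshuf4i` then restores the element
order within each chunk.

Keep `v2i64` and `v4i64` bitreverse as `Expand`: the default expansion
already emits one scalar `bitrev.d` per element, which is good enough.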
---
 .../LoongArch/LoongArchISelLowering.cpp       | 51 +++++++++++
 .../Target/LoongArch/LoongArchISelLowering.h  |  2 +
 .../Target/LoongArch/LoongArchInstrInfo.td    |  2 +
 .../test/CodeGen/LoongArch/lasx/bitreverse.ll | 90 ++++++++-----------
 llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll | 72 +++++----------
 5 files changed, 115 insertions(+), 102 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 16bceacfaa222c..e10f122b381219 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -270,6 +270,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
           {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
           Expand);
     }
+    for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
     for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64})
       setOperationAction(ISD::BSWAP, VT, Legal);
     for (MVT VT : {MVT::v4i32, MVT::v2i64}) {
@@ -324,6 +326,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
           {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
           Expand);
     }
+    for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32})
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
     for (MVT VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64})
       setOperationAction(ISD::BSWAP, VT, Legal);
     for (MVT VT : {MVT::v8i32, MVT::v4i32, MVT::v4i64}) {
@@ -440,10 +444,56 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
     return lowerBUILD_VECTOR(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
     return lowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::BITREVERSE:
+    return lowerBITREVERSE(Op, DAG);
   }
   return SDValue();
 }
 
+SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  EVT ResTy = Op->getValueType(0);
+  SDValue Src = Op->getOperand(0);
+  SDLoc DL(Op);
+
+  EVT NewVT = ResTy.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+  unsigned int OrigEltNum = ResTy.getVectorNumElements();
+  unsigned int NewEltNum = NewVT.getVectorNumElements();
+
+  SDValue NewSrc = DAG.getNode(ISD::BITCAST, DL, NewVT, Src);
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned int i = 0; i < NewEltNum; i++) {
+    SDValue Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, NewSrc,
+                             DAG.getConstant(i, DL, MVT::i64));
+    SDValue RevOp = DAG.getNode((ResTy == MVT::v16i8 || ResTy == MVT::v32i8)
+                                    ? LoongArchISD::BITREV_8B
+                                    : ISD::BITREVERSE,
+                                DL, MVT::i64, Op);
+    Ops.push_back(RevOp);
+  }
+  SDValue Res =
+      DAG.getNode(ISD::BITCAST, DL, ResTy, DAG.getBuildVector(NewVT, DL, Ops));
+
+  switch (ResTy.getSimpleVT().SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::v16i8:
+  case MVT::v32i8:
+    return Res;
+  case MVT::v8i16:
+  case MVT::v16i16:
+  case MVT::v4i32:
+  case MVT::v8i32: {
+    SmallVector<int, 32> Mask;
+    for (unsigned int i = 0; i < NewEltNum; i++)
+      for (int j = OrigEltNum / NewEltNum - 1; j >= 0; j--)
+        Mask.push_back(j + (OrigEltNum / NewEltNum) * i);
+    return DAG.getVectorShuffle(ResTy, DL, Res, DAG.getUNDEF(ResTy), Mask);
+  }
+  }
+}
+
 /// Determine whether a range fits a regular pattern of values.
 /// This function accounts for the possibility of jumping over the End iterator.
 template <typename ValType>
@@ -4680,6 +4730,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(REVB_2H)
     NODE_NAME_CASE(REVB_2W)
     NODE_NAME_CASE(BITREV_4B)
+    NODE_NAME_CASE(BITREV_8B)
     NODE_NAME_CASE(BITREV_W)
     NODE_NAME_CASE(ROTR_W)
     NODE_NAME_CASE(ROTL_W)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 605093b01476d0..a3bcc7599efc3c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -68,6 +68,7 @@ enum NodeType : unsigned {
   REVB_2H,
   REVB_2W,
   BITREV_4B,
+  BITREV_8B,
   BITREV_W,
 
   // Intrinsic operations start ============================================
@@ -334,6 +335,7 @@ class LoongArchTargetLowering : public TargetLowering {
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerBITREVERSE(SDValue Op, SelectionDAG &DAG) const;
 
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 6134daf2fbe630..2101aa058305fe 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -112,6 +112,7 @@ def loongarch_bstrpick
 def loongarch_revb_2h : SDNode<"LoongArchISD::REVB_2H", SDTUnaryOp>;
 def loongarch_revb_2w : SDNode<"LoongArchISD::REVB_2W", SDTUnaryOp>;
 def loongarch_bitrev_4b : SDNode<"LoongArchISD::BITREV_4B", SDTUnaryOp>;
+def loongarch_bitrev_8b : SDNode<"LoongArchISD::BITREV_8B", SDTUnaryOp>;
 def loongarch_bitrev_w : SDNode<"LoongArchISD::BITREV_W", SDTUnaryOp>;
 def loongarch_clzw : SDNode<"LoongArchISD::CLZ_W", SDTIntBitCountUnaryOp>;
 def loongarch_ctzw : SDNode<"LoongArchISD::CTZ_W", SDTIntBitCountUnaryOp>;
@@ -1765,6 +1766,7 @@ def : Pat<(bitreverse (bswap GPR:$rj)), (BITREV_4B GPR:$rj)>;
 let Predicates = [IsLA64] in {
 def : Pat<(loongarch_revb_2w GPR:$rj), (REVB_2W GPR:$rj)>;
 def : Pat<(bswap GPR:$rj), (REVB_D GPR:$rj)>;
+def : Pat<(loongarch_bitrev_8b GPR:$rj), (BITREV_8B GPR:$rj)>;
 def : Pat<(loongarch_bitrev_w GPR:$rj), (BITREV_W GPR:$rj)>;
 def : Pat<(bitreverse GPR:$rj), (BITREV_D GPR:$rj)>;
 def : Pat<(bswap (bitreverse GPR:$rj)), (BITREV_8B GPR:$rj)>;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
index 3d0d232fcca687..11f1bce55fad62 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
@@ -7,19 +7,19 @@ declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
 define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
 ; CHECK-LABEL: test_bitreverse_v32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvslli.b $xr1, $xr0, 4
-; CHECK-NEXT:    xvsrli.b $xr0, $xr0, 4
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvandi.b $xr1, $xr0, 51
-; CHECK-NEXT:    xvslli.b $xr1, $xr1, 2
-; CHECK-NEXT:    xvsrli.b $xr0, $xr0, 2
-; CHECK-NEXT:    xvandi.b $xr0, $xr0, 51
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvandi.b $xr1, $xr0, 85
-; CHECK-NEXT:    xvslli.b $xr1, $xr1, 1
-; CHECK-NEXT:    xvsrli.b $xr0, $xr0, 1
-; CHECK-NEXT:    xvandi.b $xr0, $xr0, 85
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT:    bitrev.8b $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 0
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT:    bitrev.8b $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 1
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 2
+; CHECK-NEXT:    bitrev.8b $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 2
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
+; CHECK-NEXT:    bitrev.8b $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
 ; CHECK-NEXT:    ret
   %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
   ret <32 x i8> %b
@@ -30,25 +30,19 @@ declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
 define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
 ; CHECK-LABEL: test_bitreverse_v16i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 177
-; CHECK-NEXT:    xvsrli.h $xr1, $xr0, 4
-; CHECK-NEXT:    xvrepli.b $xr2, 15
-; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
-; CHECK-NEXT:    xvslli.h $xr0, $xr0, 4
-; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvsrli.h $xr1, $xr0, 2
-; CHECK-NEXT:    xvrepli.b $xr2, 51
-; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
-; CHECK-NEXT:    xvslli.h $xr0, $xr0, 2
-; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvsrli.h $xr1, $xr0, 1
-; CHECK-NEXT:    xvrepli.b $xr2, 85
-; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
-; CHECK-NEXT:    xvslli.h $xr0, $xr0, 1
-; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 0
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 1
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 2
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 2
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT:    xvshuf4i.h $xr0, $xr1, 27
 ; CHECK-NEXT:    ret
   %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
   ret <16 x i16> %b
@@ -59,25 +53,19 @@ declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
 define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
 ; CHECK-LABEL: test_bitreverse_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 27
-; CHECK-NEXT:    xvsrli.w $xr1, $xr0, 4
-; CHECK-NEXT:    xvrepli.b $xr2, 15
-; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
-; CHECK-NEXT:    xvslli.w $xr0, $xr0, 4
-; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvsrli.w $xr1, $xr0, 2
-; CHECK-NEXT:    xvrepli.b $xr2, 51
-; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
-; CHECK-NEXT:    xvslli.w $xr0, $xr0, 2
-; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
-; CHECK-NEXT:    xvsrli.w $xr1, $xr0, 1
-; CHECK-NEXT:    xvrepli.b $xr2, 85
-; CHECK-NEXT:    xvand.v $xr1, $xr1, $xr2
-; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr2
-; CHECK-NEXT:    xvslli.w $xr0, $xr0, 1
-; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 0
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 1
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 2
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 2
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT:    xvshuf4i.w $xr0, $xr1, 177
 ; CHECK-NEXT:    ret
   %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
   ret <8 x i32> %b
diff --git a/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll b/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll
index 93624c8dd6a965..4c17d3fd8d7b2e 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll
@@ -7,19 +7,13 @@ declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
 define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: test_bitreverse_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vslli.b $vr1, $vr0, 4
-; CHECK-NEXT:    vsrli.b $vr0, $vr0, 4
-; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
-; CHECK-NEXT:    vandi.b $vr1, $vr0, 51
-; CHECK-NEXT:    vslli.b $vr1, $vr1, 2
-; CHECK-NEXT:    vsrli.b $vr0, $vr0, 2
-; CHECK-NEXT:    vandi.b $vr0, $vr0, 51
-; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
-; CHECK-NEXT:    vandi.b $vr1, $vr0, 85
-; CHECK-NEXT:    vslli.b $vr1, $vr1, 1
-; CHECK-NEXT:    vsrli.b $vr0, $vr0, 1
-; CHECK-NEXT:    vandi.b $vr0, $vr0, 85
-; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 0
+; CHECK-NEXT:    bitrev.8b $a0, $a0
+; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
+; CHECK-NEXT:    bitrev.8b $a0, $a0
+; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT:    vori.b $vr0, $vr1, 0
 ; CHECK-NEXT:    ret
   %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
   ret <16 x i8> %b
@@ -30,25 +24,13 @@ declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)
 define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: test_bitreverse_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 177
-; CHECK-NEXT:    vsrli.h $vr1, $vr0, 4
-; CHECK-NEXT:    vrepli.b $vr2, 15
-; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
-; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
-; CHECK-NEXT:    vslli.h $vr0, $vr0, 4
-; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
-; CHECK-NEXT:    vsrli.h $vr1, $vr0, 2
-; CHECK-NEXT:    vrepli.b $vr2, 51
-; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
-; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
-; CHECK-NEXT:    vslli.h $vr0, $vr0, 2
-; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
-; CHECK-NEXT:    vsrli.h $vr1, $vr0, 1
-; CHECK-NEXT:    vrepli.b $vr2, 85
-; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
-; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
-; CHECK-NEXT:    vslli.h $vr0, $vr0, 1
-; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 0
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr1, 27
 ; CHECK-NEXT:    ret
   %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
   ret <8 x i16> %b
@@ -59,25 +41,13 @@ declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
 define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: test_bitreverse_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 27
-; CHECK-NEXT:    vsrli.w $vr1, $vr0, 4
-; CHECK-NEXT:    vrepli.b $vr2, 15
-; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
-; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
-; CHECK-NEXT:    vslli.w $vr0, $vr0, 4
-; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
-; CHECK-NEXT:    vsrli.w $vr1, $vr0, 2
-; CHECK-NEXT:    vrepli.b $vr2, 51
-; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
-; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
-; CHECK-NEXT:    vslli.w $vr0, $vr0, 2
-; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
-; CHECK-NEXT:    vsrli.w $vr1, $vr0, 1
-; CHECK-NEXT:    vrepli.b $vr2, 85
-; CHECK-NEXT:    vand.v $vr1, $vr1, $vr2
-; CHECK-NEXT:    vand.v $vr0, $vr0, $vr2
-; CHECK-NEXT:    vslli.w $vr0, $vr0, 1
-; CHECK-NEXT:    vor.v $vr0, $vr1, $vr0
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 0
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
+; CHECK-NEXT:    bitrev.d $a0, $a0
+; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr1, 177
 ; CHECK-NEXT:    ret
   %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
   ret <4 x i32> %b



More information about the llvm-commits mailing list