[llvm] [LoongArch] Support bswap for LSX/LASX VTs (PR #114171)

WÁNG Xuěruì via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 29 20:18:03 PDT 2024


https://github.com/xen0n created https://github.com/llvm/llvm-project/pull/114171

This PR is stacked on top of #114170.

>From 8fe233f0d304f515cab1531480c19e5c4111012a Mon Sep 17 00:00:00 2001
From: WANG Xuerui <git at xen0n.name>
Date: Wed, 30 Oct 2024 10:55:04 +0800
Subject: [PATCH 1/2] [LoongArch][NFC] Pre-commit tests for LSX/LASX bswap
 codegen

---
 llvm/test/CodeGen/LoongArch/lasx/bswap.ll | 86 +++++++++++++++++++++++
 llvm/test/CodeGen/LoongArch/lsx/bswap.ll  | 86 +++++++++++++++++++++++
 2 files changed, 172 insertions(+)
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/bswap.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/bswap.ll

diff --git a/llvm/test/CodeGen/LoongArch/lasx/bswap.ll b/llvm/test/CodeGen/LoongArch/lasx/bswap.ll
new file mode 100644
index 00000000000000..4f6d49c7a79db5
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/bswap.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+define void @bswap_v16i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: bswap_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvsrli.h $xr1, $xr0, 8
+; CHECK-NEXT:    xvslli.h $xr0, $xr0, 8
+; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvst $xr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <16 x i16>, ptr %src
+  %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
+  store <16 x i16> %res, ptr %dst
+  ret void
+}
+
+define void @bswap_v8i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: bswap_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    lu12i.w $a0, 15
+; CHECK-NEXT:    ori $a0, $a0, 3840
+; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a0
+; CHECK-NEXT:    xvsrli.w $xr2, $xr0, 8
+; CHECK-NEXT:    xvand.v $xr2, $xr2, $xr1
+; CHECK-NEXT:    xvsrli.w $xr3, $xr0, 24
+; CHECK-NEXT:    xvor.v $xr2, $xr2, $xr3
+; CHECK-NEXT:    xvand.v $xr1, $xr0, $xr1
+; CHECK-NEXT:    xvslli.w $xr1, $xr1, 8
+; CHECK-NEXT:    xvslli.w $xr0, $xr0, 24
+; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvst $xr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <8 x i32>, ptr %src
+  %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
+  store <8 x i32> %res, ptr %dst
+  ret void
+}
+
+define void @bswap_v4i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: bswap_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    lu12i.w $a0, 4080
+; CHECK-NEXT:    xvreplgr2vr.d $xr1, $a0
+; CHECK-NEXT:    xvsrli.d $xr2, $xr0, 24
+; CHECK-NEXT:    xvand.v $xr2, $xr2, $xr1
+; CHECK-NEXT:    lu12i.w $a0, -4096
+; CHECK-NEXT:    lu32i.d $a0, 0
+; CHECK-NEXT:    xvreplgr2vr.d $xr3, $a0
+; CHECK-NEXT:    xvsrli.d $xr4, $xr0, 8
+; CHECK-NEXT:    xvand.v $xr4, $xr4, $xr3
+; CHECK-NEXT:    xvor.v $xr2, $xr4, $xr2
+; CHECK-NEXT:    lu12i.w $a0, 15
+; CHECK-NEXT:    ori $a0, $a0, 3840
+; CHECK-NEXT:    xvreplgr2vr.d $xr4, $a0
+; CHECK-NEXT:    xvsrli.d $xr5, $xr0, 40
+; CHECK-NEXT:    xvand.v $xr5, $xr5, $xr4
+; CHECK-NEXT:    xvsrli.d $xr6, $xr0, 56
+; CHECK-NEXT:    xvor.v $xr5, $xr5, $xr6
+; CHECK-NEXT:    xvor.v $xr2, $xr2, $xr5
+; CHECK-NEXT:    xvand.v $xr1, $xr0, $xr1
+; CHECK-NEXT:    xvslli.d $xr1, $xr1, 24
+; CHECK-NEXT:    xvand.v $xr3, $xr0, $xr3
+; CHECK-NEXT:    xvslli.d $xr3, $xr3, 8
+; CHECK-NEXT:    xvor.v $xr1, $xr1, $xr3
+; CHECK-NEXT:    xvand.v $xr3, $xr0, $xr4
+; CHECK-NEXT:    xvslli.d $xr3, $xr3, 40
+; CHECK-NEXT:    xvslli.d $xr0, $xr0, 56
+; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr3
+; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvst $xr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <4 x i64>, ptr %src
+  %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
+  store <4 x i64> %res, ptr %dst
+  ret void
+}
+
+declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
+declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
+declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/bswap.ll b/llvm/test/CodeGen/LoongArch/lsx/bswap.ll
new file mode 100644
index 00000000000000..ce7af9d33f1501
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/bswap.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
+
+define void @bswap_v8i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: bswap_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vld $vr0, $a0, 0
+; CHECK-NEXT:    vsrli.h $vr1, $vr0, 8
+; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
+; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <8 x i16>, ptr %src
+  %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
+  store <8 x i16> %res, ptr %dst
+  ret void
+}
+
+define void @bswap_v4i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: bswap_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vld $vr0, $a0, 0
+; CHECK-NEXT:    lu12i.w $a0, 15
+; CHECK-NEXT:    ori $a0, $a0, 3840
+; CHECK-NEXT:    vreplgr2vr.w $vr1, $a0
+; CHECK-NEXT:    vsrli.w $vr2, $vr0, 8
+; CHECK-NEXT:    vand.v $vr2, $vr2, $vr1
+; CHECK-NEXT:    vsrli.w $vr3, $vr0, 24
+; CHECK-NEXT:    vor.v $vr2, $vr2, $vr3
+; CHECK-NEXT:    vand.v $vr1, $vr0, $vr1
+; CHECK-NEXT:    vslli.w $vr1, $vr1, 8
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
+; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vor.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vst $vr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <4 x i32>, ptr %src
+  %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
+  store <4 x i32> %res, ptr %dst
+  ret void
+}
+
+define void @bswap_v2i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: bswap_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vld $vr0, $a0, 0
+; CHECK-NEXT:    lu12i.w $a0, 4080
+; CHECK-NEXT:    vreplgr2vr.d $vr1, $a0
+; CHECK-NEXT:    vsrli.d $vr2, $vr0, 24
+; CHECK-NEXT:    vand.v $vr2, $vr2, $vr1
+; CHECK-NEXT:    lu12i.w $a0, -4096
+; CHECK-NEXT:    lu32i.d $a0, 0
+; CHECK-NEXT:    vreplgr2vr.d $vr3, $a0
+; CHECK-NEXT:    vsrli.d $vr4, $vr0, 8
+; CHECK-NEXT:    vand.v $vr4, $vr4, $vr3
+; CHECK-NEXT:    vor.v $vr2, $vr4, $vr2
+; CHECK-NEXT:    lu12i.w $a0, 15
+; CHECK-NEXT:    ori $a0, $a0, 3840
+; CHECK-NEXT:    vreplgr2vr.d $vr4, $a0
+; CHECK-NEXT:    vsrli.d $vr5, $vr0, 40
+; CHECK-NEXT:    vand.v $vr5, $vr5, $vr4
+; CHECK-NEXT:    vsrli.d $vr6, $vr0, 56
+; CHECK-NEXT:    vor.v $vr5, $vr5, $vr6
+; CHECK-NEXT:    vor.v $vr2, $vr2, $vr5
+; CHECK-NEXT:    vand.v $vr1, $vr0, $vr1
+; CHECK-NEXT:    vslli.d $vr1, $vr1, 24
+; CHECK-NEXT:    vand.v $vr3, $vr0, $vr3
+; CHECK-NEXT:    vslli.d $vr3, $vr3, 8
+; CHECK-NEXT:    vor.v $vr1, $vr1, $vr3
+; CHECK-NEXT:    vand.v $vr3, $vr0, $vr4
+; CHECK-NEXT:    vslli.d $vr3, $vr3, 40
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 56
+; CHECK-NEXT:    vor.v $vr0, $vr0, $vr3
+; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vor.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vst $vr0, $a1, 0
+; CHECK-NEXT:    ret
+  %v = load <2 x i64>, ptr %src
+  %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
+  store <2 x i64> %res, ptr %dst
+  ret void
+}
+
+declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
+declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)

>From b6fc5473dfb332bbaa2d5c5f881756ed1e3a7db2 Mon Sep 17 00:00:00 2001
From: WANG Xuerui <git at xen0n.name>
Date: Wed, 30 Oct 2024 11:01:09 +0800
Subject: [PATCH 2/2] [LoongArch] Support bswap for LSX/LASX VTs

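Although the LSX/LASX instruction sets do not appear to include a
dedicated byte-swap instruction, a vector bswap can be implemented
cheaply with the low-overhead {,X}VSHUF4I family of instructions: a
single {,X}VSHUF4I.B for 16-bit and 32-bit elements, and a
{,X}VSHUF4I.B followed by a {,X}VSHUF4I.W for 64-bit elements.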
---
 .../LoongArch/LoongArchISelLowering.cpp       |  6 +++
 .../LoongArch/LoongArchLASXInstrInfo.td       |  6 +++
 .../Target/LoongArch/LoongArchLSXInstrInfo.td |  6 +++
 llvm/test/CodeGen/LoongArch/lasx/bswap.ll     | 48 ++-----------------
 llvm/test/CodeGen/LoongArch/lsx/bswap.ll      | 48 ++-----------------
 5 files changed, 26 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index e2c644a56c95b0..f7d0da265d9b94 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -269,6 +269,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
           {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
           Expand);
     }
+    for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+      setOperationAction(ISD::BSWAP, VT, Legal);
+    }
     for (MVT VT : {MVT::v4i32, MVT::v2i64}) {
       setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal);
       setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal);
@@ -317,6 +320,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
           {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
           Expand);
     }
+    for (MVT VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+      setOperationAction(ISD::BSWAP, VT, Legal);
+    }
     for (MVT VT : {MVT::v8i32, MVT::v4i32, MVT::v4i64}) {
       setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal);
       setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal);
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index d13cc9af135b57..3e39e2c10a617a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1444,6 +1444,12 @@ def : Pat<(xor (v8i32 LASX256:$xj), (v8i32 (vsplat_uimm_pow2 uimm5:$imm))),
 def : Pat<(xor (v4i64 LASX256:$xj), (v4i64 (vsplat_uimm_pow2 uimm6:$imm))),
           (XVBITREVI_D LASX256:$xj, uimm6:$imm)>;
 
+// Vector bswaps
+def : Pat<(bswap (v16i16 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b10110001)>;
+def : Pat<(bswap (v8i32 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b00011011)>;
+def : Pat<(bswap (v4i64 LASX256:$xj)),
+          (XVSHUF4I_W (XVSHUF4I_B LASX256:$xj, 0b00011011), 0b10110001)>;
+
 // XVFADD_{S/D}
 defm : PatXrXrF<fadd, "XVFADD">;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 86aa6dcfd8261f..525d2802daa235 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1600,6 +1600,12 @@ def : Pat<(xor (v4i32 LSX128:$vj), (v4i32 (vsplat_uimm_pow2 uimm5:$imm))),
 def : Pat<(xor (v2i64 LSX128:$vj), (v2i64 (vsplat_uimm_pow2 uimm6:$imm))),
           (VBITREVI_D LSX128:$vj, uimm6:$imm)>;
 
+// Vector bswaps
+def : Pat<(bswap (v8i16 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b10110001)>;
+def : Pat<(bswap (v4i32 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b00011011)>;
+def : Pat<(bswap (v2i64 LSX128:$vj)),
+          (VSHUF4I_W (VSHUF4I_B LSX128:$vj, 0b00011011), 0b10110001)>;
+
 // VFADD_{S/D}
 defm : PatVrVrF<fadd, "VFADD">;
 
diff --git a/llvm/test/CodeGen/LoongArch/lasx/bswap.ll b/llvm/test/CodeGen/LoongArch/lasx/bswap.ll
index 4f6d49c7a79db5..1b0132d25ed591 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/bswap.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/bswap.ll
@@ -5,9 +5,7 @@ define void @bswap_v16i16(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: bswap_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvsrli.h $xr1, $xr0, 8
-; CHECK-NEXT:    xvslli.h $xr0, $xr0, 8
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 177
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load <16 x i16>, ptr %src
@@ -20,18 +18,7 @@ define void @bswap_v8i32(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: bswap_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    lu12i.w $a0, 15
-; CHECK-NEXT:    ori $a0, $a0, 3840
-; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a0
-; CHECK-NEXT:    xvsrli.w $xr2, $xr0, 8
-; CHECK-NEXT:    xvand.v $xr2, $xr2, $xr1
-; CHECK-NEXT:    xvsrli.w $xr3, $xr0, 24
-; CHECK-NEXT:    xvor.v $xr2, $xr2, $xr3
-; CHECK-NEXT:    xvand.v $xr1, $xr0, $xr1
-; CHECK-NEXT:    xvslli.w $xr1, $xr1, 8
-; CHECK-NEXT:    xvslli.w $xr0, $xr0, 24
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 27
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load <8 x i32>, ptr %src
@@ -44,35 +31,8 @@ define void @bswap_v4i64(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: bswap_v4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    lu12i.w $a0, 4080
-; CHECK-NEXT:    xvreplgr2vr.d $xr1, $a0
-; CHECK-NEXT:    xvsrli.d $xr2, $xr0, 24
-; CHECK-NEXT:    xvand.v $xr2, $xr2, $xr1
-; CHECK-NEXT:    lu12i.w $a0, -4096
-; CHECK-NEXT:    lu32i.d $a0, 0
-; CHECK-NEXT:    xvreplgr2vr.d $xr3, $a0
-; CHECK-NEXT:    xvsrli.d $xr4, $xr0, 8
-; CHECK-NEXT:    xvand.v $xr4, $xr4, $xr3
-; CHECK-NEXT:    xvor.v $xr2, $xr4, $xr2
-; CHECK-NEXT:    lu12i.w $a0, 15
-; CHECK-NEXT:    ori $a0, $a0, 3840
-; CHECK-NEXT:    xvreplgr2vr.d $xr4, $a0
-; CHECK-NEXT:    xvsrli.d $xr5, $xr0, 40
-; CHECK-NEXT:    xvand.v $xr5, $xr5, $xr4
-; CHECK-NEXT:    xvsrli.d $xr6, $xr0, 56
-; CHECK-NEXT:    xvor.v $xr5, $xr5, $xr6
-; CHECK-NEXT:    xvor.v $xr2, $xr2, $xr5
-; CHECK-NEXT:    xvand.v $xr1, $xr0, $xr1
-; CHECK-NEXT:    xvslli.d $xr1, $xr1, 24
-; CHECK-NEXT:    xvand.v $xr3, $xr0, $xr3
-; CHECK-NEXT:    xvslli.d $xr3, $xr3, 8
-; CHECK-NEXT:    xvor.v $xr1, $xr1, $xr3
-; CHECK-NEXT:    xvand.v $xr3, $xr0, $xr4
-; CHECK-NEXT:    xvslli.d $xr3, $xr3, 40
-; CHECK-NEXT:    xvslli.d $xr0, $xr0, 56
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr3
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr2
+; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 27
+; CHECK-NEXT:    xvshuf4i.w $xr0, $xr0, 177
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load <4 x i64>, ptr %src
diff --git a/llvm/test/CodeGen/LoongArch/lsx/bswap.ll b/llvm/test/CodeGen/LoongArch/lsx/bswap.ll
index ce7af9d33f1501..8172e21eae34df 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/bswap.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/bswap.ll
@@ -5,9 +5,7 @@ define void @bswap_v8i16(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: bswap_v8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    vsrli.h $vr1, $vr0, 8
-; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
-; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 177
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load <8 x i16>, ptr %src
@@ -20,18 +18,7 @@ define void @bswap_v4i32(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: bswap_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    lu12i.w $a0, 15
-; CHECK-NEXT:    ori $a0, $a0, 3840
-; CHECK-NEXT:    vreplgr2vr.w $vr1, $a0
-; CHECK-NEXT:    vsrli.w $vr2, $vr0, 8
-; CHECK-NEXT:    vand.v $vr2, $vr2, $vr1
-; CHECK-NEXT:    vsrli.w $vr3, $vr0, 24
-; CHECK-NEXT:    vor.v $vr2, $vr2, $vr3
-; CHECK-NEXT:    vand.v $vr1, $vr0, $vr1
-; CHECK-NEXT:    vslli.w $vr1, $vr1, 8
-; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
-; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
-; CHECK-NEXT:    vor.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 27
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load <4 x i32>, ptr %src
@@ -44,35 +31,8 @@ define void @bswap_v2i64(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: bswap_v2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    lu12i.w $a0, 4080
-; CHECK-NEXT:    vreplgr2vr.d $vr1, $a0
-; CHECK-NEXT:    vsrli.d $vr2, $vr0, 24
-; CHECK-NEXT:    vand.v $vr2, $vr2, $vr1
-; CHECK-NEXT:    lu12i.w $a0, -4096
-; CHECK-NEXT:    lu32i.d $a0, 0
-; CHECK-NEXT:    vreplgr2vr.d $vr3, $a0
-; CHECK-NEXT:    vsrli.d $vr4, $vr0, 8
-; CHECK-NEXT:    vand.v $vr4, $vr4, $vr3
-; CHECK-NEXT:    vor.v $vr2, $vr4, $vr2
-; CHECK-NEXT:    lu12i.w $a0, 15
-; CHECK-NEXT:    ori $a0, $a0, 3840
-; CHECK-NEXT:    vreplgr2vr.d $vr4, $a0
-; CHECK-NEXT:    vsrli.d $vr5, $vr0, 40
-; CHECK-NEXT:    vand.v $vr5, $vr5, $vr4
-; CHECK-NEXT:    vsrli.d $vr6, $vr0, 56
-; CHECK-NEXT:    vor.v $vr5, $vr5, $vr6
-; CHECK-NEXT:    vor.v $vr2, $vr2, $vr5
-; CHECK-NEXT:    vand.v $vr1, $vr0, $vr1
-; CHECK-NEXT:    vslli.d $vr1, $vr1, 24
-; CHECK-NEXT:    vand.v $vr3, $vr0, $vr3
-; CHECK-NEXT:    vslli.d $vr3, $vr3, 8
-; CHECK-NEXT:    vor.v $vr1, $vr1, $vr3
-; CHECK-NEXT:    vand.v $vr3, $vr0, $vr4
-; CHECK-NEXT:    vslli.d $vr3, $vr3, 40
-; CHECK-NEXT:    vslli.d $vr0, $vr0, 56
-; CHECK-NEXT:    vor.v $vr0, $vr0, $vr3
-; CHECK-NEXT:    vor.v $vr0, $vr0, $vr1
-; CHECK-NEXT:    vor.v $vr0, $vr0, $vr2
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 27
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 177
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load <2 x i64>, ptr %src



More information about the llvm-commits mailing list