[llvm] [AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling. (PR #83514)

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 9 07:58:23 PDT 2024


https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/83514

>From 3d3adc96aba3788a02209fd648c647d23af34e24 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 1 Mar 2024 01:28:19 +0000
Subject: [PATCH 1/7] [AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for
 bit-twiddling.

Allow folding or/and-and into a BSL instruction for scalable vectors.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 39 +++++++++++--------
 llvm/test/CodeGen/AArch64/sve2-bsl.ll         | 21 ++++++++++
 2 files changed, 44 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-bsl.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b1677df56e1bea..7c922d9dd12412 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17594,16 +17594,14 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
 
   if (!VT.isVector())
     return SDValue();
 
-  // The combining code currently only works for NEON vectors. In particular,
-  // it does not work for SVE when dealing with vectors wider than 128 bits.
-  // It also doesn't work for streaming mode because it causes generating
-  // bsl instructions that are invalid in streaming mode.
-  if (TLI.useSVEForFixedLengthVectorVT(
-          VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
+  // The combining code works for NEON, SVE2 and SME.
+  if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
+      (VT.isScalableVector() && !Subtarget.hasSVE2orSME()))
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
@@ -17660,23 +17658,32 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     for (int j = 1; j >= 0; --j) {
       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
-      if (!BVN0 || !BVN1)
+      APInt Val1, Val2;
+      if ((!BVN0 || !BVN1) &&
+          (!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) ||
+           !ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2)))
         continue;
 
       bool FoundMatch = true;
-      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
-        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
-        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
-        if (!CN0 || !CN1 ||
-            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
-          FoundMatch = false;
-          break;
+      if (BVN0) {
+        for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+          ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+          ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+          if (!CN0 || !CN1 ||
+              CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+            FoundMatch = false;
+            break;
+          }
         }
+      } else {
+        FoundMatch = ((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue());
       }
 
-      if (FoundMatch)
-        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
+      if (FoundMatch) {
+        SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
+        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
+      }
     }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
new file mode 100644
index 00000000000000..00ace4ebdb91c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; CHECK-LABEL: bsl:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z0.s, #0x7fffffff
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1w { z1.s }, p0, [x2]
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
+  %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
+  %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %5 = or disjoint <vscale x 4 x i32> %3, %4
+  store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+  ret void
+}

>From 266fe3c0e7bcac40deafab2a02cd558a2cd16902 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Wed, 6 Mar 2024 22:04:29 +0000
Subject: [PATCH 2/7] Added negative test.

---
 llvm/test/CodeGen/AArch64/sve2-bsl.ll | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 00ace4ebdb91c7..2533982742e2da 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -19,3 +19,23 @@ define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
   store <vscale x 4 x i32> %5, ptr %ptr3, align 4
   ret void
 }
+
+define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; CHECK-LABEL: nobsl:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and z0.s, z0.s, #0x7fffffff
+; CHECK-NEXT:    and z1.s, z1.s, #0x7ffffffe
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
+  %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
+  %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %5 = or disjoint <vscale x 4 x i32> %3, %4
+  store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+  ret void
+}

>From 719078b4a036b46d5d876862bdd58c6b70b89bfa Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Thu, 7 Mar 2024 14:17:51 +0000
Subject: [PATCH 3/7] Removed disjoint flag from the test.

---
 llvm/test/CodeGen/AArch64/sve2-bsl.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 2533982742e2da..706f8a77f7b30c 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -15,7 +15,7 @@ define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
   %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
   %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
   %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %5 = or disjoint <vscale x 4 x i32> %3, %4
+  %5 = or <vscale x 4 x i32> %3, %4
   store <vscale x 4 x i32> %5, ptr %ptr3, align 4
   ret void
 }
@@ -35,7 +35,7 @@ define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
   %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
   %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
   %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %5 = or disjoint <vscale x 4 x i32> %3, %4
+  %5 = or <vscale x 4 x i32> %3, %4
   store <vscale x 4 x i32> %5, ptr %ptr3, align 4
   ret void
 }

>From 4465caaa4a92b400f7308efa624f12a94c5dbcb7 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 8 Mar 2024 16:20:16 +0000
Subject: [PATCH 4/7] Restored disjoint in the sve2-bsl.ll, Check disjoint
 first in tryCombineToBSL().

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++----
 llvm/test/CodeGen/AArch64/sve2-bsl.ll           | 4 ++--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7c922d9dd12412..be3fdbf9a47b6e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17605,11 +17605,9 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
-  if (N0.getOpcode() != ISD::AND)
-    return SDValue();
-
   SDValue N1 = N->getOperand(1);
-  if (N1.getOpcode() != ISD::AND)
+  if (!N->getFlags().hasDisjoint() &&
+      (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND))
     return SDValue();
 
   // InstCombine does (not (neg a)) => (add a -1).
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 706f8a77f7b30c..2533982742e2da 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -15,7 +15,7 @@ define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
   %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
   %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
   %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %5 = or <vscale x 4 x i32> %3, %4
+  %5 = or disjoint <vscale x 4 x i32> %3, %4
   store <vscale x 4 x i32> %5, ptr %ptr3, align 4
   ret void
 }
@@ -35,7 +35,7 @@ define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
   %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
   %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
   %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %5 = or <vscale x 4 x i32> %3, %4
+  %5 = or disjoint <vscale x 4 x i32> %3, %4
   store <vscale x 4 x i32> %5, ptr %ptr3, align 4
   ret void
 }

>From 0494bf55272caf83a1eb0f4dfd2692cbea795f64 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Mon, 25 Mar 2024 14:44:15 +0000
Subject: [PATCH 5/7] Resolve comments.

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 42 +++++++++----------
 llvm/test/CodeGen/AArch64/sve2-bsl.ll         |  5 ++-
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index be3fdbf9a47b6e..f59806c6e51d67 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17605,9 +17605,11 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND)
+    return SDValue();
+
   SDValue N1 = N->getOperand(1);
-  if (!N->getFlags().hasDisjoint() &&
-      (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND))
+  if (N1.getOpcode() != ISD::AND)
     return SDValue();
 
   // InstCombine does (not (neg a)) => (add a -1).
@@ -17657,31 +17659,29 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
       APInt Val1, Val2;
-      if ((!BVN0 || !BVN1) &&
-          (!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) ||
-           !ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2)))
+
+      if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
+          ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
+          (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
+        return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
+                           N0->getOperand(1 - i), N1->getOperand(1 - j));
+      }
+      if (!BVN0 || !BVN1)
         continue;
 
       bool FoundMatch = true;
-      if (BVN0) {
-        for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
-          ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
-          ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
-          if (!CN0 || !CN1 ||
-              CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
-            FoundMatch = false;
-            break;
-          }
+      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+        if (!CN0 || !CN1 ||
+            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+          FoundMatch = false;
+          break;
         }
-      } else {
-        FoundMatch = ((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue());
       }
-
-      if (FoundMatch) {
-        SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
-        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),
+      if (FoundMatch)
+        return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
-      }
     }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 2533982742e2da..032f06688be082 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -20,8 +20,9 @@ define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
   ret void
 }
 
-define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
-; CHECK-LABEL: nobsl:
+; we are not expecting bsl instruction here.
+define void @no_bsl_fold(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; CHECK-LABEL: no_bsl_fold:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]

>From e3d9392c2f8e304bd268f199452fac44f58b5e56 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Tue, 9 Apr 2024 11:09:53 +0000
Subject: [PATCH 6/7] Resolve comments.

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  4 +-
 llvm/test/CodeGen/AArch64/sve2-bsl.ll         | 38 ++++++++-----------
 2 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f59806c6e51d67..a7095ef2632bb8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17656,8 +17656,6 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
   for (int i = 1; i >= 0; --i)
     for (int j = 1; j >= 0; --j) {
-      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
-      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
       APInt Val1, Val2;
 
       if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
@@ -17666,6 +17664,8 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
         return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
       }
+      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
+      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
       if (!BVN0 || !BVN1)
         continue;
 
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 032f06688be082..3fa814637cbaa5 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -1,42 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
 
-define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+define void @bsl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %ptr1) {
 ; CHECK-LABEL: bsl:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.s, #0x7fffffff
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z0.s, #0x7fffffff
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
-; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
-; CHECK-NEXT:    st1w { z1.s }, p0, [x2]
+; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
-  %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
-  %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
-  %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %5 = or disjoint <vscale x 4 x i32> %3, %4
-  store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+  %1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %c = or disjoint <vscale x 4 x i32> %1, %2
+  store <vscale x 4 x i32> %c, ptr %ptr1, align 4
   ret void
 }
 
-; we are not expecting bsl instruction here.
-define void @no_bsl_fold(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; we are not expecting bsl instruction here. the constants do not match to fold to bsl.
+define void @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %ptr1) {
 ; CHECK-LABEL: no_bsl_fold:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    and z0.s, z0.s, #0x7fffffff
 ; CHECK-NEXT:    and z1.s, z1.s, #0x7ffffffe
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
-; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
-  %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
-  %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
-  %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %5 = or disjoint <vscale x 4 x i32> %3, %4
-  store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+  %1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %c = or disjoint <vscale x 4 x i32> %1, %2
+  store <vscale x 4 x i32> %c, ptr %ptr1, align 4
   ret void
 }

>From 816f60b285e22bdb6471efca241552919591cc21 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Tue, 9 Apr 2024 14:57:49 +0000
Subject: [PATCH 7/7] Resolve comments.

---
 llvm/test/CodeGen/AArch64/sve2-bsl.ll | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 3fa814637cbaa5..11f67634a3fb2c 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -1,34 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
 
-define void @bsl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %ptr1) {
+define <vscale x 4 x i32> @bsl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: bsl:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z2.s, #0x7fffffff
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
-  %1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %c = or disjoint <vscale x 4 x i32> %1, %2
-  store <vscale x 4 x i32> %c, ptr %ptr1, align 4
-  ret void
+  %1 = and <vscale x 4 x i32> %a, splat(i32 2147483647)
+  %2 = and <vscale x 4 x i32> %b, splat(i32 -2147483648)
+  %c = or <vscale x 4 x i32> %1, %2
+  ret <vscale x 4 x i32> %c
 }
 
 ; we are not expecting bsl instruction here. the constants do not match to fold to bsl.
-define void @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %ptr1) {
+define <vscale x 4 x i32> @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: no_bsl_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    and z0.s, z0.s, #0x7fffffff
 ; CHECK-NEXT:    and z1.s, z1.s, #0x7ffffffe
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
-  %1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-  %c = or disjoint <vscale x 4 x i32> %1, %2
-  store <vscale x 4 x i32> %c, ptr %ptr1, align 4
-  ret void
+  %1 = and <vscale x 4 x i32> %a, splat(i32 2147483647)
+  %2 = and <vscale x 4 x i32> %b, splat(i32 2147483646)
+  %c = or <vscale x 4 x i32> %1, %2
+  ret <vscale x 4 x i32> %c
 }



More information about the llvm-commits mailing list