[llvm] [AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling. (PR #83514)

Thu Feb 29 17:31:27 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Dinar Temirbulatov (dtemirbulatov)

<details>
<summary>Changes</summary>

Allow to fold or/and-and to BSL instuction for scalable vectors.

---
Full diff: https://github.com/llvm/llvm-project/pull/83514.diff


2 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+23-16) 
- (added) llvm/test/CodeGen/AArch64/sve2-bsl.ll (+21) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b1677df56e1bea..7c922d9dd12412 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17594,16 +17594,14 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
 
   if (!VT.isVector())
     return SDValue();
 
-  // The combining code currently only works for NEON vectors. In particular,
-  // it does not work for SVE when dealing with vectors wider than 128 bits.
-  // It also doesn't work for streaming mode because it causes generating
-  // bsl instructions that are invalid in streaming mode.
-  if (TLI.useSVEForFixedLengthVectorVT(
-          VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
+  // The combining code works for NEON, SVE2 and SME.
+  if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
+      (VT.isScalableVector() && !Subtarget.hasSVE2orSME()))
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
@@ -17660,23 +17658,32 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     for (int j = 1; j >= 0; --j) {
       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
-      if (!BVN0 || !BVN1)
+      APInt Val1, Val2;
+      if ((!BVN0 || !BVN1) &&
+          (!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) ||
+           !ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2)))
         continue;
 
       bool FoundMatch = true;
-      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
-        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
-        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
-        if (!CN0 || !CN1 ||
-            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
-          FoundMatch = false;
-          break;
+      if (BVN0) {
+        for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+          ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+          ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+          if (!CN0 || !CN1 ||
+              CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+            FoundMatch = false;
+            break;
+          }
         }
+      } else {
+        FoundMatch = ((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue());
       }
 
-      if (FoundMatch)
-        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
+      if (FoundMatch) {
+        SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
+        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
+      }
     }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
new file mode 100644
index 00000000000000..00ace4ebdb91c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; CHECK-LABEL: bsl:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z0.s, #0x7fffffff
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1w { z1.s }, p0, [x2]
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
+  %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
+  %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %5 = or disjoint <vscale x 4 x i32> %3, %4
+  store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+  ret void
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/83514