[llvm] [AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling. (PR #83514)
Dinar Temirbulatov via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 8 08:27:00 PST 2024
https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/83514
>From 3d3adc96aba3788a02209fd648c647d23af34e24 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 1 Mar 2024 01:28:19 +0000
Subject: [PATCH 1/4] [AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for
bit-twiddling.
Allow folding or/and-and into a BSL instruction for scalable vectors.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 39 +++++++++++--------
llvm/test/CodeGen/AArch64/sve2-bsl.ll | 21 ++++++++++
2 files changed, 44 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve2-bsl.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b1677df56e1bea..7c922d9dd12412 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17594,16 +17594,14 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
+ const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
if (!VT.isVector())
return SDValue();
- // The combining code currently only works for NEON vectors. In particular,
- // it does not work for SVE when dealing with vectors wider than 128 bits.
- // It also doesn't work for streaming mode because it causes generating
- // bsl instructions that are invalid in streaming mode.
- if (TLI.useSVEForFixedLengthVectorVT(
- VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
+ // The combining code works for NEON, SVE2 and SME.
+ if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
+ (VT.isScalableVector() && !Subtarget.hasSVE2orSME()))
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -17660,23 +17658,32 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
for (int j = 1; j >= 0; --j) {
BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
- if (!BVN0 || !BVN1)
+ APInt Val1, Val2;
+ if ((!BVN0 || !BVN1) &&
+ (!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) ||
+ !ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2)))
continue;
bool FoundMatch = true;
- for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
- ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
- ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
- if (!CN0 || !CN1 ||
- CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
- FoundMatch = false;
- break;
+ if (BVN0) {
+ for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+ ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+ ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+ if (!CN0 || !CN1 ||
+ CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+ FoundMatch = false;
+ break;
+ }
}
+ } else {
+ FoundMatch = ((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue());
}
- if (FoundMatch)
- return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
+ if (FoundMatch) {
+ SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),
N0->getOperand(1 - i), N1->getOperand(1 - j));
+ }
}
return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
new file mode 100644
index 00000000000000..00ace4ebdb91c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; CHECK-LABEL: bsl:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z0.s, #0x7fffffff
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT: bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT: st1w { z1.s }, p0, [x2]
+; CHECK-NEXT: ret
+ %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
+ %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
+ %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %5 = or disjoint <vscale x 4 x i32> %3, %4
+ store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+ ret void
+}
>From 266fe3c0e7bcac40deafab2a02cd558a2cd16902 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Wed, 6 Mar 2024 22:04:29 +0000
Subject: [PATCH 2/4] Added negative test.
---
llvm/test/CodeGen/AArch64/sve2-bsl.ll | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 00ace4ebdb91c7..2533982742e2da 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -19,3 +19,23 @@ define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
store <vscale x 4 x i32> %5, ptr %ptr3, align 4
ret void
}
+
+define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; CHECK-LABEL: nobsl:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff
+; CHECK-NEXT: and z1.s, z1.s, #0x7ffffffe
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1w { z0.s }, p0, [x2]
+; CHECK-NEXT: ret
+ %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
+ %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
+ %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %5 = or disjoint <vscale x 4 x i32> %3, %4
+ store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+ ret void
+}
>From 719078b4a036b46d5d876862bdd58c6b70b89bfa Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Thu, 7 Mar 2024 14:17:51 +0000
Subject: [PATCH 3/4] Removed disjoint flag from the test.
---
llvm/test/CodeGen/AArch64/sve2-bsl.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 2533982742e2da..706f8a77f7b30c 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -15,7 +15,7 @@ define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
%2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
- %5 = or disjoint <vscale x 4 x i32> %3, %4
+ %5 = or <vscale x 4 x i32> %3, %4
store <vscale x 4 x i32> %5, ptr %ptr3, align 4
ret void
}
@@ -35,7 +35,7 @@ define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
%2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
- %5 = or disjoint <vscale x 4 x i32> %3, %4
+ %5 = or <vscale x 4 x i32> %3, %4
store <vscale x 4 x i32> %5, ptr %ptr3, align 4
ret void
}
>From 4465caaa4a92b400f7308efa624f12a94c5dbcb7 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 8 Mar 2024 16:20:16 +0000
Subject: [PATCH 4/4] Restored disjoint in the sve2-bsl.ll, Check disjoint
first in tryCombineToBSL().
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++----
llvm/test/CodeGen/AArch64/sve2-bsl.ll | 4 ++--
2 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7c922d9dd12412..be3fdbf9a47b6e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17605,11 +17605,9 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
SDValue N0 = N->getOperand(0);
- if (N0.getOpcode() != ISD::AND)
- return SDValue();
-
SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() != ISD::AND)
+ if (!N->getFlags().hasDisjoint() &&
+ (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND))
return SDValue();
// InstCombine does (not (neg a)) => (add a -1).
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 706f8a77f7b30c..2533982742e2da 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -15,7 +15,7 @@ define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
%2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
- %5 = or <vscale x 4 x i32> %3, %4
+ %5 = or disjoint <vscale x 4 x i32> %3, %4
store <vscale x 4 x i32> %5, ptr %ptr3, align 4
ret void
}
@@ -35,7 +35,7 @@ define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
%2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
- %5 = or <vscale x 4 x i32> %3, %4
+ %5 = or disjoint <vscale x 4 x i32> %3, %4
store <vscale x 4 x i32> %5, ptr %ptr3, align 4
ret void
}
More information about the llvm-commits
mailing list