[llvm] [AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for add/sub. (PR #88413)

via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 11 09:41:24 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Dinar Temirbulatov (dtemirbulatov)

<details>
<summary>Changes</summary>

Allow the or/and/and bit-select pattern to fold into a BSL instruction for scalable vectors as well. The all-zeros/all-ones operand checks in tryCombineToBSL only used ISD::isBuildVectorAllZeros/AllOnes, which never match scalable vectors (their constants are splats rather than BUILD_VECTOR nodes), so extend the checks with the constant-splat variants.
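
For reference, a minimal sketch of the pattern this now catches, distilled from the `vbsl_neg_cond_*` cases in the new test file below (the function name here is illustrative only):

```llvm
; With the splat-aware checks, the or/and/and select-by-mask on scalable
; vectors folds during ISel; per the CHECK lines below, the result is a
; subr (for the negated condition) followed by a single bsl.
define <vscale x 4 x i32> @bsl_sketch(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond ; 0 - c, all-zeros splat operand
  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)   ; c - 1, all-ones splat operand
  %left_bits = and <vscale x 4 x i32> %neg_cond, %left
  %right_bits = and <vscale x 4 x i32> %min_cond, %right
  %sel = or <vscale x 4 x i32> %right_bits, %left_bits
  ret <vscale x 4 x i32> %sel
}

attributes #0 = { "target-features"="+sve2" }
```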

---
Full diff: https://github.com/llvm/llvm-project/pull/88413.diff


2 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+4-2) 
- (added) llvm/test/CodeGen/AArch64/sve2-bitselect.ll (+254) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 80181a77c9d238..d9aabb64125a4f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17942,11 +17942,13 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
       } else
         continue;
 
-      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
+      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()) &&
+          !ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
         continue;
 
       // Constant ones is always righthand operand of the Add.
-      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
+      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()) &&
+          !ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
         continue;
 
       if (Sub.getOperand(1) != Add.getOperand(0))
diff --git a/llvm/test/CodeGen/AArch64/sve2-bitselect.ll b/llvm/test/CodeGen/AArch64/sve2-bitselect.ll
new file mode 100644
index 00000000000000..9ceeffc2e5d2ab
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-bitselect.ll
@@ -0,0 +1,254 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64"
+
+; Check that an expanded vbsl(vneg(pre_cond), left, right) lowers to a VBSL
+; during ISEL.
+;
+; Subtly different from a plain vector bit select: the operand representing
+; the condition has been negated (-v, not to be confused with bitwise_not(v)).
+
+; Each vbsl_neg_cond_xxxx tests one of the 16 permutations of the operands.
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0000(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0000:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
+  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
+  %bsl0000 = or <vscale x 4 x i32> %right_bits_0, %left_bits_0
+  ret <vscale x 4 x i32> %bsl0000
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0001(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0001:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
+  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
+  %bsl0001 = or <vscale x 4 x i32> %right_bits_0, %left_bits_1
+  ret <vscale x 4 x i32> %bsl0001
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0010(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0010:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
+  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
+  %bsl0010 = or <vscale x 4 x i32> %right_bits_1, %left_bits_0
+  ret <vscale x 4 x i32> %bsl0010
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0011(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0011:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
+  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
+  %bsl0011 = or <vscale x 4 x i32> %right_bits_1, %left_bits_1
+  ret <vscale x 4 x i32> %bsl0011
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0100(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0100:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
+  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
+  %bsl0100 = or <vscale x 4 x i32> %left_bits_0, %right_bits_0
+  ret <vscale x 4 x i32> %bsl0100
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0101(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0101:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
+  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
+  %bsl0101 = or <vscale x 4 x i32> %left_bits_0, %right_bits_1
+  ret <vscale x 4 x i32> %bsl0101
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0110(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0110:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
+  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
+  %bsl0110 = or <vscale x 4 x i32> %left_bits_1, %right_bits_0
+  ret <vscale x 4 x i32> %bsl0110
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0111(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0111:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
+  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
+  %bsl0111 = or <vscale x 4 x i32> %left_bits_1, %right_bits_1
+  ret <vscale x 4 x i32> %bsl0111
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1000(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1000:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
+  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
+  %bsl1000 = or <vscale x 4 x i32> %flip_cond_right_bits_0, %flip_cond_left_bits_0
+  ret <vscale x 4 x i32> %bsl1000
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1001(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1001:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
+  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
+  %bsl1001 = or <vscale x 4 x i32> %flip_cond_right_bits_0, %flip_cond_left_bits_1
+  ret <vscale x 4 x i32> %bsl1001
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1010(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1010:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
+  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
+  %bsl1010 = or <vscale x 4 x i32> %flip_cond_right_bits_1, %flip_cond_left_bits_0
+  ret <vscale x 4 x i32> %bsl1010
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1011(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1011:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
+  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
+  %bsl1011 = or <vscale x 4 x i32> %flip_cond_right_bits_1, %flip_cond_left_bits_1
+  ret <vscale x 4 x i32> %bsl1011
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1100(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1100:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
+  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
+  %bsl1100 = or <vscale x 4 x i32> %flip_cond_left_bits_0, %flip_cond_right_bits_0
+  ret <vscale x 4 x i32> %bsl1100
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1101(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1101:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
+  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
+  %bsl1101 = or <vscale x 4 x i32> %flip_cond_left_bits_0, %flip_cond_right_bits_1
+  ret <vscale x 4 x i32> %bsl1101
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1110(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1110:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
+  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
+  %bsl1110 = or <vscale x 4 x i32> %flip_cond_left_bits_1, %flip_cond_right_bits_0
+  ret <vscale x 4 x i32> %bsl1110
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1111(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1111:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
+  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
+  %bsl1111 = or <vscale x 4 x i32> %flip_cond_left_bits_1, %flip_cond_right_bits_1
+  ret <vscale x 4 x i32> %bsl1111
+}
+
+attributes #0 = { "target-features"="+sve2" }

``````````

</details>


https://github.com/llvm/llvm-project/pull/88413

