[llvm] [ARM] Lower arm_neon_vbsl to ARMISD::VBSP and fold (vbsl x, y, y) to y (PR #109761)

Tue Sep 24 00:27:20 PDT 2024

https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/109761

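Lowering the arm_neon_vbsl intrinsic to an ARMISD::VBSP node during intrinsic combining cleans up the NEON selection patterns a little, and lets DAG combines written for VBSP apply to the intrinsic as well. A combine is then added that folds the VBSP away when its two selected operands are the same, i.e. (vbsl x, y, y) becomes y.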

>From 2bfc2d2c07bfe958dd52b8c3489567ae5023bf6f Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 24 Sep 2024 08:23:52 +0100
Subject: [PATCH] [ARM] Lower arm_neon_vbsl to ARMISD::VBSP and fold (vbsl x,
 y, y) to y

This helps clean up the patterns a little and will help share combines on both
the intrinsic and VBSP. A combine is then added to fold away the VBSP if both
the selected operands are the same.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp |  9 +++++
 llvm/lib/Target/ARM/ARMInstrNEON.td     | 54 +++++++++++--------------
 llvm/test/CodeGen/ARM/vbsl.ll           |  5 +--
 3 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a03928b618df03..f891aece26848c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -17653,6 +17653,11 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
     // No immediate versions of these to check for.
     break;
 
+  case Intrinsic::arm_neon_vbsl: {
+    SDLoc dl(N);
+    return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
+                       N->getOperand(2), N->getOperand(3));
+  }
   case Intrinsic::arm_mve_vqdmlah:
   case Intrinsic::arm_mve_vqdmlash:
   case Intrinsic::arm_mve_vqrdmlah:
@@ -19072,6 +19077,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
       return SDValue();
     break;
   }
+  case ARMISD::VBSP:
+    if (N->getOperand(1) == N->getOperand(2))
+      return N->getOperand(1);
+    return SDValue();
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (N->getConstantOperandVal(1)) {
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index fcabc9076e4d30..48dcbdb137123a 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -5524,26 +5524,23 @@ def : Pat<(v16i8 (vnotq QPR:$src)),
 // with different register constraints; it just inserts copies.
 // That is why pseudo VBSP implemented. Is is expanded later into
 // VBIT/VBIF/VBSL taking into account register constraints to avoid copies.
-def  VBSPd
-  : PseudoNeonI<(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
-                IIC_VBINiD, "",
-                [(set DPR:$Vd,
-                      (v2i32 (NEONvbsp DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+def  VBSPd : PseudoNeonI<(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                         IIC_VBINiD, "", []>;
 let Predicates = [HasNEON] in {
-def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
-                                   (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
+def : Pat<(v8i8 (NEONvbsp (v8i8 DPR:$src1),
+                          (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
           (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
-                                    (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
+def : Pat<(v4i16 (NEONvbsp (v4i16 DPR:$src1),
+                           (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
           (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
-                                    (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
+def : Pat<(v2i32 (NEONvbsp (v2i32 DPR:$src1),
+                           (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
           (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1),
-                                    (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
+def : Pat<(v2f32 (NEONvbsp (v2f32 DPR:$src1),
+                           (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
           (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
-                                    (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
+def : Pat<(v1i64 (NEONvbsp (v1i64 DPR:$src1),
+                           (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
           (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
 
 def : Pat<(v8i8 (or (and DPR:$Vn, DPR:$Vd),
@@ -5560,26 +5557,23 @@ def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
           (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
 }
 
-def  VBSPq
-  : PseudoNeonI<(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
-                IIC_VBINiQ, "",
-                [(set QPR:$Vd,
-                      (v4i32 (NEONvbsp QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+def  VBSPq : PseudoNeonI<(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                         IIC_VBINiQ, "", []>;
 let Predicates = [HasNEON] in {
-def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
-                                   (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
+def : Pat<(v16i8 (NEONvbsp (v16i8 QPR:$src1),
+                           (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
           (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
-                                    (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
+def : Pat<(v8i16 (NEONvbsp (v8i16 QPR:$src1),
+                           (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
           (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
-                                    (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
+def : Pat<(v4i32 (NEONvbsp (v4i32 QPR:$src1),
+                           (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
           (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1),
-                                    (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
+def : Pat<(v4f32 (NEONvbsp (v4f32 QPR:$src1),
+                           (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
           (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
-def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
-                                    (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
+def : Pat<(v2i64 (NEONvbsp (v2i64 QPR:$src1),
+                           (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
           (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
 
 def : Pat<(v16i8 (or (and QPR:$Vn, QPR:$Vd),
diff --git a/llvm/test/CodeGen/ARM/vbsl.ll b/llvm/test/CodeGen/ARM/vbsl.ll
index d5aaf3e6f30bd3..0ef725fc91b547 100644
--- a/llvm/test/CodeGen/ARM/vbsl.ll
+++ b/llvm/test/CodeGen/ARM/vbsl.ll
@@ -264,8 +264,7 @@ define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounw
 define <8 x i8> @same_param_all(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: same_param_all:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vorr d0, d1, d1
-; CHECK-NEXT:    vbsl d0, d1, d1
+; CHECK-NEXT:    vmov.f64 d0, d1
 ; CHECK-NEXT:    bx lr
   %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %b, <8 x i8> %b, <8 x i8> %b)
   ret <8 x i8> %vbsl.i
@@ -274,7 +273,7 @@ define <8 x i8> @same_param_all(<8 x i8> %a, <8 x i8> %b) {
 define <8 x i8> @same_param_12(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: same_param_12:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vbsl d0, d1, d1
+; CHECK-NEXT:    vmov.f64 d0, d1
 ; CHECK-NEXT:    bx lr
   %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %b)
   ret <8 x i8> %vbsl.i