[llvm] [AArch64] Optimized generated assembly for bool to svbool_t conversions (PR #83001)

Wed Feb 28 03:15:50 PST 2024

https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/83001

>From 3c4270c28d42ac798c3674bc51f16ca30ae0320d Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Mon, 26 Feb 2024 13:03:52 +0000
Subject: [PATCH 1/2] [AArch64] Optimized generated assembly for bool to
 svbool_t conversions

The original assembly was generating `AND(WHILELO, SPLAT 1)` pattern when only `WHILELO` was necessary
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  1 +
 .../AArch64/sve-intrinsics-reinterpret.ll     | 42 ++++++++++++++++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a3b7e3128ac1a4..dba3a787734721 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -276,6 +276,7 @@ static bool isZeroingInactiveLanes(SDValue Op) {
     if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
       return true;
     return false;
+  case ISD::SPLAT_VECTOR:
   case AArch64ISD::PTRUE:
   case AArch64ISD::SETCC_MERGE_ZERO:
     return true;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
index 82bf756f822898..c7c102f5d567d9 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
 
@@ -150,6 +150,46 @@ define <vscale x 16 x i1> @chained_reinterpret() {
   ret <vscale x 16 x i1> %out
 }
 
+define <vscale x 16 x i1> @reinterpret_scalar_bool_h(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.h, xzr, x8
+; CHECK-NEXT:    ret
+  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %x, i64 0
+  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %.splat)
+  ret <vscale x 16 x i1> %out
+}
+
+define <vscale x 16 x i1> @reinterpret_scalar_bool_s(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    ret
+  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %x, i64 0
+  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %.splat)
+  ret <vscale x 16 x i1> %out
+}
+
+define <vscale x 16 x i1> @reinterpret_scalar_bool_q(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_q:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    ret
+  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %x, i64 0
+  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %.splat)
+  ret <vscale x 16 x i1> %out
+}
+
+
 declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
 declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
 declare <vscale x 8 x i1> @llvm.aarch64.sve.cmpgt.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)

>From 8724b327c830ec9caa6ab75f6a21f1da49fcb641 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 28 Feb 2024 11:11:19 +0000
Subject: [PATCH 2/2] Cleaned up default behaviour of isZeroingInactiveLanes
 and added seeing through bitcasts

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dba3a787734721..5b1be2705d3740 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -269,13 +269,14 @@ static bool isMergePassthruOpcode(unsigned Opc) {
 
 // Returns true if inactive lanes are known to be zeroed by construction.
 static bool isZeroingInactiveLanes(SDValue Op) {
+  // Skip bitcasts nodes
+  while (Op->getOpcode() == ISD::BITCAST)
+    Op = Op->getOperand(0);
+    
   switch (Op.getOpcode()) {
   default:
-    // We guarantee i1 splat_vectors to zero the other lanes by
-    // implementing it with ptrue and possibly a punpklo for nxv1i1.
-    if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
-      return true;
     return false;
+  // We guarantee i1 splat_vectors to zero the other lanes
   case ISD::SPLAT_VECTOR:
   case AArch64ISD::PTRUE:
   case AArch64ISD::SETCC_MERGE_ZERO: