[llvm] [AArch64] optimise SVE cmp intrinsics with no active lanes (PR #104779)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 19 06:41:40 PDT 2024
https://github.com/Lukacma created https://github.com/llvm/llvm-project/pull/104779
This patch extends https://github.com/llvm/llvm-project/pull/73964 by optimising the SVE cmp intrinsics to a zero vector when the governing predicate is all-false (no active lanes).
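For example, in the style of the new tests, a compare whose governing predicate is zeroinitializer can produce no active result lanes, so instcombine now folds the call away entirely:

  ; Before instcombine: the governing predicate is all-false, so no lane can be active.
  %r = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)

  ; After instcombine: the call is replaced by an all-false predicate result.
  ret <vscale x 16 x i1> zeroinitializer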
From a6e8a219da3769f2b59887f2ab553927dd7a7c13 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Mon, 19 Aug 2024 12:50:33 +0000
Subject: [PATCH] [AArch64] optimise SVE cmp intrinsics with no active lanes
---
.../AArch64/AArch64TargetTransformInfo.cpp | 23 ++
.../sve-intrinsic-comb-no-active-lanes-cmp.ll | 235 ++++++++++++++++++
2 files changed, 258 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cmp.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a416565392eabe..0254ede383d255 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1160,6 +1160,10 @@ static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
IntrinsicInst &II) {
LLVMContext &Ctx = II.getContext();
+ // Replace with a zero constant when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+ return II_NA;
+
// Check that the predicate is all active
auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
@@ -2131,6 +2135,25 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_st4:
case Intrinsic::aarch64_sve_st4q:
return instCombineSVENoActiveUnaryErase(IC, II, 4);
+ case Intrinsic::aarch64_sve_cmpeq:
+ case Intrinsic::aarch64_sve_cmpeq_wide:
+ case Intrinsic::aarch64_sve_cmpge:
+ case Intrinsic::aarch64_sve_cmpge_wide:
+ case Intrinsic::aarch64_sve_cmpgt:
+ case Intrinsic::aarch64_sve_cmpgt_wide:
+ case Intrinsic::aarch64_sve_cmphi:
+ case Intrinsic::aarch64_sve_cmphi_wide:
+ case Intrinsic::aarch64_sve_cmphs:
+ case Intrinsic::aarch64_sve_cmphs_wide:
+ case Intrinsic::aarch64_sve_cmple_wide:
+ case Intrinsic::aarch64_sve_cmplo_wide:
+ case Intrinsic::aarch64_sve_cmpls_wide:
+ case Intrinsic::aarch64_sve_cmplt_wide:
+ case Intrinsic::aarch64_sve_fcmpeq:
+ case Intrinsic::aarch64_sve_fcmpge:
+ case Intrinsic::aarch64_sve_fcmpgt:
+ case Intrinsic::aarch64_sve_fcmpne:
+ case Intrinsic::aarch64_sve_fcmpuo:
case Intrinsic::aarch64_sve_ld1_gather:
case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
case Intrinsic::aarch64_sve_ld1_gather_sxtw:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cmp.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cmp.ll
new file mode 100644
index 00000000000000..769cadd3ffa729
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cmp.ll
@@ -0,0 +1,235 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i1> @test_cmpeq(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpeq(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpeq_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpeq_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpge(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpge(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpge_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpge_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpgt(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpgt(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpgt.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpgt_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpgt_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpgt.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmphi(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphi(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphi.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmphi_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphi_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphi.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmphs(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphs(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphs.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmphs_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphs_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphs.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmple_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmple_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmple.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmplo_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmplo_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmplo.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpls_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpls_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpls.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmplt_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmplt_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmplt.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpne(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpne(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpne_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpne_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpeq(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpeq(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpeq.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpge(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpge(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpge.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpgt(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpgt(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpgt.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpne(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpne(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpne.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpuo(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpuo(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpuo.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 8 x i1> %0
+}
+