[llvm] [SVE][InstCombine] Fold ld1d and splice into ld1ro (PR #69565)

via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 19 00:16:01 PDT 2023


https://github.com/vfdff created https://github.com/llvm/llvm-project/pull/69565

Perform the transform when the value loaded by the ld1d is only used by the splice. Fixes https://github.com/llvm/llvm-project/issues/69440.
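In IR terms, this is the intended rewrite (a minimal sketch mirroring the test added below; the concrete vector type depends on the element type being loaded):

  ; before
  %a   = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)

  ; after (requires +f64mm)
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pred, ptr %addr)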

From 0f231e6d8dd209428b80759b671b4082ca39ed86 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Wed, 18 Oct 2023 21:31:03 -0400
Subject: [PATCH] [SVE][InstCombine] Fold ld1d and splice into ld1ro

Perform the transform when the value loaded by the ld1d is only used by the splice.
Fixes https://github.com/llvm/llvm-project/issues/69440.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 20 ++++++++++++
 .../AArch64/sve-intrinsic-ld1ro.ll            | 32 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c5703b15d07d83c..b945126663d708e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1253,6 +1253,24 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
   return IC.eraseInstFromFunction(II);
 }
 
+static std::optional<Instruction *>
+instCombineSVESplice(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL,
+                     const AArch64Subtarget *ST) {
+  Value *Pred = II.getOperand(0);
+  Value *VecOp0 = II.getOperand(1);
+  Value *VecOp1 = II.getOperand(2);
+  Value *PtrOp;
+  if (!ST->hasMatMulFP64() || VecOp0 != VecOp1 || VecOp0->hasNUsesOrMore(3) ||
+      !match(VecOp0,
+             m_Intrinsic<Intrinsic::masked_load>(m_Value(PtrOp), m_Value(),
+                                                 m_Specific(Pred), m_Zero())))
+    return std::nullopt;
+
+  CallInst *Res = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ld1ro,
+                                             {II.getType()}, {Pred, PtrOp});
+  return IC.replaceInstUsesWith(II, Res);
+}
+
 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
   switch (Intrinsic) {
   case Intrinsic::aarch64_sve_fmul_u:
@@ -1889,6 +1907,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     return instCombineSVELD1(IC, II, DL);
   case Intrinsic::aarch64_sve_st1:
     return instCombineSVEST1(IC, II, DL);
+  case Intrinsic::aarch64_sve_splice:
+    return instCombineSVESplice(IC, II, DL, ST);
   case Intrinsic::aarch64_sve_sdiv:
     return instCombineSVESDIV(IC, II);
   case Intrinsic::aarch64_sve_sel:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
new file mode 100644
index 000000000000000..f50caf71e0372cc
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mattr=+sve,+f64mm -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 2 x double> @combine_ld1ro_double(<vscale x 2 x i1> %pred, ptr %addr) {
+; CHECK-LABEL: @combine_ld1ro_double(
+; CHECK-NEXT:    [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> [[PRED:%.*]], ptr [[ADDR:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[RES]]
+;
+  %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+  ret <vscale x 2 x double> %res
+}
+
+; Negative test: More than 2 uses
+define <vscale x 2 x double> @combine_ld1ro_double_3uses(<vscale x 2 x i1> %pred, ptr %addr) {
+; CHECK-LABEL: @combine_ld1ro_double_3uses(
+; CHECK-NEXT:    [[A:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[ADDR:%.*]], i32 8, <vscale x 2 x i1> [[PRED:%.*]], <vscale x 2 x double> zeroinitializer)
+; CHECK-NEXT:    call void @use_double(<vscale x 2 x double> [[A]])
+; CHECK-NEXT:    [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> [[PRED]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[A]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[RES]]
+;
+  %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+  call void @use_double(<vscale x 2 x double> %a)
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+  ret <vscale x 2 x double> %res
+}
+
+declare void @use_double(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
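
For reference, the new combine only fires when the subtarget has +f64mm, both vector operands of the splice are the same masked load, that load has at most two uses and a zero passthru, and the load's mask is the same value as the splice's predicate (m_Specific(Pred)). So a hypothetical pattern like the one below, where the splice uses a different predicate than the load, is left alone (this case is not among the patch's tests):

  %a   = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred_a, <vscale x 2 x double> zeroinitializer)
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred_b, <vscale x 2 x double> %a, <vscale x 2 x double> %a)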


