[llvm] [SVE][InstCombine] Fold ld1d and splice into ld1ro (PR #69565)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 19 06:42:16 PDT 2023
https://github.com/vfdff updated https://github.com/llvm/llvm-project/pull/69565
From 8d12f47401be9f90905853bdffe18b5dbda23491 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Wed, 18 Oct 2023 21:31:03 -0400
Subject: [PATCH] [SVE][InstCombine] Fold ld1d and splice into ld1ro
Perform the transform when the value loaded by the ld1d is only used by the splice.
Fixes https://github.com/llvm/llvm-project/issues/69440.
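For illustration, the intended fold (taken from the new test below, assuming a fixed
512-bit SVE vector length and +f64mm; the ld1d is modelled as a masked load in IR):

  %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
  %a    = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
  %res  = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)

becomes

  %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
  %res  = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pred, ptr %addr)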
---
.../AArch64/AArch64TargetTransformInfo.cpp | 32 ++++++++++++++++
.../AArch64/sve-intrinsic-ld1ro.ll | 37 +++++++++++++++++++
2 files changed, 69 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c5703b15d07d83c..8477c3ff84fd37b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1253,6 +1253,36 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
return IC.eraseInstFromFunction(II);
}
+static std::optional<Instruction *>
+instCombineSVESplice(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL,
+ const AArch64Subtarget *ST) {
+ Value *Pred = II.getOperand(0);
+ Value *VecOp0 = II.getOperand(1);
+ Value *VecOp1 = II.getOperand(2);
+ Value *PtrOp;
+ unsigned MinSVESize = ST->getMinSVEVectorSizeInBits();
+ unsigned MaxSVESize = ST->getMaxSVEVectorSizeInBits();
+ if (!ST->hasMatMulFP64() || VecOp0 != VecOp1 || MinSVESize != MaxSVESize ||
+ VecOp0->hasNUsesOrMore(3) ||
+ !match(VecOp0,
+ m_Intrinsic<Intrinsic::masked_load>(m_Value(PtrOp), m_Value(),
+ m_Specific(Pred), m_Zero())))
+ return std::nullopt;
+
+ unsigned BitsPerElt = II.getType()->getScalarSizeInBits();
+ unsigned HalfLen = MinSVESize / BitsPerElt / 2;
+ const APInt *CI;
+ // The ld1ro loads a contiguous 256 bits, so the whilelt count must be half the
+ // element count of a 512-bit vector.
+ if (!match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_whilelt>(m_Zero(),
+ m_APInt(CI))) ||
+ HalfLen == 0 || CI->getZExtValue() != HalfLen || MinSVESize != 512)
+ return std::nullopt;
+
+ CallInst *Res = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ld1ro,
+ {II.getType()}, {Pred, PtrOp});
+ return IC.replaceInstUsesWith(II, Res);
+}
+
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
switch (Intrinsic) {
case Intrinsic::aarch64_sve_fmul_u:
@@ -1889,6 +1919,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVELD1(IC, II, DL);
case Intrinsic::aarch64_sve_st1:
return instCombineSVEST1(IC, II, DL);
+ case Intrinsic::aarch64_sve_splice:
+ return instCombineSVESplice(IC, II, DL, ST);
case Intrinsic::aarch64_sve_sdiv:
return instCombineSVESDIV(IC, II);
case Intrinsic::aarch64_sve_sel:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
new file mode 100644
index 000000000000000..3aee8cb8e169f39
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mattr=+sve,+f64mm -passes=instcombine -aarch64-sve-vector-bits-min=512 -aarch64-sve-vector-bits-max=512 < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 2 x double> @combine_ld1ro_double(ptr %addr) {
+; CHECK-LABEL: @combine_ld1ro_double(
+; CHECK-NEXT: [[PRED:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
+; CHECK-NEXT: [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> [[PRED]], ptr [[ADDR:%.*]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[RES]]
+;
+ %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4) ; half = 512/bits(type double)/2 = 4
+ %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %res
+}
+
+; Negative test: More than 2 uses
+define <vscale x 2 x double> @combine_ld1ro_double_3uses(ptr %addr) {
+; CHECK-LABEL: @combine_ld1ro_double_3uses(
+; CHECK-NEXT: [[PRED:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
+; CHECK-NEXT: [[A:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[ADDR:%.*]], i32 8, <vscale x 2 x i1> [[PRED]], <vscale x 2 x double> zeroinitializer)
+; CHECK-NEXT: call void @use_double(<vscale x 2 x double> [[A]])
+; CHECK-NEXT: [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> [[PRED]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[RES]]
+;
+ %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4) ; half = 512/bits(type double)/2 = 4
+ %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+ call void @use_double(<vscale x 2 x double> %a)
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %res
+}
+
+declare void @use_double(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64, i64)