[llvm] [SVE][InstCombine] Fold ld1d and splice into ld1ro (PR #69565)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 2 21:42:15 PDT 2023
https://github.com/vfdff updated https://github.com/llvm/llvm-project/pull/69565
From 4c210da47c9adc452b6091cb730fa036865cb99d Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Wed, 18 Oct 2023 21:31:03 -0400
Subject: [PATCH] [SVE][InstCombine] Fold ld1d and splice into ld1ro
Perform the transform when the value loaded by the ld1d is only used by the splice.
Fixes https://github.com/llvm/llvm-project/issues/69440.
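In IR terms, the fold rewrites the pattern exercised by the new test below (a simplified sketch of the same shape):

  %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
  %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)

into a single octaword load:

  %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pred, ptr %addr)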
---
.../AArch64/AArch64TargetTransformInfo.cpp | 35 +++++++++++++++++
.../AArch64/sve-intrinsic-ld1ro.ll | 38 +++++++++++++++++++
2 files changed, 73 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0eaa3e817c0b62d..e2b43285665da07 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1358,6 +1358,39 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
return IC.eraseInstFromFunction(II);
}
+static std::optional<Instruction *>
+instCombineSVESplice(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL,
+ const AArch64Subtarget *ST) {
+ Value *Pred = II.getOperand(0);
+ Value *VecOp0 = II.getOperand(1);
+ Value *VecOp1 = II.getOperand(2);
+ const auto *F = II.getFunction();
+ ConstantRange CR = getVScaleRange(F, 64);
+ const APInt *C = CR.getSingleElement();
+ Value *PtrOp;
+
+ // The ld1ro loads a contiguous 256 bits, so half of a 512-bit vector (vscale=4) matches it.
+ if (!ST->hasMatMulFP64() || VecOp0 != VecOp1 || !C ||
+ C->getZExtValue() != 4 || !VecOp0->hasNUses(2) ||
+ !match(VecOp0,
+ m_Intrinsic<Intrinsic::masked_load>(m_Value(PtrOp), m_Value(),
+ m_Specific(Pred), m_Zero())))
+ return std::nullopt;
+
+ unsigned BitsPerElt = II.getType()->getScalarSizeInBits();
+ unsigned VecLen = C->getZExtValue() * 128;
+ unsigned HalfLane = VecLen / BitsPerElt / 2;
+ const APInt *CI;
+ if (!match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_whilelt>(m_Zero(),
+ m_APInt(CI))) ||
+ HalfLane == 0 || CI->getZExtValue() != HalfLane)
+ return std::nullopt;
+
+ CallInst *Res = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ld1ro,
+ {II.getType()}, {Pred, PtrOp});
+ return IC.replaceInstUsesWith(II, Res);
+}
+
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
switch (Intrinsic) {
case Intrinsic::aarch64_sve_fmul_u:
@@ -1994,6 +2027,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVELD1(IC, II, DL);
case Intrinsic::aarch64_sve_st1:
return instCombineSVEST1(IC, II, DL);
+ case Intrinsic::aarch64_sve_splice:
+ return instCombineSVESplice(IC, II, DL, ST);
case Intrinsic::aarch64_sve_sdiv:
return instCombineSVESDIV(IC, II);
case Intrinsic::aarch64_sve_sel:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
new file mode 100644
index 000000000000000..5879afbcdf29840
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mattr=+sve,+f64mm -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; The ld1ro instruction loads a 256-bit (octaword) vector, so a half-length load at vscale=4 (512-bit vectors) satisfies this limitation.
+define <vscale x 2 x double> @combine_ld1ro_double(ptr %addr) vscale_range(4,4) {
+; CHECK-LABEL: @combine_ld1ro_double(
+; CHECK-NEXT: [[PRED:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
+; CHECK-NEXT: [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> [[PRED]], ptr [[ADDR:%.*]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[RES]]
+;
+ %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4) ; half = 512/bits(type double)/2 = 4
+ %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %res
+}
+
+; Negative test: More than 2 uses
+define <vscale x 2 x double> @combine_ld1ro_double_3uses(ptr %addr) vscale_range(4,4) {
+; CHECK-LABEL: @combine_ld1ro_double_3uses(
+; CHECK-NEXT: [[PRED:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
+; CHECK-NEXT: [[A:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[ADDR:%.*]], i32 8, <vscale x 2 x i1> [[PRED]], <vscale x 2 x double> zeroinitializer)
+; CHECK-NEXT: call void @use_double(<vscale x 2 x double> [[A]])
+; CHECK-NEXT: [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> [[PRED]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[RES]]
+;
+ %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4) ; half = 512/bits(type double)/2 = 4
+ %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+ call void @use_double(<vscale x 2 x double> %a)
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %res
+}
+
+declare void @use_double(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64, i64)
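For illustration, a hypothetical fragment (not part of this patch) that the HalfLane check in instCombineSVESplice would reject: with vscale=4 a <vscale x 2 x double> vector is 512 bits, so HalfLane = 512 / 64 / 2 = 4, and a whilelt count of 2 does not match, leaving the splice untouched:

  ; Hypothetical fragment, reusing the declarations above: whilelt count 2 != HalfLane 4, so no fold.
  %pred2 = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 2)
  %a2 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred2, <vscale x 2 x double> zeroinitializer)
  %res2 = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred2, <vscale x 2 x double> %a2, <vscale x 2 x double> %a2)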