[llvm] [SVE][InstCombine] Fold ld1d and splice into ld1ro (PR #69565)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 19 06:42:16 PDT 2023
https://github.com/vfdff updated https://github.com/llvm/llvm-project/pull/69565
From 8d12f47401be9f90905853bdffe18b5dbda23491 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Wed, 18 Oct 2023 21:31:03 -0400
Subject: [PATCH] [SVE][InstCombine] Fold ld1d and splice into ld1ro
Perform the transform when the value loaded by the ld1d is only used by the splice.
Fixes https://github.com/llvm/llvm-project/issues/69440.
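For illustration, the intended fold (taken from the new test below, assuming a fixed
512-bit SVE vector length and +f64mm; the ld1d is modelled as a masked load in IR):

  %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
  %a    = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
  %res  = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)

becomes

  %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
  %res  = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pred, ptr %addr)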
---
.../AArch64/AArch64TargetTransformInfo.cpp | 32 ++++++++++++++++
.../AArch64/sve-intrinsic-ld1ro.ll | 37 +++++++++++++++++++
2 files changed, 69 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c5703b15d07d83c..8477c3ff84fd37b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1253,6 +1253,36 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
return IC.eraseInstFromFunction(II);
}
+static std::optional<Instruction *>
+instCombineSVESplice(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL,
+ const AArch64Subtarget *ST) {
+ Value *Pred = II.getOperand(0);
+ Value *VecOp0 = II.getOperand(1);
+ Value *VecOp1 = II.getOperand(2);
+ Value *PtrOp;
+ unsigned MinSVESize = ST->getMinSVEVectorSizeInBits();
+ unsigned MaxSVESize = ST->getMaxSVEVectorSizeInBits();
+ if (!ST->hasMatMulFP64() || VecOp0 != VecOp1 || MinSVESize != MaxSVESize ||
+ VecOp0->hasNUsesOrMore(3) ||
+ !match(VecOp0,
+ m_Intrinsic<Intrinsic::masked_load>(m_Value(PtrOp), m_Value(),
+ m_Specific(Pred), m_Zero())))
+ return std::nullopt;
+
+ unsigned BitsPerElt = II.getType()->getScalarSizeInBits();
+ unsigned HalfLen = MinSVESize / BitsPerElt / 2;
+ const APInt *CI;
+ // The ld1ro loads a contiguous 256 bits, so the whilelt count must be half the
+ // element count of a 512-bit vector.
+ if (!match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_whilelt>(m_Zero(),
+ m_APInt(CI))) ||
+ HalfLen == 0 || CI->getZExtValue() != HalfLen || MinSVESize != 512)
+ return std::nullopt;
+
+ CallInst *Res = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ld1ro,
+ {II.getType()}, {Pred, PtrOp});
+ return IC.replaceInstUsesWith(II, Res);
+}
+
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
switch (Intrinsic) {
case Intrinsic::aarch64_sve_fmul_u:
@@ -1889,6 +1919,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVELD1(IC, II, DL);
case Intrinsic::aarch64_sve_st1:
return instCombineSVEST1(IC, II, DL);
+ case Intrinsic::aarch64_sve_splice:
+ return instCombineSVESplice(IC, II, DL, ST);
case Intrinsic::aarch64_sve_sdiv:
return instCombineSVESDIV(IC, II);
case Intrinsic::aarch64_sve_sel:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
new file mode 100644
index 000000000000000..3aee8cb8e169f39
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mattr=+sve,+f64mm -passes=instcombine -aarch64-sve-vector-bits-min=512 -aarch64-sve-vector-bits-max=512 < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 2 x double> @combine_ld1ro_double(ptr %addr) {
+; CHECK-LABEL: @combine_ld1ro_double(
+; CHECK-NEXT: [[PRED:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
+; CHECK-NEXT: [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> [[PRED]], ptr [[ADDR:%.*]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[RES]]
+;
+ %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4) ; half = 512/bits(type double)/2 = 4
+ %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %res
+}
+
+; Negative test: More than 2 uses
+define <vscale x 2 x double> @combine_ld1ro_double_3uses(ptr %addr) {
+; CHECK-LABEL: @combine_ld1ro_double_3uses(
+; CHECK-NEXT: [[PRED:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
+; CHECK-NEXT: [[A:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[ADDR:%.*]], i32 8, <vscale x 2 x i1> [[PRED]], <vscale x 2 x double> zeroinitializer)
+; CHECK-NEXT: call void @use_double(<vscale x 2 x double> [[A]])
+; CHECK-NEXT: [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> [[PRED]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[RES]]
+;
+ %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4) ; half = 512/bits(type double)/2 = 4
+ %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+ call void @use_double(<vscale x 2 x double> %a)
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %res
+}
+
+declare void @use_double(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64, i64)