[llvm] [SVE][InstCombine] Fold ld1d and splice into ld1ro (PR #69565)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 2 21:42:15 PDT 2023
https://github.com/vfdff updated https://github.com/llvm/llvm-project/pull/69565
From 4c210da47c9adc452b6091cb730fa036865cb99d Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Wed, 18 Oct 2023 21:31:03 -0400
Subject: [PATCH] [SVE][InstCombine] Fold ld1d and splice into ld1ro
Perform the transform when the value loaded by the ld1d is only used by the splice.
Fixes https://github.com/llvm/llvm-project/issues/69440.
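In IR terms, the fold rewrites the pattern exercised by the new test below (a simplified sketch of the same shape):

  %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
  %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)

into a single octaword load:

  %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pred, ptr %addr)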
---
.../AArch64/AArch64TargetTransformInfo.cpp | 35 +++++++++++++++++
.../AArch64/sve-intrinsic-ld1ro.ll | 38 +++++++++++++++++++
2 files changed, 73 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0eaa3e817c0b62d..e2b43285665da07 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1358,6 +1358,39 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
return IC.eraseInstFromFunction(II);
}
+static std::optional<Instruction *>
+instCombineSVESplice(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL,
+ const AArch64Subtarget *ST) {
+ Value *Pred = II.getOperand(0);
+ Value *VecOp0 = II.getOperand(1);
+ Value *VecOp1 = II.getOperand(2);
+ const auto *F = II.getFunction();
+ ConstantRange CR = getVScaleRange(F, 64);
+ const APInt *C = CR.getSingleElement();
+ Value *PtrOp;
+
+ // The ld1ro loads a contiguous 256 bits, so half of a 512-bit vector (vscale=4) matches it.
+ if (!ST->hasMatMulFP64() || VecOp0 != VecOp1 || !C ||
+ C->getZExtValue() != 4 || !VecOp0->hasNUses(2) ||
+ !match(VecOp0,
+ m_Intrinsic<Intrinsic::masked_load>(m_Value(PtrOp), m_Value(),
+ m_Specific(Pred), m_Zero())))
+ return std::nullopt;
+
+ unsigned BitsPerElt = II.getType()->getScalarSizeInBits();
+ unsigned VecLen = C->getZExtValue() * 128;
+ unsigned HalfLane = VecLen / BitsPerElt / 2;
+ const APInt *CI;
+ if (!match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_whilelt>(m_Zero(),
+ m_APInt(CI))) ||
+ HalfLane == 0 || CI->getZExtValue() != HalfLane)
+ return std::nullopt;
+
+ CallInst *Res = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ld1ro,
+ {II.getType()}, {Pred, PtrOp});
+ return IC.replaceInstUsesWith(II, Res);
+}
+
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
switch (Intrinsic) {
case Intrinsic::aarch64_sve_fmul_u:
@@ -1994,6 +2027,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVELD1(IC, II, DL);
case Intrinsic::aarch64_sve_st1:
return instCombineSVEST1(IC, II, DL);
+ case Intrinsic::aarch64_sve_splice:
+ return instCombineSVESplice(IC, II, DL, ST);
case Intrinsic::aarch64_sve_sdiv:
return instCombineSVESDIV(IC, II);
case Intrinsic::aarch64_sve_sel:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
new file mode 100644
index 000000000000000..5879afbcdf29840
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-ld1ro.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mattr=+sve,+f64mm -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; The ld1ro instruction loads a 256-bit (octaword) vector, so a half-length load at vscale=4 (512-bit vectors) satisfies this limitation.
+define <vscale x 2 x double> @combine_ld1ro_double(ptr %addr) vscale_range(4,4) {
+; CHECK-LABEL: @combine_ld1ro_double(
+; CHECK-NEXT: [[PRED:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
+; CHECK-NEXT: [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> [[PRED]], ptr [[ADDR:%.*]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[RES]]
+;
+ %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4) ; half = 512/bits(type double)/2 = 4
+ %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %res
+}
+
+; Negative test: More than 2 uses
+define <vscale x 2 x double> @combine_ld1ro_double_3uses(ptr %addr) vscale_range(4,4) {
+; CHECK-LABEL: @combine_ld1ro_double_3uses(
+; CHECK-NEXT: [[PRED:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4)
+; CHECK-NEXT: [[A:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[ADDR:%.*]], i32 8, <vscale x 2 x i1> [[PRED]], <vscale x 2 x double> zeroinitializer)
+; CHECK-NEXT: call void @use_double(<vscale x 2 x double> [[A]])
+; CHECK-NEXT: [[RES:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> [[PRED]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x double> [[RES]]
+;
+ %pred = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 4) ; half = 512/bits(type double)/2 = 4
+ %a = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred, <vscale x 2 x double> zeroinitializer)
+ call void @use_double(<vscale x 2 x double> %a)
+ %res = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double> %a, <vscale x 2 x double> %a)
+ ret <vscale x 2 x double> %res
+}
+
+declare void @use_double(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64, i64)
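For illustration, a hypothetical fragment (not part of this patch) that the HalfLane check in instCombineSVESplice would reject: with vscale=4 a <vscale x 2 x double> vector is 512 bits, so HalfLane = 512 / 64 / 2 = 4, and a whilelt count of 2 does not match, leaving the splice untouched:

  ; Hypothetical fragment, reusing the declarations above: whilelt count 2 != HalfLane 4, so no fold.
  %pred2 = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 0, i64 2)
  %a2 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %addr, i32 8, <vscale x 2 x i1> %pred2, <vscale x 2 x double> zeroinitializer)
  %res2 = call <vscale x 2 x double> @llvm.aarch64.sve.splice.nxv2f64(<vscale x 2 x i1> %pred2, <vscale x 2 x double> %a2, <vscale x 2 x double> %a2)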