[clang] [CodeGenPrepare] Transform ldexp into target-supported intrinsics (PR #67552)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Sep 27 06:02:38 PDT 2023
https://github.com/huhu233 created https://github.com/llvm/llvm-project/pull/67552
Some targets have more efficient implementations of llvm.ldexp. This patch transforms llvm.ldexp calls into target-supported intrinsics before lowering.
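For illustration, on an AArch64 target with SVE the new CodeGenPrepare hook rewrites a scalar ldexp into an fscale on lane 0 of a scalable vector. A rough sketch of the resulting IR for f64 (value names are illustrative, not taken from the patch):

  ; before
  %r = call double @llvm.ldexp.f64.i32(double %x, i32 %e)

  ; after (sketch)
  %e64  = sext i32 %e to i64
  %vexp = insertelement <vscale x 2 x i64> poison, i64 %e64, i64 0
  %vx   = insertelement <vscale x 2 x double> poison, double %x, i64 0
  %pg   = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) ; 31 = SV_ALL
  %vr   = call <vscale x 2 x double> @llvm.aarch64.sve.fscale.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %vx, <vscale x 2 x i64> %vexp)
  %r    = extractelement <vscale x 2 x double> %vr, i64 0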
From dce0b8f8a76da24b27dd6ed61087e97b5f9415ec Mon Sep 17 00:00:00 2001
From: TiehuZhang <zhangtiehu at huawei.com>
Date: Wed, 27 Sep 2023 20:30:16 +0800
Subject: [PATCH] [CodeGenPrepare] Transform ldexp into target supported
intrinsics
Some targets have more efficient implementations of llvm.ldexp. This patch
transforms llvm.ldexp calls into target-supported intrinsics before lowering.
---
clang/lib/CodeGen/CGBuiltin.cpp | 3 +
clang/test/CodeGen/math-libcalls.c | 12 ++--
.../llvm/Analysis/TargetTransformInfo.h | 6 ++
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++
llvm/lib/CodeGen/CodeGenPrepare.cpp | 68 +++++++++++++++++++
.../AArch64/AArch64TargetTransformInfo.h | 7 ++
llvm/lib/Target/X86/X86TargetTransformInfo.h | 11 +++
.../CodeGenPrepare/AArch64/optimize-ldexp.ll | 46 +++++++++++++
.../CodeGenPrepare/X86/optimize-ldexp.ll | 38 +++++++++++
10 files changed, 191 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 04c0325c7fd038b..da01c34731386e0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2719,6 +2719,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
*this, E, Intrinsic::llrint,
Intrinsic::experimental_constrained_llrint));
+ case Builtin::BIldexp:
+ case Builtin::BIldexpf:
+ case Builtin::BIldexpl:
case Builtin::BI__builtin_ldexp:
case Builtin::BI__builtin_ldexpf:
case Builtin::BI__builtin_ldexpl:
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index 02df4fe5fea6018..a906bda4c88c958 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -71,15 +71,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
ldexp(f,f); ldexpf(f,f); ldexpl(f,f);
- // NO__ERRNO: declare double @ldexp(double noundef, i32 noundef) [[READNONE]]
- // NO__ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[READNONE]]
- // NO__ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[READNONE]]
+ // NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]]
+ // NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]]
+ // NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
// HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
// HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
// HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
- // HAS_MAYTRAP: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
- // HAS_MAYTRAP: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
- // HAS_MAYTRAP: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
+ // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ldexp.f64.i32(
+ // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ldexp.f32.i32(
+ // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ldexp.f80.i32(
modf(f,d); modff(f,fp); modfl(f,l);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ae595d2110457d..c8805aadf146874 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1669,6 +1669,7 @@ class TargetTransformInfo {
/// \return The maximum number of function arguments the target supports.
unsigned getMaxNumArgs() const;
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const;
/// @}
private:
@@ -2035,6 +2036,7 @@ class TargetTransformInfo::Concept {
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
virtual unsigned getMaxNumArgs() const = 0;
+ virtual unsigned getTargetSupportedLdexpInst(Type *Ty) const = 0;
};
template <typename T>
@@ -2745,6 +2747,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getMaxNumArgs() const override {
return Impl.getMaxNumArgs();
}
+
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const override {
+ return Impl.getTargetSupportedLdexpInst(Ty);
+ }
};
template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 326c3130c6cff76..6d6a715f62b201c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -891,6 +891,8 @@ class TargetTransformInfoImplBase {
unsigned getMaxNumArgs() const { return UINT_MAX; }
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const { return 0; }
+
protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c751d174a48ab1f..6a58a146d0431f9 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1237,6 +1237,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
}
+unsigned TargetTransformInfo::getTargetSupportedLdexpInst(Type *Ty) const {
+ return TTIImpl->getTargetSupportedLdexpInst(Ty);
+}
+
TargetTransformInfo::Concept::~Concept() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index faee623d7c62fba..ce0c6b653e1c6c5 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -61,6 +61,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
@@ -486,6 +487,7 @@ class CodeGenPrepare : public FunctionPass {
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
void verifyBFIUpdates(Function &F);
+ void optimizeScalarLdexp(Instruction *Ldexp, Value *X, Value *Exp);
};
} // end anonymous namespace
@@ -2432,6 +2434,13 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
return optimizeGatherScatterInst(II, II->getArgOperand(0));
case Intrinsic::masked_scatter:
return optimizeGatherScatterInst(II, II->getArgOperand(1));
+ case Intrinsic::ldexp: {
+ // Vector versions of llvm.ldexp are not fully supported for all targets, so
+ // only the scalar version is handled for now.
+ if (!II->getType()->isVectorTy())
+ optimizeScalarLdexp(II, II->getArgOperand(0), II->getArgOperand(1));
+ break;
+ }
}
SmallVector<Value *, 2> PtrOps;
@@ -8667,3 +8676,62 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
}
return MadeChange;
}
+
+// Transform llvm.ldexp.T.i32(T x, i32 exp) into target-supported intrinsics.
+void CodeGenPrepare::optimizeScalarLdexp(Instruction *Ldexp, Value *X,
+ Value *Exp) {
+ auto IID = TTI->getTargetSupportedLdexpInst(X->getType());
+ if (IID == 0)
+ return;
+
+ unsigned XScalarSize = X->getType()->getScalarSizeInBits();
+ // Target-related intrinsics for ldexp.f128 are not well supported, so
+ // filter out that case for now.
+ if (XScalarSize > 64)
+ return;
+ unsigned VL = 128 / XScalarSize;
+
+ IRBuilder<> B(Ldexp);
+ LLVMContext &C = Ldexp->getModule()->getContext();
+ Type *VXTy = nullptr, *VExpTy = nullptr;
+ Value *VX = nullptr, *VExp = nullptr, *CvtExp = nullptr;
+ Value *Ret = nullptr, *Pg = nullptr;
+ ElementCount EC;
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::aarch64_sve_fscale: {
+ EC = ElementCount::get(VL, true);
+ CvtExp = Exp;
+ if (X->getType() == Type::getDoubleTy(C))
+ CvtExp = B.CreateSExt(Exp, Type::getInt64Ty(C));
+ VExpTy = VectorType::get(CvtExp->getType(), EC);
+ VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+ VXTy = VectorType::get(X->getType(), EC);
+ VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+ Type *PTy = VectorType::get(Type::getInt1Ty(C), EC);
+ Constant *True = ConstantInt::get(Type::getInt32Ty(C), 31);
+ Pg = B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PTy}, {True});
+ Value *FScale = B.CreateIntrinsic(IID, {VXTy}, {Pg, VX, VExp});
+ Ret = B.CreateExtractElement(FScale, (uint64_t)0);
+ Ldexp->replaceAllUsesWith(Ret);
+ break;
+ }
+ case Intrinsic::x86_avx512_mask_scalef_ss:
+ case Intrinsic::x86_avx512_mask_scalef_sd: {
+ EC = ElementCount::get(VL, false);
+ CvtExp = B.CreateSIToFP(Exp, X->getType());
+ VExpTy = VectorType::get(CvtExp->getType(), EC);
+ VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+ VXTy = VectorType::get(X->getType(), EC);
+ VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+ Pg = ConstantInt::get(Type::getInt8Ty(C), -1);
+ Constant *Round = ConstantInt::get(Type::getInt32Ty(C), 4);
+ Value *Scalef =
+ B.CreateIntrinsic(IID, std::nullopt, {VX, VExp, VX, Pg, Round});
+ Ret = B.CreateExtractElement(Scalef, (uint64_t)0);
+ Ldexp->replaceAllUsesWith(Ret);
+ break;
+ }
+ }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..5190572b3d386da 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include <cstdint>
#include <optional>
@@ -412,6 +413,12 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
}
+
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+ if (!ST->hasSVE())
+ return 0;
+ return Intrinsic::aarch64_sve_fscale;
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 0fa0d240a548b96..4ceada4e756f6f5 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -19,6 +19,7 @@
#include "X86TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include <optional>
namespace llvm {
@@ -285,6 +286,16 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
bool prefersVectorizedAddressing() const;
bool supportsEfficientVectorElementLoadStore() const;
bool enableInterleavedAccessVectorization();
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+ if (!ST->hasAVX512())
+ return 0;
+ if (Ty->isFloatTy())
+ return Intrinsic::x86_avx512_mask_scalef_ss;
+ else if (Ty->isDoubleTy())
+ return Intrinsic::x86_avx512_mask_scalef_sd;
+ else
+ return 0;
+ }
private:
bool supportsGather() const;
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
new file mode 100644
index 000000000000000..77605844450d006
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64: // %bb.0: // %entry
+; AARCH64-NEXT: ptrue p0.d
+; AARCH64-NEXT: // kill: def $w0 killed $w0 def $x0
+; AARCH64-NEXT: sxtw x8, w0
+; AARCH64-NEXT: // kill: def $d0 killed $d0 def $z0
+; AARCH64-NEXT: fmov d1, x8
+; AARCH64-NEXT: fscale z0.d, p0/m, z0.d, z1.d
+; AARCH64-NEXT: // kill: def $d0 killed $d0 killed $z0
+; AARCH64-NEXT: ret
+entry:
+ %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+ ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64: // %bb.0: // %entry
+; AARCH64-NEXT: ptrue p0.s
+; AARCH64-NEXT: fmov s1, w0
+; AARCH64-NEXT: // kill: def $s0 killed $s0 def $z0
+; AARCH64-NEXT: fscale z0.s, p0/m, z0.s, z1.s
+; AARCH64-NEXT: // kill: def $s0 killed $s0 killed $z0
+; AARCH64-NEXT: ret
+entry:
+ %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+ ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target-related intrinsics for f128 are not well supported, so a call to
+; ldexpl is emitted for now.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64: // %bb.0: // %entry
+; AARCH64-NEXT: b ldexpl
+entry:
+ %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+ ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
new file mode 100644
index 000000000000000..97dd7bd80aa43b1
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64 -mattr=+avx512f < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64: # %bb.0: # %entry
+; AARCH64-NEXT: vcvtsi2sd %edi, %xmm1, %xmm1
+; AARCH64-NEXT: vscalefsd %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT: retq
+entry:
+ %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+ ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64: # %bb.0: # %entry
+; AARCH64-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1
+; AARCH64-NEXT: vscalefss %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT: retq
+entry:
+ %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+ ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target-related intrinsics for f128 are not well supported, so a call to
+; ldexpl is emitted for now.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64: # %bb.0: # %entry
+; AARCH64-NEXT: jmp ldexpl@PLT # TAILCALL
+entry:
+ %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+ ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)
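For comparison, a rough sketch of the IR the X86 path above produces for f64 with AVX-512 (value names are illustrative; i8 -1 is the all-ones mask and i32 4 selects the current rounding direction):

  %ef   = sitofp i32 %e to double
  %vexp = insertelement <2 x double> poison, double %ef, i64 0
  %vx   = insertelement <2 x double> poison, double %x, i64 0
  %vr   = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %vx, <2 x double> %vexp, <2 x double> %vx, i8 -1, i32 4)
  %r    = extractelement <2 x double> %vr, i64 0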