[clang] [CodeGenPrepare] Transform ldexp into target-supported intrinsics (PR #67552)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Sep 27 06:02:38 PDT 2023
https://github.com/huhu233 created https://github.com/llvm/llvm-project/pull/67552
Some targets have more efficient implementations of llvm.ldexp. This patch transforms llvm.ldexp calls into target-supported intrinsics before lowering.
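For illustration, on an AArch64 target with SVE the new CodeGenPrepare hook rewrites a scalar ldexp into an fscale on lane 0 of a scalable vector. A rough sketch of the resulting IR for f64 (value names are illustrative, not taken from the patch):

  ; before
  %r = call double @llvm.ldexp.f64.i32(double %x, i32 %e)

  ; after (sketch)
  %e64  = sext i32 %e to i64
  %vexp = insertelement <vscale x 2 x i64> poison, i64 %e64, i64 0
  %vx   = insertelement <vscale x 2 x double> poison, double %x, i64 0
  %pg   = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) ; 31 = SV_ALL
  %vr   = call <vscale x 2 x double> @llvm.aarch64.sve.fscale.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %vx, <vscale x 2 x i64> %vexp)
  %r    = extractelement <vscale x 2 x double> %vr, i64 0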
From dce0b8f8a76da24b27dd6ed61087e97b5f9415ec Mon Sep 17 00:00:00 2001
From: TiehuZhang <zhangtiehu at huawei.com>
Date: Wed, 27 Sep 2023 20:30:16 +0800
Subject: [PATCH] [CodeGenPrepare] Transform ldexp into target supported
intrinsics
Some targets have more efficient implementations of llvm.ldexp. This patch
transforms llvm.ldexp calls into target-supported intrinsics before lowering.
---
clang/lib/CodeGen/CGBuiltin.cpp | 3 +
clang/test/CodeGen/math-libcalls.c | 12 ++--
.../llvm/Analysis/TargetTransformInfo.h | 6 ++
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++
llvm/lib/CodeGen/CodeGenPrepare.cpp | 68 +++++++++++++++++++
.../AArch64/AArch64TargetTransformInfo.h | 7 ++
llvm/lib/Target/X86/X86TargetTransformInfo.h | 11 +++
.../CodeGenPrepare/AArch64/optimize-ldexp.ll | 46 +++++++++++++
.../CodeGenPrepare/X86/optimize-ldexp.ll | 38 +++++++++++
10 files changed, 191 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 04c0325c7fd038b..da01c34731386e0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2719,6 +2719,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
*this, E, Intrinsic::llrint,
Intrinsic::experimental_constrained_llrint));
+ case Builtin::BIldexp:
+ case Builtin::BIldexpf:
+ case Builtin::BIldexpl:
case Builtin::BI__builtin_ldexp:
case Builtin::BI__builtin_ldexpf:
case Builtin::BI__builtin_ldexpl:
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index 02df4fe5fea6018..a906bda4c88c958 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -71,15 +71,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
ldexp(f,f); ldexpf(f,f); ldexpl(f,f);
- // NO__ERRNO: declare double @ldexp(double noundef, i32 noundef) [[READNONE]]
- // NO__ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[READNONE]]
- // NO__ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[READNONE]]
+ // NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]]
+ // NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]]
+ // NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
// HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
// HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
// HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
- // HAS_MAYTRAP: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
- // HAS_MAYTRAP: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
- // HAS_MAYTRAP: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
+ // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ldexp.f64.i32(
+ // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ldexp.f32.i32(
+ // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ldexp.f80.i32(
modf(f,d); modff(f,fp); modfl(f,l);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ae595d2110457d..c8805aadf146874 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1669,6 +1669,7 @@ class TargetTransformInfo {
/// \return The maximum number of function arguments the target supports.
unsigned getMaxNumArgs() const;
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const;
/// @}
private:
@@ -2035,6 +2036,7 @@ class TargetTransformInfo::Concept {
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
virtual unsigned getMaxNumArgs() const = 0;
+ virtual unsigned getTargetSupportedLdexpInst(Type *Ty) const = 0;
};
template <typename T>
@@ -2745,6 +2747,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getMaxNumArgs() const override {
return Impl.getMaxNumArgs();
}
+
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const override {
+ return Impl.getTargetSupportedLdexpInst(Ty);
+ }
};
template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 326c3130c6cff76..6d6a715f62b201c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -891,6 +891,8 @@ class TargetTransformInfoImplBase {
unsigned getMaxNumArgs() const { return UINT_MAX; }
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const { return 0; }
+
protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c751d174a48ab1f..6a58a146d0431f9 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1237,6 +1237,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
}
+unsigned TargetTransformInfo::getTargetSupportedLdexpInst(Type *Ty) const {
+ return TTIImpl->getTargetSupportedLdexpInst(Ty);
+}
+
TargetTransformInfo::Concept::~Concept() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index faee623d7c62fba..ce0c6b653e1c6c5 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -61,6 +61,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
@@ -486,6 +487,7 @@ class CodeGenPrepare : public FunctionPass {
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
void verifyBFIUpdates(Function &F);
+ void optimizeScalarLdexp(Instruction *Ldexp, Value *X, Value *Exp);
};
} // end anonymous namespace
@@ -2432,6 +2434,13 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
return optimizeGatherScatterInst(II, II->getArgOperand(0));
case Intrinsic::masked_scatter:
return optimizeGatherScatterInst(II, II->getArgOperand(1));
+ case Intrinsic::ldexp: {
+ // Vector versions of llvm.ldexp are not fully supported for all targets, so
+ // only the scalar version is handled for now.
+ if (!II->getType()->isVectorTy())
+ optimizeScalarLdexp(II, II->getArgOperand(0), II->getArgOperand(1));
+ break;
+ }
}
SmallVector<Value *, 2> PtrOps;
@@ -8667,3 +8676,62 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
}
return MadeChange;
}
+
+// Transform llvm.ldexp.T.i32(T x, i32 exp) into target-supported intrinsics.
+void CodeGenPrepare::optimizeScalarLdexp(Instruction *Ldexp, Value *X,
+ Value *Exp) {
+ auto IID = TTI->getTargetSupportedLdexpInst(X->getType());
+ if (IID == 0)
+ return;
+
+ unsigned XScalarSize = X->getType()->getScalarSizeInBits();
+ // Target-related intrinsics for ldexp.f128 are not well supported, so
+ // filter out that case for now.
+ if (XScalarSize > 64)
+ return;
+ unsigned VL = 128 / XScalarSize;
+
+ IRBuilder<> B(Ldexp);
+ LLVMContext &C = Ldexp->getModule()->getContext();
+ Type *VXTy = nullptr, *VExpTy = nullptr;
+ Value *VX = nullptr, *VExp = nullptr, *CvtExp = nullptr;
+ Value *Ret = nullptr, *Pg = nullptr;
+ ElementCount EC;
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::aarch64_sve_fscale: {
+ EC = ElementCount::get(VL, true);
+ CvtExp = Exp;
+ if (X->getType() == Type::getDoubleTy(C))
+ CvtExp = B.CreateSExt(Exp, Type::getInt64Ty(C));
+ VExpTy = VectorType::get(CvtExp->getType(), EC);
+ VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+ VXTy = VectorType::get(X->getType(), EC);
+ VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+ Type *PTy = VectorType::get(Type::getInt1Ty(C), EC);
+ Constant *True = ConstantInt::get(Type::getInt32Ty(C), 31);
+ Pg = B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PTy}, {True});
+ Value *FScale = B.CreateIntrinsic(IID, {VXTy}, {Pg, VX, VExp});
+ Ret = B.CreateExtractElement(FScale, (uint64_t)0);
+ Ldexp->replaceAllUsesWith(Ret);
+ break;
+ }
+ case Intrinsic::x86_avx512_mask_scalef_ss:
+ case Intrinsic::x86_avx512_mask_scalef_sd: {
+ EC = ElementCount::get(VL, false);
+ CvtExp = B.CreateSIToFP(Exp, X->getType());
+ VExpTy = VectorType::get(CvtExp->getType(), EC);
+ VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+ VXTy = VectorType::get(X->getType(), EC);
+ VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+ Pg = ConstantInt::get(Type::getInt8Ty(C), -1);
+ Constant *Round = ConstantInt::get(Type::getInt32Ty(C), 4);
+ Value *Scalef =
+ B.CreateIntrinsic(IID, std::nullopt, {VX, VExp, VX, Pg, Round});
+ Ret = B.CreateExtractElement(Scalef, (uint64_t)0);
+ Ldexp->replaceAllUsesWith(Ret);
+ break;
+ }
+ }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..5190572b3d386da 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include <cstdint>
#include <optional>
@@ -412,6 +413,12 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
}
+
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+ if (!ST->hasSVE())
+ return 0;
+ return Intrinsic::aarch64_sve_fscale;
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 0fa0d240a548b96..4ceada4e756f6f5 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -19,6 +19,7 @@
#include "X86TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include <optional>
namespace llvm {
@@ -285,6 +286,16 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
bool prefersVectorizedAddressing() const;
bool supportsEfficientVectorElementLoadStore() const;
bool enableInterleavedAccessVectorization();
+ unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+ if (!ST->hasAVX512())
+ return 0;
+ if (Ty->isFloatTy())
+ return Intrinsic::x86_avx512_mask_scalef_ss;
+ else if (Ty->isDoubleTy())
+ return Intrinsic::x86_avx512_mask_scalef_sd;
+ else
+ return 0;
+ }
private:
bool supportsGather() const;
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
new file mode 100644
index 000000000000000..77605844450d006
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64: // %bb.0: // %entry
+; AARCH64-NEXT: ptrue p0.d
+; AARCH64-NEXT: // kill: def $w0 killed $w0 def $x0
+; AARCH64-NEXT: sxtw x8, w0
+; AARCH64-NEXT: // kill: def $d0 killed $d0 def $z0
+; AARCH64-NEXT: fmov d1, x8
+; AARCH64-NEXT: fscale z0.d, p0/m, z0.d, z1.d
+; AARCH64-NEXT: // kill: def $d0 killed $d0 killed $z0
+; AARCH64-NEXT: ret
+entry:
+ %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+ ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64: // %bb.0: // %entry
+; AARCH64-NEXT: ptrue p0.s
+; AARCH64-NEXT: fmov s1, w0
+; AARCH64-NEXT: // kill: def $s0 killed $s0 def $z0
+; AARCH64-NEXT: fscale z0.s, p0/m, z0.s, z1.s
+; AARCH64-NEXT: // kill: def $s0 killed $s0 killed $z0
+; AARCH64-NEXT: ret
+entry:
+ %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+ ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target-related intrinsics for f128 are not well supported, so a call to
+; ldexpl is emitted for now.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64: // %bb.0: // %entry
+; AARCH64-NEXT: b ldexpl
+entry:
+ %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+ ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
new file mode 100644
index 000000000000000..97dd7bd80aa43b1
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64 -mattr=+avx512f < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64: # %bb.0: # %entry
+; AARCH64-NEXT: vcvtsi2sd %edi, %xmm1, %xmm1
+; AARCH64-NEXT: vscalefsd %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT: retq
+entry:
+ %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+ ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64: # %bb.0: # %entry
+; AARCH64-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1
+; AARCH64-NEXT: vscalefss %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT: retq
+entry:
+ %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+ ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target-related intrinsics for f128 are not well supported, so a call to
+; ldexpl is emitted for now.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64: # %bb.0: # %entry
+; AARCH64-NEXT: jmp ldexpl@PLT # TAILCALL
+entry:
+ %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+ ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)
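For comparison, a rough sketch of the IR the X86 path above produces for f64 with AVX-512 (value names are illustrative; i8 -1 is the all-ones mask and i32 4 selects the current rounding direction):

  %ef   = sitofp i32 %e to double
  %vexp = insertelement <2 x double> poison, double %ef, i64 0
  %vx   = insertelement <2 x double> poison, double %x, i64 0
  %vr   = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %vx, <2 x double> %vexp, <2 x double> %vx, i8 -1, i32 4)
  %r    = extractelement <2 x double> %vr, i64 0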