[llvm] [TargetLowering] Lower ldexp into target supported instructions (PR #67552)

Fri Oct 20 04:17:48 PDT 2023

https://github.com/huhu233 updated https://github.com/llvm/llvm-project/pull/67552

>From 69fc42c3405d3a52fd8dae81e5e2595e5f1fdb61 Mon Sep 17 00:00:00 2001
From: huhu233 <1293577861 at qq.com>
Date: Sun, 8 Oct 2023 19:48:23 +0800
Subject: [PATCH] [AArch64] Lower mathlib call ldexp into fscale when sve is
 enabled

The SVE 'fscale' instruction is equivalent to the math library call ldexp but
performs better. This patch lowers ldexp to fscale when SVE is enabled.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 51 ++++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  1 +
 llvm/test/CodeGen/AArch64/ldexp.ll            | 66 +++++++++++++++++++
 3 files changed, 118 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/ldexp.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a16a102e472e709..3aba60cdba4239c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1642,6 +1642,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
+  if (Subtarget->hasSVE()) {
+    setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
+    setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
+    setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
+  }
+
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 
   IsStrictFPEnabled = true;
@@ -6217,6 +6223,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::FSHL:
   case ISD::FSHR:
     return LowerFunnelShift(Op, DAG);
+  case ISD::FLDEXP:
+    return LowerFLDEXP(Op, DAG);
   }
 }
 
@@ -26414,3 +26422,62 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
   }
   return true;
 }
+
+/// Custom-lower a scalar ISD::FLDEXP node to the SVE fscale intrinsic.
+///
+/// The value and the exponent are each placed in lane 0 of an otherwise
+/// undefined scalable vector, aarch64_sve_fscale is applied under an
+/// all-lanes predicate, and lane 0 of the result is extracted.
+///
+/// \param Op  The FLDEXP node: operand 0 is the value, operand 1 the exponent.
+/// \param DAG The SelectionDAG in which the replacement nodes are built.
+/// \returns The lowered value, or an empty SDValue when the result type is
+///          not f16, f32 or f64.
+SDValue AArch64TargetLowering::LowerFLDEXP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue X = Op.getOperand(0);
+  EVT XScalarTy = X.getValueType();
+  SDValue Exp = Op.getOperand(1);
+
+  SDLoc DL(Op);
+  EVT XVT, ExpVT;
+  switch (Op.getSimpleValueType().SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::f16:
+    // f16 is scaled as f32: extend here, then share the f32 setup below.
+    X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
+    [[fallthrough]];
+  case MVT::f32:
+    XVT = MVT::nxv4f32;
+    ExpVT = MVT::nxv4i32;
+    break;
+  case MVT::f64:
+    XVT = MVT::nxv2f64;
+    ExpVT = MVT::nxv2i64;
+    // Exp is inserted into an nxv2i64 vector below, so widen it to i64.
+    Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
+    break;
+  }
+
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  SDValue VX =
+      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
+  SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
+                             DAG.getUNDEF(ExpVT), Exp, Zero);
+  SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
+                         AArch64SVEPredPattern::all);
+  SDValue FScale =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
+                  DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
+                  VPg, VX, VExp);
+  SDValue Final =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
+  // Round back to the original type if X was extended above (the f16 case).
+  // NOTE(review): the FP_ROUND flag operand is 1 -- confirm that is intended,
+  // since the scaled f32 value may not be exactly representable in f16.
+  if (X.getValueType() != XScalarTy)
+    Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
+                        DAG.getIntPtrConstant(1, DL));
+  return Final;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 9dcfba3a229cccd..c67083a70b0fa02 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1136,6 +1136,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op,
                                               SelectionDAG &DAG) const;
+  SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;
diff --git a/llvm/test/CodeGen/AArch64/ldexp.ll b/llvm/test/CodeGen/AArch64/ldexp.ll
new file mode 100644
index 000000000000000..07ff574cd478cbc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ldexp.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck %s
+
+define double @testExp(double %val, i32 %a) { ; f64: expect sxtw of the exponent, then fscale on .d lanes
+; CHECK-LABEL: testExp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    fscale z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %call = tail call fast double @ldexp(double %val, i32 %a)
+  ret double %call
+}
+
+declare double @ldexp(double, i32) #1
+
+define float @testExpf(float %val, i32 %a) { ; f32: expect fscale on .s lanes with the i32 exponent moved in directly
+; CHECK-LABEL: testExpf:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fscale z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+entry:
+  %call = tail call fast float @ldexpf(float %val, i32 %a)
+  ret float %call
+}
+
+declare float @ldexpf(float, i32) #1
+
+define fp128 @testExpl(fp128 %val, i32 %a) { ; fp128: no fscale expected, the call stays a tail call to ldexpl
+; CHECK-LABEL: testExpl:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    b ldexpl
+entry:
+  %call = tail call fast fp128 @ldexpl(fp128 %val, i32 %a)
+  ret fp128 %call
+}
+
+declare fp128 @ldexpl(fp128, i32) #1
+
+define half @testExpf16(half %val, i32 %a) { ; f16 intrinsic: expect fcvt to f32, fscale on .s lanes, fcvt back to f16
+; CHECK-LABEL: testExpf16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fscale z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call fast half @llvm.ldexp.f16.i32(half %val, i32 %a)
+  ret half %0
+}
+
+declare half @llvm.ldexp.f16.i32(half, i32) #1
+
+attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(none) }
+