[llvm] [AArch64] Fix #94909: Optimize vector fmul(sitofp(x), 0.5) -> scvtf(x, 2) (PR #141480)

Fri May 30 11:54:12 PDT 2025

https://github.com/jph-13 updated https://github.com/llvm/llvm-project/pull/141480

>From 67a4484f25cbd425a05d5cd4b80db17ec601d381 Mon Sep 17 00:00:00 2001
From: JP Hafer <jhafer at mathworks.com>
Date: Fri, 30 May 2025 14:19:15 -0400
Subject: [PATCH] [AArch64] Fix #94909: Optimize vector fmul(sitofp(x), 0.5) ->
 scvtf(x, 2)

This commit reintroduces the optimization that previously lived in InstCombine and was removed due to limited applicability; it is now implemented in AArch64 instruction selection (`AArch64ISelDAGToDAG.cpp`).
See: #91924

This update targets `fmul(sitofp(x), C)` where `C` is a constant reciprocal of a power of two. For both scalar and vector inputs, if we have `sitofp(X) * C` (where `C` is `1/2^N`), this can be selected as a fixed-point `scvtf X, #N`, i.e. a conversion with `N` fractional bits (for example, multiplying by `0.0625 = 1/2^4` becomes `scvtf ..., #4`). This eliminates the floating-point multiply by directly converting the integer to a scaled floating-point value.

[AArch64] Fix #94909: Optimize vector fmul(sitofp(x), 0.5) -> scvtf(x, 2)

This commit reintroduces the optimization that previously lived in InstCombine and was removed due to limited applicability; it is now implemented in AArch64 instruction selection (`AArch64ISelDAGToDAG.cpp`).
See: #91924

This update targets `fmul(sitofp(x), C)` where `C` is a constant reciprocal of a power of two. For both scalar and vector inputs, if we have `sitofp(X) * C` (where `C` is `1/2^N`), this can be selected as a fixed-point `scvtf X, #N`, i.e. a conversion with `N` fractional bits (for example, multiplying by `0.0625 = 1/2^4` becomes `scvtf ..., #4`). This eliminates the floating-point multiply by directly converting the integer to a scaled floating-point value.

revert orig try
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 58 ++++++++++++
 .../AArch64/scvtf-div-mul-combine.ll          | 93 +++++++++++++++++++
 2 files changed, 151 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/scvtf-div-mul-combine.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 96fa85179d023..27f35f3120b49 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -3907,6 +3907,64 @@ static bool checkCVTFixedPointOperandWithFBits(SelectionDAG *CurDAG, SDValue N,
                                                unsigned RegWidth,
                                                bool isReciprocal) {
   APFloat FVal(0.0);
+
+  // Vector case: accept a BUILD_VECTOR only when it splats one FP constant.
+  if (N.getOpcode() == ISD::BUILD_VECTOR) {
+    EVT VT = N.getValueType();
+    EVT EltVT = VT.getVectorElementType();
+    unsigned NumElts = N.getNumOperands();
+    SDValue FirstOp = N.getOperand(0);
+    ConstantFPSDNode *FirstCN = dyn_cast<ConstantFPSDNode>(FirstOp);
+    if (!FirstCN)
+      return false;
+
+    APFloat FirstVal = FirstCN->getValueAPF();
+    if (EltVT == MVT::f16) {
+      bool ignored;
+      FirstVal.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+                       &ignored);
+    }
+
+    // Snapshot the element bits *before* any inversion: the remaining
+    // operands are compared below in their original, non-inverted form.
+    APInt FirstBits = FirstVal.bitcastToAPInt();
+
+    // For the reciprocal form the constant must have an exact inverse.
+    if (isReciprocal) {
+      if (!FirstVal.getExactInverse(&FirstVal))
+        return false;
+    }
+
+    bool IsExact;
+    APSInt IntVal(65, true);
+    FirstVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+
+    if (!IsExact || !IntVal.isPowerOf2())
+      return false;
+
+    unsigned FBits = IntVal.logBase2(); // log2 of the scale, emitted below.
+    if (FBits == 0 || FBits > RegWidth)
+      return false;
+
+    for (unsigned i = 1; i < NumElts; ++i) {
+      ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N.getOperand(i));
+      if (!CN)
+        return false;
+      APFloat ElemVal = CN->getValueAPF();
+      if (EltVT == MVT::f16) {
+        bool ignored;
+        ElemVal.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+                        &ignored);
+      }
+
+      if (ElemVal.bitcastToAPInt() != FirstBits)
+        return false;
+    }
+
+    FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
+    return true;
+  }
+
   if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
     FVal = CN->getValueAPF();
   else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
diff --git a/llvm/test/Transforms/InstCombine/AArch64/scvtf-div-mul-combine.ll b/llvm/test/Transforms/InstCombine/AArch64/scvtf-div-mul-combine.ll
new file mode 100644
index 0000000000000..27f1158f3a0b3
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/scvtf-div-mul-combine.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+fullfp16 -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s
+
+; Scalar f32: sitofp(i32) / 16.0 (16 = 2^4) is expected to select one fixed-point scvtf with #4.
+define float @tests_f32_div(i32 %in) {
+; CHECK-LABEL: tests_f32_div:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    scvtf   s0, w0, #4
+; CHECK-NEXT:    ret
+entry:
+  %vcvt.i = sitofp i32 %in to float
+  %div.i = fdiv float %vcvt.i, 16.0
+  ret float %div.i
+}
+
+; Scalar f32: sitofp(i32) * 2^-4 is expected to select one fixed-point scvtf with #4. NOTE(review): attribute group #0 is not defined in this file - confirm llc accepts it or drop it.
+define float @testsmul_f32_mul(i32 %in) local_unnamed_addr #0 {
+; CHECK-LABEL: testsmul_f32_mul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf   s0, w0, #4
+; CHECK-NEXT:    ret
+  %vcvt.i = sitofp i32 %in to float
+  %div.i = fmul float %vcvt.i, 6.250000e-02 ; 6.25e-02 = 0.0625 = 2^-4; %div.i holds a product despite its name
+  ret float %div.i
+}
+
+; Vector v2f32: sitofp(<2 x i32>) / splat 16.0 (16 = 2^4) is expected to select one scvtf.2s with #4.
+define <2 x float> @testv_v2f32_div(<2 x i32> %in) {
+; CHECK-LABEL: testv_v2f32_div:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    scvtf.2s        v0, v0, #4
+; CHECK-NEXT:    ret
+entry:
+  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
+  %div.i = fdiv <2 x float> %vcvt.i, <float 16.0, float 16.0>
+  ret <2 x float> %div.i
+}
+
+; Vector v2f32: sitofp(<2 x i32>) * splat 2^-4 is expected to select one scvtf.2s with #4. NOTE(review): attribute group #0 is not defined in this file - confirm llc accepts it or drop it.
+define <2 x float> @testvmul_v2f32_mul(<2 x i32> %in) local_unnamed_addr #0 {
+; CHECK-LABEL: testvmul_v2f32_mul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf.2s        v0, v0, #4
+; CHECK-NEXT:    ret
+  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
+  %div.i = fmul <2 x float> %vcvt.i, splat (float 6.250000e-02) ; 6.25e-02 = 0.0625 = 2^-4; %div.i holds a product despite its name
+  ret <2 x float> %div.i
+}
+
+; Scalar f64: sitofp(i64) / 16.0 (16 = 2^4) is expected to select one fixed-point scvtf with #4.
+define double @tests_f64_div(i64 %in) {
+; CHECK-LABEL: tests_f64_div:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    scvtf   d0, x0, #4
+; CHECK-NEXT:    ret
+entry:
+  %vcvt.i = sitofp i64 %in to double
+  %div.i = fdiv double %vcvt.i, 1.600000e+01 ; 1.6e+01 = 16.0 = 2^4, as a double constant
+  ret double %div.i
+}
+
+; Scalar f64: sitofp(i64) * 2^-4 is expected to select one fixed-point scvtf with #4. NOTE(review): attribute group #0 is not defined in this file - confirm llc accepts it or drop it.
+define double @testsmul_f64_mul(i64 %in) local_unnamed_addr #0 {
+; CHECK-LABEL: testsmul_f64_mul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf   d0, x0, #4
+; CHECK-NEXT:    ret
+  %vcvt.i = sitofp i64 %in to double
+  %div.i = fmul double %vcvt.i, 6.250000e-02 ; 6.25e-02 = 0.0625 = 2^-4, as a double constant; %div.i holds a product despite its name
+  ret double %div.i
+}
+
+; Vector v2f64: sitofp(<2 x i64>) / splat 16.0 (16 = 2^4) is expected to select one scvtf.2d with #4.
+define <2 x double> @testv_v2f64_div(<2 x i64> %in) {
+; CHECK-LABEL: testv_v2f64_div:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    scvtf.2d        v0, v0, #4
+; CHECK-NEXT:    ret
+entry:
+  %vcvt.i = sitofp <2 x i64> %in to <2 x double>
+  %div.i = fdiv <2 x double> %vcvt.i, <double 1.600000e+01, double 1.600000e+01>
+  ret <2 x double> %div.i
+}
+
+; Vector v2f64: sitofp(<2 x i64>) * splat 2^-4 is expected to select one scvtf.2d with #4. NOTE(review): attribute group #0 is not defined in this file - confirm llc accepts it or drop it.
+define <2 x double> @testvmul_v2f64_mul(<2 x i64> %in) local_unnamed_addr #0 {
+; CHECK-LABEL: testvmul_v2f64_mul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf.2d        v0, v0, #4
+; CHECK-NEXT:    ret
+  %vcvt.i = sitofp <2 x i64> %in to <2 x double>
+  %div.i = fmul <2 x double> %vcvt.i, splat (double 6.250000e-02) ; 6.25e-02 = 0.0625 = 2^-4, as a double constant; %div.i holds a product despite its name
+  ret <2 x double> %div.i
+}
\ No newline at end of file