[llvm] [AMDGPU] Constant folding of llvm.amdgcn.trig.preop (PR #98562)

Mon Jul 15 14:51:50 PDT 2024

https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/98562

>From 9ac00e88e6cc0326f6ca6ce5d9606573106f6772 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Thu, 11 Jul 2024 15:02:28 -0700
Subject: [PATCH 1/3] [AMDGPU] Constant folding of llvm.amdgcn.trig.preop

  If the parameters(the input and segment select) coming in to
amdgcn.trig.preop intrinsic are compile time constants, we
pre-compute the output of amdgcn.trig.preop on the CPU and
replaces the uses with the computed constant.

  This work extends the patch https://reviews.llvm.org/D120150 to
make it a complete coverage.

  For the segment select, only src1[4:0] are used. A segment slect is
invalid if we are selecting the 53-bit segment beyond the [1200:0] range
of the 2/PI table. 0 is returned when a segment select is not valid.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |  91 ++++++++++++
 .../InstCombine/AMDGPU/amdgcn-intrinsics.ll   | 134 ++++++++++--------
 2 files changed, 166 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 93bca4402ed23..f736f5cdd33d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1102,6 +1102,97 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
     break;
   }
+  case Intrinsic::amdgcn_trig_preop: {
+    // The intrinsic is declared with name mangling, but currently the
+    // instruction only exists for f64
+    if (!II.getType()->isDoubleTy())
+      break;
+
+    Value *Src = II.getArgOperand(0);
+    Value *Segment = II.getArgOperand(1);
+    if (isa<PoisonValue>(Src))
+      return IC.replaceInstUsesWith(II, Src);
+
+    if (isa<UndefValue>(Src)) {
+      auto *QNaN = ConstantFP::get(
+          II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
+      return IC.replaceInstUsesWith(II, QNaN);
+    }
+
+    const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
+    if (!Csrc)
+      break;
+
+    if (II.isStrictFP())
+      break;
+
+    const APFloat &Fsrc = Csrc->getValueAPF();
+    if (Fsrc.isNaN()) {
+      // FIXME: We just need to make the nan quiet here, but that's unavailable
+      // on APFloat, only IEEEfloat
+      auto *Quieted = ConstantFP::get(
+          II.getType(), scalbn(Fsrc, 0, APFloat::rmNearestTiesToEven));
+      return IC.replaceInstUsesWith(II, Quieted);
+    }
+
+    const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
+    if (!Cseg)
+      break;
+
+    // 2.0/PI table.
+    static const uint32_t TwoByPi[] = {
+        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
+        0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
+        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
+        0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
+        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
+        0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
+        0x56033046};
+
+    const APInt &SegVal = Cseg->getValue();
+    bool Ovflow = false;
+    unsigned Numbits = 32;
+    bool Signed = true;
+    APInt EClamp(Numbits, 1077, Signed);
+    APInt E = Fsrc.bitcastToAPInt().ashr(52);
+    E &= 0x7ff;
+    E = E.trunc(Numbits);
+    APInt Shift =
+        (E.sgt(EClamp) ? E.ssub_ov(EClamp, Ovflow) : APInt(Numbits, 0, Signed))
+            .sadd_ov(APInt(Numbits, 53, Signed).smul_ov(SegVal & 0x1f, Ovflow),
+                     Ovflow);
+    uint32_t Idx = Shift.ashr(5).getZExtValue();
+
+    // Return 0 for invalid segment select (outbound).
+    if (static_cast<size_t>(Idx) + 2 >= std::size(TwoByPi)) {
+      APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
+      return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Zero));
+    }
+
+    APInt Bshift = Shift & 0x1f;
+    Numbits = 64;
+    Signed = false;
+    uint64_t Hi = ((uint64_t)TwoByPi[Idx] << 32) | (uint64_t)TwoByPi[Idx + 1];
+    APInt Thi = APInt(Numbits, Hi, Signed);
+    APInt Tlo = APInt(Numbits, (uint64_t)TwoByPi[Idx + 2] << 32, Signed);
+
+    if (Bshift.sgt(0)) {
+      Numbits = 32;
+      Signed = true;
+      Thi = Thi.shl(Bshift) |
+            Tlo.lshr(APInt(Numbits, 64, Signed).ssub_ov(Bshift, Ovflow));
+    }
+
+    Thi = Thi.lshr(11);
+    APFloat Res = APFloat(Thi.roundToDouble());
+    int32_t Scale = -53 - Shift.getSExtValue();
+
+    if (E.sge(0x7b0))
+      Scale += 128;
+
+    Res = scalbn(Res, Scale, RoundingMode::NearestTiesToEven);
+    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Res));
+  }
   case Intrinsic::amdgcn_fmul_legacy: {
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 59118a172a2bc..4da5bde7abe30 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -5608,8 +5608,7 @@ declare float @llvm.amdgcn.trig.preop.f32(float, i32)
 
 define double @trig_preop_constfold_variable_undef_arg(i32 %arg) {
 ; CHECK-LABEL: @trig_preop_constfold_variable_undef_arg(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0x7FF8000000000000
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 %arg)
   ret double %val
@@ -5617,8 +5616,7 @@ define double @trig_preop_constfold_variable_undef_arg(i32 %arg) {
 
 define double @trig_preop_constfold_variable_poison_arg(i32 %arg) {
 ; CHECK-LABEL: @trig_preop_constfold_variable_poison_arg(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double poison
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 %arg)
   ret double %val
@@ -5653,8 +5651,7 @@ define double @trig_preop_constfold_variable_int(i32 %arg) {
 
 define double @trig_preop_qnan(i32 %arg) {
 ; CHECK-LABEL: @trig_preop_qnan(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0x7FF8000000000000
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 %arg)
   ret double %val
@@ -5662,8 +5659,7 @@ define double @trig_preop_qnan(i32 %arg) {
 
 define double @trig_preop_snan(i32 %arg) {
 ; CHECK-LABEL: @trig_preop_snan(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0x7FF8000000000001
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 %arg)
   ret double %val
@@ -5671,8 +5667,7 @@ define double @trig_preop_snan(i32 %arg) {
 
 define double @trig_preop_inf_0() {
 ; CHECK-LABEL: @trig_preop_inf_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000000, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0xB43DD63F5F2F8BD
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000000, i32 0)
   ret double %val
@@ -5680,8 +5675,7 @@ define double @trig_preop_inf_0() {
 
 define double @trig_preop_ninf_0() {
 ; CHECK-LABEL: @trig_preop_ninf_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFF0000000000000, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0xB43DD63F5F2F8BD
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0xFFF0000000000000, i32 0)
   ret double %val
@@ -5707,10 +5701,36 @@ define double @trig_preop_variable_args(double %arg0, i32 %arg1) {
 
 define double @trig_preop_constfold() {
 ; CHECK-LABEL: @trig_preop_constfold(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5)
-; CHECK-NEXT:    ret double [[VAL]]
+; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 2)
+  ret double %val
+}
+
+; src1[4:0] <= 21 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 22)
+  ret double %val
+}
+
+; Only use src1[4:0], so segment is actually 31 for -1.
+define double @trig_preop_constfold_neg1_segment() {
+; CHECK-LABEL: @trig_preop_constfold_neg1_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 -1)
+  ret double %val
+}
+
+; Only use src1[4:0], so segment is actually 0 for -32.
+define double @trig_preop_constfold_neg32_segment() {
+; CHECK-LABEL: @trig_preop_constfold_neg32_segment(
+; CHECK-NEXT:    ret double 0x3FE45F306DC9C882
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 -32)
   ret double %val
 }
 
@@ -5723,84 +5743,80 @@ define double @trig_preop_constfold_strictfp() strictfp {
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__0() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent0__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0__segment0(
+; CHECK-NEXT:    ret double 0x3FE45F306DC9C882
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__1() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__1(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 1)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent0__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0__segment2(
+; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 1)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 2)
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__neg1() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__neg1(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -1)
-; CHECK-NEXT:    ret double [[VAL]]
+; src1[4:0] <= 21 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent0__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 -1)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 22)
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__9999999() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__9999999(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 9999999)
-; CHECK-NEXT:    ret double [[VAL]]
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607__segment0(
+; CHECK-NEXT:    ret double 0x1EC8135A2FBF209C
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 9999999)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_0.0__neg999999() {
-; CHECK-LABEL: @trig_preop_constfold_0.0__neg999999(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -999999)
-; CHECK-NEXT:    ret double [[VAL]]
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607__segment2(
+; CHECK-NEXT:    ret double 0x181272117E2EF7E4
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 -999999)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 2)
   ret double %val
 }
 
-define double @trig_preop_constfold_0x0020000000000000_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x0020000000000000_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x10000000000000, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+; src1[4:0] <= 11 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1607__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x0010000000000000, i32 0)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 12)
   ret double %val
 }
 
-define double @trig_preop_constfold_0x001fffffffffffff_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x001fffffffffffff_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent1968__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968__segment0(
+; CHECK-NEXT:    ret double 0x10374F463F669E5F
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000fffffffffffff, i32 0)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_0x8020000000000000_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x8020000000000000_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x8020000000000000, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+define double @trig_preop_constfold_exponent1968__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968__segment2(
+; CHECK-NEXT:    ret double 0x98F2F8BD9E839CE
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x8020000000000000, i32 0)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 2)
   ret double %val
 }
 
-define double @trig_preop_constfold_0x801fffffffffffff_0() {
-; CHECK-LABEL: @trig_preop_constfold_0x801fffffffffffff_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x801FFFFFFFFFFFFF, i32 0)
-; CHECK-NEXT:    ret double [[VAL]]
+; src1[4:0] <= 4 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1968__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x801fffffffffffff, i32 0)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 5)
   ret double %val
 }
 

>From 996f358db098843cb2e8aade67f6720d1490f510 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Fri, 12 Jul 2024 12:39:33 -0700
Subject: [PATCH 2/3] [AMDGPU] Constant folding of llvm.amdgcn.trig.preop

  If the parameters(the input and segment select) coming in to
amdgcn.trig.preop intrinsic are compile time constants, we pre-compute
the output of amdgcn.trig.preop on the CPU and replaces the uses with
the computed constant.
  This work extends the patch https://reviews.llvm.org/D120150 to
make it a complete coverage.
  For the segment select, only src1[4:0] are used. A segment select is
invalid if we are selecting the 53-bit segment beyond the [1200:0] range
of the 2/PI table. 0 is returned when a segment select is not valid.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     | 57 +++++++------------
 .../InstCombine/AMDGPU/amdgcn-intrinsics.ll   | 25 ++++++++
 2 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index f736f5cdd33d4..fb0e1387545fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1139,6 +1139,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (!Cseg)
       break;
 
+    APInt Exponent = Fsrc.bitcastToAPInt().ashr(52);
+    Exponent &= 0x7ff;
+    Exponent = Exponent.trunc(32);
+
+    unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
+    unsigned Shift = SegmentVal * 53;
+    if (Exponent.sgt(1077))
+      Shift += Exponent.getZExtValue() - 1077;
+
     // 2.0/PI table.
     static const uint32_t TwoByPi[] = {
         0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
@@ -1149,49 +1158,27 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
         0x56033046};
 
-    const APInt &SegVal = Cseg->getValue();
-    bool Ovflow = false;
-    unsigned Numbits = 32;
-    bool Signed = true;
-    APInt EClamp(Numbits, 1077, Signed);
-    APInt E = Fsrc.bitcastToAPInt().ashr(52);
-    E &= 0x7ff;
-    E = E.trunc(Numbits);
-    APInt Shift =
-        (E.sgt(EClamp) ? E.ssub_ov(EClamp, Ovflow) : APInt(Numbits, 0, Signed))
-            .sadd_ov(APInt(Numbits, 53, Signed).smul_ov(SegVal & 0x1f, Ovflow),
-                     Ovflow);
-    uint32_t Idx = Shift.ashr(5).getZExtValue();
-
-    // Return 0 for invalid segment select (outbound).
-    if (static_cast<size_t>(Idx) + 2 >= std::size(TwoByPi)) {
+    // Return 0 for outbound segment (hardware behavior).
+    unsigned Idx = Shift >> 5;
+    if (Idx + 2 >= std::size(TwoByPi)) {
       APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
       return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Zero));
     }
 
-    APInt Bshift = Shift & 0x1f;
-    Numbits = 64;
-    Signed = false;
-    uint64_t Hi = ((uint64_t)TwoByPi[Idx] << 32) | (uint64_t)TwoByPi[Idx + 1];
-    APInt Thi = APInt(Numbits, Hi, Signed);
-    APInt Tlo = APInt(Numbits, (uint64_t)TwoByPi[Idx + 2] << 32, Signed);
-
-    if (Bshift.sgt(0)) {
-      Numbits = 32;
-      Signed = true;
-      Thi = Thi.shl(Bshift) |
-            Tlo.lshr(APInt(Numbits, 64, Signed).ssub_ov(Bshift, Ovflow));
-    }
-
+    unsigned BShift = Shift & 0x1f;
+    APInt Thi = APInt(64, Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]), false);
+    APInt Tlo = APInt(64, Make_64(TwoByPi[Idx + 2], 0), false);
+    if (BShift)
+      Thi = Thi.shl(BShift) | Tlo.lshr(64 - BShift);
     Thi = Thi.lshr(11);
-    APFloat Res = APFloat(Thi.roundToDouble());
-    int32_t Scale = -53 - Shift.getSExtValue();
+    APFloat Result = APFloat(Thi.roundToDouble());
 
-    if (E.sge(0x7b0))
+    int Scale = -53 - Shift;
+    if (Exponent.sge(1968))
       Scale += 128;
 
-    Res = scalbn(Res, Scale, RoundingMode::NearestTiesToEven);
-    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Res));
+    Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
+    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
   }
   case Intrinsic::amdgcn_fmul_legacy: {
     Value *Op0 = II.getArgOperand(0);
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 4da5bde7abe30..6bf6e46332542 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -5751,6 +5751,14 @@ define double @trig_preop_constfold_exponent0__segment0() {
   ret double %val
 }
 
+define double @trig_preop_constfold_exponent0__segment1() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0__segment1(
+; CHECK-NEXT:    ret double 0x3C94A7F09D5F47D4
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 1)
+  ret double %val
+}
+
 define double @trig_preop_constfold_exponent0__segment2() {
 ; CHECK-LABEL: @trig_preop_constfold_exponent0__segment2(
 ; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
@@ -5777,6 +5785,15 @@ define double @trig_preop_constfold_exponent1607__segment0() {
   ret double %val
 }
 
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607__segment1() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607__segment1(
+; CHECK-NEXT:    ret double 0x1B791D639835339F
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 1)
+  ret double %val
+}
+
 ; 1607 = 1077 + 10 * 53
 define double @trig_preop_constfold_exponent1607__segment2() {
 ; CHECK-LABEL: @trig_preop_constfold_exponent1607__segment2(
@@ -5803,6 +5820,14 @@ define double @trig_preop_constfold_exponent1968__segment0() {
   ret double %val
 }
 
+define double @trig_preop_constfold_exponent1968__segment1() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968__segment1(
+; CHECK-NEXT:    ret double 0xCED45AEA4F758FD
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 1)
+  ret double %val
+}
+
 define double @trig_preop_constfold_exponent1968__segment2() {
 ; CHECK-LABEL: @trig_preop_constfold_exponent1968__segment2(
 ; CHECK-NEXT:    ret double 0x98F2F8BD9E839CE

>From 434e613baf5b7522303530e8d5d07c225413ab54 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Mon, 15 Jul 2024 14:49:30 -0700
Subject: [PATCH 3/3] [AMDGPU] Constant folding of llvm.amdgcn.trig.preop

  If the parameters(the input and segment select) coming in to
amdgcn.trig.preop intrinsic are compile time constants, we pre-compute
the output of amdgcn.trig.preop on the CPU and replaces the uses with
the computed constant.
  This work extends the patch https://reviews.llvm.org/D120150 to
make it a complete coverage.
  For the segment select, only src1[4:0] are used. A segment select is
invalid if we are selecting the 53-bit segment beyond the [1200:0] range
of the 2/PI table. 0 is returned when a segment select is not valid.
---
 .../InstCombine/AMDGPU/amdgcn-intrinsics.ll   | 189 +++++++++++++++---
 1 file changed, 159 insertions(+), 30 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 6bf6e46332542..c3a7e684c772e 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -5743,42 +5743,84 @@ define double @trig_preop_constfold_strictfp() strictfp {
   ret double %val
 }
 
-define double @trig_preop_constfold_exponent0__segment0() {
-; CHECK-LABEL: @trig_preop_constfold_exponent0__segment0(
+define double @trig_preop_constfold_exponent0_mantissa0__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa0__segment0(
 ; CHECK-NEXT:    ret double 0x3FE45F306DC9C882
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_exponent0__segment1() {
-; CHECK-LABEL: @trig_preop_constfold_exponent0__segment1(
-; CHECK-NEXT:    ret double 0x3C94A7F09D5F47D4
+define double @trig_preop_constfold_exponent0_mantissa1__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa1__segment0(
+; CHECK-NEXT:    ret double 0x3FE45F306DC9C882
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 0)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent0_mantissaX__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissaX__segment0(
+; CHECK-NEXT:    ret double 0x3FE45F306DC9C882
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 1)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x4A7F09D5F47D4, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_exponent0__segment2() {
-; CHECK-LABEL: @trig_preop_constfold_exponent0__segment2(
+define double @trig_preop_constfold_exponent0_mantissa0__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa0__segment2(
 ; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 2)
   ret double %val
 }
 
+define double @trig_preop_constfold_exponent0_mantissa1__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa1__segment2(
+; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 2)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent0_mantissaX__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissaX__segment2(
+; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x394A6EE06DB14A, i32 2)
+  ret double %val
+}
+
 ; src1[4:0] <= 21 for segment to be inbound with this exponent of src0.
-define double @trig_preop_constfold_exponent0__outbound_segment() {
-; CHECK-LABEL: @trig_preop_constfold_exponent0__outbound_segment(
+define double @trig_preop_constfold_exponent0_mantissa0__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa0__outbound_segment(
 ; CHECK-NEXT:    ret double 0.000000e+00
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 22)
   ret double %val
 }
 
+; src1[4:0] <= 21 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent0_mantissa1__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa1__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 22)
+  ret double %val
+}
+
+; src1[4:0] <= 21 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent0_mantissaX__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissaX__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0xA6EE06DB14ACC, i32 22)
+  ret double %val
+}
+
 ; 1607 = 1077 + 10 * 53
-define double @trig_preop_constfold_exponent1607__segment0() {
-; CHECK-LABEL: @trig_preop_constfold_exponent1607__segment0(
+define double @trig_preop_constfold_exponent1607_mantissa0__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa0__segment0(
 ; CHECK-NEXT:    ret double 0x1EC8135A2FBF209C
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 0)
@@ -5786,65 +5828,152 @@ define double @trig_preop_constfold_exponent1607__segment0() {
 }
 
 ; 1607 = 1077 + 10 * 53
-define double @trig_preop_constfold_exponent1607__segment1() {
-; CHECK-LABEL: @trig_preop_constfold_exponent1607__segment1(
-; CHECK-NEXT:    ret double 0x1B791D639835339F
+define double @trig_preop_constfold_exponent1607_mantissa1__segment1() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa1__segment1(
+; CHECK-NEXT:    ret double 0x1EC8135A2FBF209C
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 1)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 0)
   ret double %val
 }
 
 ; 1607 = 1077 + 10 * 53
-define double @trig_preop_constfold_exponent1607__segment2() {
-; CHECK-LABEL: @trig_preop_constfold_exponent1607__segment2(
+define double @trig_preop_constfold_exponent1607_mantissaX__segment1() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissaX__segment1(
+; CHECK-NEXT:    ret double 0x1EC8135A2FBF209C
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6471B791D6398353, i32 0)
+  ret double %val
+}
+
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607_mantissa0__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa0__segment2(
 ; CHECK-NEXT:    ret double 0x181272117E2EF7E4
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 2)
   ret double %val
 }
 
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607_mantissa1__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa1__segment2(
+; CHECK-NEXT:    ret double 0x181272117E2EF7E4
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 2)
+  ret double %val
+}
+
+; 1607 = 1077 + 10 * 53
+define double @trig_preop_constfold_exponent1607_mantissaX__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissaX__segment2(
+; CHECK-NEXT:    ret double 0x181272117E2EF7E4
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647272117E2EF7E4, i32 2)
+  ret double %val
+}
+
 ; src1[4:0] <= 11 for segment to be inbound with this exponent of src0.
-define double @trig_preop_constfold_exponent1607__outbound_segment() {
-; CHECK-LABEL: @trig_preop_constfold_exponent1607__outbound_segment(
+define double @trig_preop_constfold_exponent1607_mantissa0__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa0__outbound_segment(
 ; CHECK-NEXT:    ret double 0.000000e+00
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 12)
   ret double %val
 }
 
-define double @trig_preop_constfold_exponent1968__segment0() {
-; CHECK-LABEL: @trig_preop_constfold_exponent1968__segment0(
+; src1[4:0] <= 11 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1607_mantissa1__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa1__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 12)
+  ret double %val
+}
+
+; src1[4:0] <= 11 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1607_mantissaX__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissaX__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647181272117E2EF, i32 12)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent1968_mantissa0__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa0__segment0(
 ; CHECK-NEXT:    ret double 0x10374F463F669E5F
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_exponent1968__segment1() {
-; CHECK-LABEL: @trig_preop_constfold_exponent1968__segment1(
-; CHECK-NEXT:    ret double 0xCED45AEA4F758FD
+define double @trig_preop_constfold_exponent1968_mantissa1__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa1__segment0(
+; CHECK-NEXT:    ret double 0x10374F463F669E5F
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 0)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent1968_mantissax__segment0() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissax__segment0(
+; CHECK-NEXT:    ret double 0x10374F463F669E5F
 ;
-  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 1)
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B074F463F669E5F, i32 0)
   ret double %val
 }
 
-define double @trig_preop_constfold_exponent1968__segment2() {
-; CHECK-LABEL: @trig_preop_constfold_exponent1968__segment2(
+define double @trig_preop_constfold_exponent1968_mantissa0__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa0__segment2(
 ; CHECK-NEXT:    ret double 0x98F2F8BD9E839CE
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 2)
   ret double %val
 }
 
+define double @trig_preop_constfold_exponent1968_mantissa1__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa1__segment2(
+; CHECK-NEXT:    ret double 0x98F2F8BD9E839CE
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 2)
+  ret double %val
+}
+
+define double @trig_preop_constfold_exponent1968_mantissaX__segment2() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissaX__segment2(
+; CHECK-NEXT:    ret double 0x394A6EE06DB14ACC
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B02F8BD9E839CE, i32 2)
+  ret double %val
+}
+
 ; src1[4:0] <= 4 for segment to be inbound with this exponent of src0.
-define double @trig_preop_constfold_exponent1968__outbound_segment() {
-; CHECK-LABEL: @trig_preop_constfold_exponent1968__outbound_segment(
+define double @trig_preop_constfold_exponent1968_mantissa0__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa0__outbound_segment(
 ; CHECK-NEXT:    ret double 0.000000e+00
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 5)
   ret double %val
 }
 
+; src1[4:0] <= 4 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1968_mantissa1__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa1__outbound_segment(
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 5)
+  ret double %val
+}
+
+; src1[4:0] <= 4 for segment to be inbound with this exponent of src0.
+define double @trig_preop_constfold_exponent1968_mantissaX__outbound_segment() {
+; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissaX__outbound_segment(
+; CHECK-NEXT:    ret double 0x2F42371D2126E970
+;
+  %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B098F2F8BD9E83, i32 5)
+  ret double %val
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.log
 ; --------------------------------------------------------------------