[PATCH] D120150: Constant folding of llvm.amdgcn.trig.preop

Fri Feb 18 11:12:26 PST 2022

Ravi created this revision.
Ravi added reviewers: b-sumner, arsenm, sameerds, cdevadas.
Herald added subscribers: foad, kerbowa, hiraditya, tpr, nhaehnle, jvesely.
Ravi requested review of this revision.
Herald added subscribers: llvm-commits, wdng.
Herald added a project: LLVM.

If both parameters (the input and the segment select) passed to the amdgcn.trig.preop intrinsic are compile-time constants, this patch precomputes the result of amdgcn.trig.preop on the CPU and replaces the intrinsic's uses with the computed constant.

All existing AMDGPU lit tests pass, as do the negative cases in which the parameters to this intrinsic are variable.
Added a simple test case whose expected output exactly matches the output from the GPU.

Created a small HIP test application that runs the exact compute logic (including the constants used for 2/pi) on the CPU and invokes the intrinsic in a GPU kernel.
Ran the test over the entire range of double-precision floating-point values. The outputs from the CPU match those from the intrinsic on a gfx10 AMD GPU.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D120150

Files:
  llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll


Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,5 +1,7 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx900 -instcombine < %s | FileCheck -check-prefix=GCN %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx1010 -instcombine < %s | FileCheck -check-prefix=GCN %s
 
 declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
 
@@ -28,3 +30,11 @@
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }
+
+define protected amdgpu_kernel void @trig_preop_constfold(double addrspace(1)* nocapture %0, double addrspace(1)* nocapture readnone %1, i32 %2){
+; GCN: store double 0x2F42371D2126E970, double addrspace(1)* %0, align 8
+; GCN-NEXT: ret void
+  %4 = tail call contract double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5)
+  store double %4, double addrspace(1)* %0, align 8
+  ret void
+}
Index: llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -289,6 +289,8 @@
       });
 }
 
+static inline long as_long(double d) { union { double d; long u; } v; v.d = d; return v.u; } // Returns the bits of d reinterpreted as a long by writing one union member and reading the other. NOTE(review): assumes long is as wide as double (64 bits); the trig.preop fold shifts the result right by 52 -- confirm behaviour on hosts where long is 32 bits.
+
 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                            InstCombiner &IC) const {
   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
@@ -968,6 +970,52 @@
 
     break;
   }
+  case Intrinsic::amdgcn_trig_preop: {
+
+    const ulong TwoByPi[] = {
+    0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0,
+    0xdb629599, 0x3c439041, 0xfe5163ab, 0xdebbc561,
+    0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
+    0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484,
+    0xe99c7026, 0xb45f7e41, 0x3991d639, 0x835339f4,
+    0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
+    0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7,
+    0x4f463f66, 0x9e5fea2d, 0x7527bac7, 0xebe5f17b,
+    0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
+    0x56033046
+    };
+
+    Value *Src = II.getArgOperand(0);
+    Value *Segment = II.getArgOperand(1);
+
+    if (const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src)) {
+      if (const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment)) {
+
+      const APFloat &ArgVal = Csrc->getValueAPF();
+      double Dsrc = ArgVal.convertToDouble();
+      const APInt &SegVal = Cseg->getUniqueInteger();
+      int Iseg = SegVal.getSExtValue();
+
+      const int Eclamp = 1077;
+      int E = (as_long(Dsrc) >> 52) & 0x7ff;
+      int Shift = (E > Eclamp ?  E - Eclamp : 0) + 53 * Iseg;
+      int I = Shift >> 5;
+      int Bshift = Shift & 0x1f;
+      ulong Thi = (TwoByPi[I] << 32) | TwoByPi[I+1];
+      ulong Tlo = TwoByPi[I+2] << 32;
+      if (Bshift > 0)
+        Thi = (Thi << Bshift) | (Tlo >> (64-Bshift));
+      Thi >>= 11;
+      double Res = (double)Thi;
+      int Scale = -53 - Shift;
+      if (E >= 0x7b0)
+        Scale += 128;
+      Res = ldexp(Res, Scale);
+      return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Res));
+      }
+    }
+    break;
+  }
   case Intrinsic::amdgcn_fmul_legacy: {
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D120150.409976.patch
Type: text/x-patch
Size: 3715 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220218/97506b77/attachment.bin>