[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

Christian Sigg via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Sun May 22 01:00:58 PDT 2022


csigg created this revision.
csigg added a reviewer: bkramer.
Herald added subscribers: bzcheeseman, mattd, gchakrabarti, awarzynski, sdasgup3, asavonic, wenzhicui, wrengr, Chia-hungDuan, dcaballe, cota, teijeong, rdzhabarov, tatianashp, msifontes, jurahul, Kayjukh, grosul1, Joonsoo, liufengdb, aartbik, mgester, arpith-jacob, antiagainst, shauheen, rriddle, mehdi_amini, sanjoy.google, hiraditya, jholewinski.
Herald added a reviewer: ftynse.
Herald added a reviewer: bondhugula.
Herald added a reviewer: ThomasRaoux.
Herald added a project: All.
csigg requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, stephenneuendorffer, nicolasvasilache, jdoerfert.
Herald added a reviewer: herhut.
Herald added projects: clang, MLIR, LLVM.

This is correct for all values, i.e. it produces the same results as promoting the division to fp32 in the NVPTX backend. But it is faster (~10% on average, sometimes more) because:

- it performs fewer Newton iterations
- it avoids the slow path taken e.g. for denormals
- it allows reuse of the reciprocal for multiple divisions by the same divisor (see the sketch below)
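
To illustrate the last point, here is a minimal sketch (a hypothetical helper, not part of this patch, using the same cuda_fp16.h helpers as the test program below): the approximate reciprocal of the shared divisor is computed once, and each division then reduces to a multiply plus the conditional refinement used in hdiv_newton.

  // Hypothetical sketch (not part of the patch): reuse of the reciprocal when
  // dividing many values by the same divisor b.
  __device__ void hdiv_batch(const half* a, half b, half* out, int n) {
    float fb = __half2float(b);

    float rcp;
    // The approximate reciprocal of the shared divisor is computed only once.
    asm("{rcp.approx.ftz.f32 %0, %1;\n}" : "=f"(rcp) : "f"(fb));

    for (int i = 0; i < n; ++i) {
      float fa = __half2float(a[i]);
      float result = fa * rcp;
      // Same conditional Newton-Raphson refinement as in hdiv_newton below.
      auto exponent = reinterpret_cast<const unsigned&>(result) & 0x7f800000;
      if (exponent != 0 && exponent != 0x7f800000) {
        float err = __fmaf_rn(-fb, result, fa);
        result = __fmaf_rn(rcp, err, result);
      }
      out[i] = __float2half(result);
    }
  }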

Test program:

  #include <stdio.h>
  #include "cuda_fp16.h"
  
  // This is a variant of CUDA's own __hdiv which is faster than hdiv_promote below
  // and doesn't suffer from the perf cliff of div.rn.f32 with 'special' values.
  __device__ half hdiv_newton(half a, half b) {
    float fa = __half2float(a);
    float fb = __half2float(b);
  
    float rcp;
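    // rcp.approx.ftz.f32: fast approximate reciprocal, flushing denormals to zero.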
    asm("{rcp.approx.ftz.f32 %0, %1;\n}" : "=f"(rcp) : "f"(fb));
  
    float result = fa * rcp;
    auto exponent = reinterpret_cast<const unsigned&>(result) & 0x7f800000;
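    // One Newton-Raphson refinement: err = fa - fb * result, result += rcp * err.
    // Skipped when the initial result is zero/denormal or inf/NaN (exponent bits
    // all zero or all one).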
    if (exponent != 0 && exponent != 0x7f800000) {
      float err = __fmaf_rn(-fb, result, fa);
      result = __fmaf_rn(rcp, err, result);
    }
  
    return __float2half(result);
  }
  
  // Surprisingly, this is faster than CUDA's own __hdiv.
  __device__ half hdiv_promote(half a, half b) {
    return __float2half(__half2float(a) / __half2float(b));
  }
  
  // This is an approximation that is accurate up to 1 ulp.
  __device__ half hdiv_approx(half a, half b) {
    float fa = __half2float(a);
    float fb = __half2float(b);
  
    float result;
    asm("{div.approx.ftz.f32 %0, %1, %2;\n}" : "=f"(result) : "f"(fa), "f"(fb));
    return __float2half(result);
  }
  
  __global__ void CheckCorrectness() {
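    // With the 256x256 launch below, each thread takes one 16-bit pattern for x
    // and checks all 65536 patterns for y, comparing hdiv_newton bit-exactly
    // against hdiv_promote.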
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    half x = reinterpret_cast<const half&>(i);
    for (int j = 0; j < 65536; ++j) {
      half y = reinterpret_cast<const half&>(j);
      half d1 = hdiv_newton(x, y);
      half d2 = hdiv_promote(x, y);
      auto s1 = reinterpret_cast<const short&>(d1);
      auto s2 = reinterpret_cast<const short&>(d2);
      if (s1 != s2) {
        printf("%f (%u) / %f (%u), got %f (%hu), expected: %f (%hu)\n",
               __half2float(x), i, __half2float(y), j, __half2float(d1), s1,
               __half2float(d2), s2);
        //__trap();
      }
    }
  }
  
  __device__ half dst;
  
  __global__ void ProfileBuiltin(half x) {
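    // '#pragma unroll 1' keeps the loop rolled so the measured time reflects the
    // per-division cost rather than unrolled instruction scheduling.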
    #pragma unroll 1
    for (int i = 0; i < 10000000; ++i) {
      x = x / x;
    }
    dst = x;
  }
  
  __global__ void ProfilePromote(half x) {
    #pragma unroll 1
    for (int i = 0; i < 10000000; ++i) {
      x = hdiv_promote(x, x);
    }
    dst = x;
  }
  
  __global__ void ProfileNewton(half x) {
    #pragma unroll 1
    for (int i = 0; i < 10000000; ++i) {
      x = hdiv_newton(x, x);
    }
    dst = x;
  }
  
  __global__ void ProfileApprox(half x) {
    #pragma unroll 1
    for (int i = 0; i < 10000000; ++i) {
      x = hdiv_approx(x, x);
    }
    dst = x;
  }
  
  int main() {
    CheckCorrectness<<<256, 256>>>();
    half one = __float2half(1.0f);
    ProfileBuiltin<<<1, 1>>>(one);  // 1.001s
    ProfilePromote<<<1, 1>>>(one);  // 0.560s
    ProfileNewton<<<1, 1>>>(one);   // 0.508s
    ProfileApprox<<<1, 1>>>(one);   // 0.304s
    auto status = cudaDeviceSynchronize();
    printf("%s\n", cudaGetErrorString(status));
  }


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D126158

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
  mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir

-------------- next part --------------
A non-text attachment was scrubbed...
Name: D126158.431218.patch
Type: text/x-patch
Size: 25168 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220522/7d2535e7/attachment.bin>

