[clang] bac2a07 - clang: Attach !fpmath metadata to __builtin_sqrt based on language flags

Fri Jul 14 15:46:27 PDT 2023

Author: Matt Arsenault
Date: 2023-07-14T18:46:18-04:00
New Revision: bac2a075408377a8aa41f6626b17bb3e471221f3

URL: https://github.com/llvm/llvm-project/commit/bac2a075408377a8aa41f6626b17bb3e471221f3
DIFF: https://github.com/llvm/llvm-project/commit/bac2a075408377a8aa41f6626b17bb3e471221f3.diff

LOG: clang: Attach !fpmath metadata to __builtin_sqrt based on language flags

OpenCL and HIP have -cl-fp32-correctly-rounded-divide-sqrt and
-fno-hip-correctly-rounded-divide-sqrt. The corresponding fpmath metadata
was only set on fdiv, and not sqrt. The backend is currently underutilizing
sqrt lowering options, and the responsibility is split between the libraries
and backend and this metadata is needed.

CUDA/NVCC has -prec-div and -prev-sqrt but clang doesn't appear to be
aiming for compatibility with those. Don't know if OpenMP has a similar
control.

Added: 
    

Modified: 
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/CodeGen/CGExpr.cpp
    clang/lib/CodeGen/CGExprScalar.cpp
    clang/lib/CodeGen/CodeGenFunction.h
    clang/test/CodeGenCUDA/correctly-rounded-div.cu
    clang/test/CodeGenOpenCL/fpmath.cl

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4abce3fdd579af..033f82e63a899f 100644

--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2544,11 +2544,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     case Builtin::BI__builtin_sqrtf:
     case Builtin::BI__builtin_sqrtf16:
     case Builtin::BI__builtin_sqrtl:
-    case Builtin::BI__builtin_sqrtf128:
-      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
-                                   Intrinsic::sqrt,
-                                   Intrinsic::experimental_constrained_sqrt));
-
+    case Builtin::BI__builtin_sqrtf128: {
+      llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
+      SetSqrtFPAccuracy(Call);
+      return RValue::get(Call);
+    }
     case Builtin::BItrunc:
     case Builtin::BItruncf:
     case Builtin::BItruncl:

diff  --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index ae5168d4124bb4..a585abf15f829f 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -5577,6 +5577,48 @@ void CodeGenFunction::SetFPAccuracy(llvm::Value *Val, float Accuracy) {
   cast<llvm::Instruction>(Val)->setMetadata(llvm::LLVMContext::MD_fpmath, Node);
 }
 
+void CodeGenFunction::SetSqrtFPAccuracy(llvm::Value *Val) {
+  llvm::Type *EltTy = Val->getType()->getScalarType();
+  if (!EltTy->isFloatTy())
+    return;
+
+  if ((getLangOpts().OpenCL &&
+       !CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
+      (getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
+       !CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
+    // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 3ulp
+    //
+    // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
+    // build option allows an application to specify that single precision
+    // floating-point divide (x/y and 1/x) and sqrt used in the program
+    // source are correctly rounded.
+    //
+    // TODO: CUDA has a prec-sqrt flag
+    SetFPAccuracy(Val, 3.0f);
+  }
+}
+
+void CodeGenFunction::SetDivFPAccuracy(llvm::Value *Val) {
+  llvm::Type *EltTy = Val->getType()->getScalarType();
+  if (!EltTy->isFloatTy())
+    return;
+
+  if ((getLangOpts().OpenCL &&
+       !CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
+      (getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
+       !CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
+    // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
+    //
+    // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
+    // build option allows an application to specify that single precision
+    // floating-point divide (x/y and 1/x) and sqrt used in the program
+    // source are correctly rounded.
+    //
+    // TODO: CUDA has a prec-div flag
+    SetFPAccuracy(Val, 2.5f);
+  }
+}
+
 namespace {
   struct LValueOrRValue {
     LValue LV;

diff  --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index f579ca296c97ad..8921e5040df11e 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -3478,21 +3478,7 @@ Value *ScalarExprEmitter::EmitDiv(const BinOpInfo &Ops) {
     llvm::Value *Val;
     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Ops.FPFeatures);
     Val = Builder.CreateFDiv(Ops.LHS, Ops.RHS, "div");
-    if ((CGF.getLangOpts().OpenCL &&
-         !CGF.CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
-        (CGF.getLangOpts().HIP && CGF.getLangOpts().CUDAIsDevice &&
-         !CGF.CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
-      // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
-      // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
-      // build option allows an application to specify that single precision
-      // floating-point divide (x/y and 1/x) and sqrt used in the program
-      // source are correctly rounded.
-      llvm::Type *ValTy = Val->getType();
-      if (ValTy->isFloatTy() ||
-          (isa<llvm::VectorType>(ValTy) &&
-           cast<llvm::VectorType>(ValTy)->getElementType()->isFloatTy()))
-        CGF.SetFPAccuracy(Val, 2.5);
-    }
+    CGF.SetDivFPAccuracy(Val);
     return Val;
   }
   else if (Ops.isFixedPointOp())

diff  --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index bcd4ef4520740b..bac1844a994eef 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4708,6 +4708,14 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// point operation, expressed as the maximum relative error in ulp.
   void SetFPAccuracy(llvm::Value *Val, float Accuracy);
 
+  /// Set the minimum required accuracy of the given sqrt operation
+  /// based on CodeGenOpts.
+  void SetSqrtFPAccuracy(llvm::Value *Val);
+
+  /// Set the minimum required accuracy of the given sqrt operation based on
+  /// CodeGenOpts.
+  void SetDivFPAccuracy(llvm::Value *Val);
+
   /// Set the codegen fast-math flags.
   void SetFastMathFlags(FPOptions FPFeatures);
 

diff  --git a/clang/test/CodeGenCUDA/correctly-rounded-div.cu b/clang/test/CodeGenCUDA/correctly-rounded-div.cu
index 4f17220534cada..2455987b410517 100644
--- a/clang/test/CodeGenCUDA/correctly-rounded-div.cu
+++ b/clang/test/CodeGenCUDA/correctly-rounded-div.cu
@@ -32,4 +32,18 @@ __device__ double dpscalardiv(double a, double b) {
   return a / b;
 }
 
-// NCRDIV: ![[MD]] = !{float 2.500000e+00}
+// COMMON-LABEL: @_Z12spscalarsqrt
+// NCRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD:[0-9]+]]
+// CRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}){{$}}
+__device__ float spscalarsqrt(float a) {
+  return __builtin_sqrtf(a);
+}
+
+// COMMON-LABEL: @_Z12dpscalarsqrt
+// COMMON: call contract double @llvm.sqrt.f64(double %{{.+}}){{$}}
+// COMMON-NOT: !fpmath
+__device__ double dpscalarsqrt(double a) {
+  return __builtin_sqrt(a);
+}
+
+// NCRSQRT: ![[MD]] = !{float 2.500000e+00}

diff  --git a/clang/test/CodeGenOpenCL/fpmath.cl b/clang/test/CodeGenOpenCL/fpmath.cl
index 904c508d00f7ee..3f9ea2c88dbc48 100644
--- a/clang/test/CodeGenOpenCL/fpmath.cl
+++ b/clang/test/CodeGenOpenCL/fpmath.cl
@@ -8,7 +8,7 @@ typedef __attribute__(( ext_vector_type(4) )) float float4;
 float spscalardiv(float a, float b) {
   // CHECK: @spscalardiv
   // CHECK: fdiv{{.*}},
-  // NODIVOPT: !fpmath ![[MD:[0-9]+]]
+  // NODIVOPT: !fpmath ![[MD_FDIV:[0-9]+]]
   // DIVOPT-NOT: !fpmath !{{[0-9]+}}
   return a / b;
 }
@@ -16,11 +16,18 @@ float spscalardiv(float a, float b) {
 float4 spvectordiv(float4 a, float4 b) {
   // CHECK: @spvectordiv
   // CHECK: fdiv{{.*}},
-  // NODIVOPT: !fpmath ![[MD]]
+  // NODIVOPT: !fpmath ![[MD_FDIV]]
   // DIVOPT-NOT: !fpmath !{{[0-9]+}}
   return a / b;
 }
 
+float spscalarsqrt(float a) {
+  // CHECK-LABEL: @spscalarsqrt
+  // NODIVOPT: call float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD_SQRT:[0-9]+]]
+  // DIVOPT: call float @llvm.sqrt.f32(float %{{.+}}){{$}}
+  return __builtin_sqrtf(a);
+}
+
 #if __OPENCL_C_VERSION__ >=120
 void printf(constant char* fmt, ...);
 
@@ -34,11 +41,27 @@ void testdbllit(long *val) {
 
 #ifndef NOFP64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+typedef __attribute__(( ext_vector_type(4) )) double double4;
+
 double dpscalardiv(double a, double b) {
   // CHECK: @dpscalardiv
   // CHECK-NOT: !fpmath
   return a / b;
 }
+
+double4 dpvectordiv(double4 a, double4 b) {
+  // CHECK: @dpvectordiv
+  // CHECK-NOT: !fpmath
+  return a / b;
+}
+
+double dpscalarsqrt(double a) {
+  // CHECK-LABEL: @dpscalarsqrt
+  // CHECK: call double @llvm.sqrt.f64(double %{{.+}}){{$}}
+  return __builtin_sqrt(a);
+}
+
 #endif
 
-// NODIVOPT: ![[MD]] = !{float 2.500000e+00}
+// NODIVOPT: ![[MD_FDIV]] = !{float 2.500000e+00}
+// NODIVOPT: ![[MD_SQRT]] = !{float 3.000000e+00}