[llvm] Add constant-folding for unary NVVM intrinsics (PR #141233)

Wed Jul 2 01:11:00 PDT 2025

https://github.com/LewisCrawford updated https://github.com/llvm/llvm-project/pull/141233

>From 026166f597bea6da1c5c69d560471cdb4d879996 Mon Sep 17 00:00:00 2001
From: Lewis Crawford <lcrawford at nvidia.com>
Date: Fri, 23 May 2025 13:21:46 +0000
Subject: [PATCH 1/5] Add constant-folding for unary NVVM intrinsics

Add support for constant-folding numerous NVVM unary arithmetic
intrinsics (including f, d, and ftz_f variants):
  - nvvm.ceil.*
  - nvvm.cos.approx.*
  - nvvm.ex2.approx.*
  - nvvm.fabs.*
  - nvvm.floor.*
  - nvvm.lg2.approx.*
  - nvvm.rcp.*
  - nvvm.round.*
  - nvvm.rsqrt.approx.*
  - nvvm.saturate.*
  - nvvm.sin.approx.*
  - nvvm.sqrt.f
  - nvvm.sqrt.rn.*
  - nvvm.sqrt.approx.*
---
 llvm/include/llvm/IR/NVVMIntrinsicUtils.h     |  121 ++
 llvm/lib/Analysis/ConstantFolding.cpp         |  209 +++-
 .../const-fold-nvvm-unary-arithmetic.ll       | 1003 +++++++++++++++++
 3 files changed, 1329 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll

diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
index ce794e2573637..61beca8613989 100644
--- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -334,6 +334,127 @@ inline bool FMinFMaxIsXorSignAbs(Intrinsic::ID IntrinsicID) {
   return false;
 }
 
+inline bool UnaryMathIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  case Intrinsic::nvvm_ceil_d:
+  case Intrinsic::nvvm_ceil_f:
+  case Intrinsic::nvvm_cos_approx_f:
+  case Intrinsic::nvvm_ex2_approx_d:
+  case Intrinsic::nvvm_ex2_approx_f:
+  case Intrinsic::nvvm_fabs:
+  case Intrinsic::nvvm_floor_d:
+  case Intrinsic::nvvm_floor_f:
+  case Intrinsic::nvvm_lg2_approx_d:
+  case Intrinsic::nvvm_lg2_approx_f:
+  case Intrinsic::nvvm_round_d:
+  case Intrinsic::nvvm_round_f:
+  case Intrinsic::nvvm_rsqrt_approx_d:
+  case Intrinsic::nvvm_rsqrt_approx_f:
+  case Intrinsic::nvvm_saturate_d:
+  case Intrinsic::nvvm_saturate_f:
+  case Intrinsic::nvvm_sin_approx_f:
+  case Intrinsic::nvvm_sqrt_f:
+  case Intrinsic::nvvm_sqrt_rn_d:
+  case Intrinsic::nvvm_sqrt_rn_f:
+  case Intrinsic::nvvm_sqrt_approx_f:
+    return false; // Variants without ftz in their name.
+  case Intrinsic::nvvm_ceil_ftz_f:
+  case Intrinsic::nvvm_cos_approx_ftz_f:
+  case Intrinsic::nvvm_ex2_approx_ftz_f:
+  case Intrinsic::nvvm_fabs_ftz:
+  case Intrinsic::nvvm_floor_ftz_f:
+  case Intrinsic::nvvm_lg2_approx_ftz_f:
+  case Intrinsic::nvvm_round_ftz_f:
+  case Intrinsic::nvvm_rsqrt_approx_ftz_d:
+  case Intrinsic::nvvm_rsqrt_approx_ftz_f:
+  case Intrinsic::nvvm_saturate_ftz_f:
+  case Intrinsic::nvvm_sin_approx_ftz_f:
+  case Intrinsic::nvvm_sqrt_rn_ftz_f:
+  case Intrinsic::nvvm_sqrt_approx_ftz_f:
+    return true; // Variants with ftz in their name.
+  }
+  llvm_unreachable("Checking FTZ flag for invalid unary intrinsic");
+  return false;
+}
+
+inline bool RCPShouldFTZ(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  case Intrinsic::nvvm_rcp_rm_d:
+  case Intrinsic::nvvm_rcp_rm_f:
+  case Intrinsic::nvvm_rcp_rn_d:
+  case Intrinsic::nvvm_rcp_rn_f:
+  case Intrinsic::nvvm_rcp_rp_d:
+  case Intrinsic::nvvm_rcp_rp_f:
+  case Intrinsic::nvvm_rcp_rz_d:
+  case Intrinsic::nvvm_rcp_rz_f:
+    return false; // rcp variants without ftz in their name.
+  case Intrinsic::nvvm_rcp_approx_ftz_d:
+  case Intrinsic::nvvm_rcp_approx_ftz_f:
+  case Intrinsic::nvvm_rcp_rm_ftz_f:
+  case Intrinsic::nvvm_rcp_rn_ftz_f:
+  case Intrinsic::nvvm_rcp_rp_ftz_f:
+  case Intrinsic::nvvm_rcp_rz_ftz_f:
+    return true; // rcp variants with ftz in their name.
+  }
+  llvm_unreachable("Checking FTZ flag for invalid rcp intrinsic");
+  return false;
+}
+
+inline APFloat::roundingMode GetRCPRoundingMode(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  case Intrinsic::nvvm_rcp_rm_f:
+  case Intrinsic::nvvm_rcp_rm_d:
+  case Intrinsic::nvvm_rcp_rm_ftz_f:
+    return APFloat::rmTowardNegative;
+  // The approx variants name no rounding mode; they share .rn's nearest-even.
+  case Intrinsic::nvvm_rcp_approx_ftz_f:
+  case Intrinsic::nvvm_rcp_approx_ftz_d:
+  case Intrinsic::nvvm_rcp_rn_f:
+  case Intrinsic::nvvm_rcp_rn_d:
+  case Intrinsic::nvvm_rcp_rn_ftz_f:
+    return APFloat::rmNearestTiesToEven;
+  // .rp rounds toward +infinity; it must not reuse .rn's nearest-even mode.
+  case Intrinsic::nvvm_rcp_rp_f:
+  case Intrinsic::nvvm_rcp_rp_d:
+  case Intrinsic::nvvm_rcp_rp_ftz_f:
+    return APFloat::rmTowardPositive;
+
+  case Intrinsic::nvvm_rcp_rz_f:
+  case Intrinsic::nvvm_rcp_rz_d:
+  case Intrinsic::nvvm_rcp_rz_ftz_f:
+    return APFloat::rmTowardZero;
+  }
+  llvm_unreachable("Checking rounding mode for invalid rcp intrinsic");
+  return APFloat::roundingMode::Invalid;
+}
+
+inline bool RCPIsApprox(Intrinsic::ID IntrinsicID) {
+  // Reports whether an nvvm.rcp intrinsic is one of the approx variants.
+  switch (IntrinsicID) {
+  // Variants with an explicit rounding mode (rm/rn/rp/rz) are not approx.
+  case Intrinsic::nvvm_rcp_rm_d:
+  case Intrinsic::nvvm_rcp_rm_f:
+  case Intrinsic::nvvm_rcp_rm_ftz_f:
+  case Intrinsic::nvvm_rcp_rn_d:
+  case Intrinsic::nvvm_rcp_rn_f:
+  case Intrinsic::nvvm_rcp_rn_ftz_f:
+  case Intrinsic::nvvm_rcp_rp_d:
+  case Intrinsic::nvvm_rcp_rp_f:
+  case Intrinsic::nvvm_rcp_rp_ftz_f:
+  case Intrinsic::nvvm_rcp_rz_d:
+  case Intrinsic::nvvm_rcp_rz_f:
+  case Intrinsic::nvvm_rcp_rz_ftz_f:
+    return false;
+
+  // Only the rcp.approx.ftz intrinsics are approximate.
+  case Intrinsic::nvvm_rcp_approx_ftz_d:
+  case Intrinsic::nvvm_rcp_approx_ftz_f:
+    return true;
+  }
+  llvm_unreachable("Checking approx flag for invalid rcp intrinsic");
+  return false;
+}
+
 } // namespace nvvm
 } // namespace llvm
 #endif // LLVM_IR_NVVMINTRINSICUTILS_H
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 2476dc58375e5..75f860a28818e 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1776,6 +1776,67 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::nvvm_d2ull_rp:
   case Intrinsic::nvvm_d2ull_rz:
 
+  // NVVM math intrinsics:
+  case Intrinsic::nvvm_ceil_d:
+  case Intrinsic::nvvm_ceil_f:
+  case Intrinsic::nvvm_ceil_ftz_f:
+
+  case Intrinsic::nvvm_cos_approx_f:
+  case Intrinsic::nvvm_cos_approx_ftz_f:
+
+  case Intrinsic::nvvm_ex2_approx_d:
+  case Intrinsic::nvvm_ex2_approx_f:
+  case Intrinsic::nvvm_ex2_approx_ftz_f:
+
+  case Intrinsic::nvvm_fabs:
+  case Intrinsic::nvvm_fabs_ftz:
+
+  case Intrinsic::nvvm_floor_d:
+  case Intrinsic::nvvm_floor_f:
+  case Intrinsic::nvvm_floor_ftz_f:
+
+  case Intrinsic::nvvm_lg2_approx_d:
+  case Intrinsic::nvvm_lg2_approx_f:
+  case Intrinsic::nvvm_lg2_approx_ftz_f:
+
+  case Intrinsic::nvvm_rcp_rm_d:
+  case Intrinsic::nvvm_rcp_rm_f:
+  case Intrinsic::nvvm_rcp_rm_ftz_f:
+  case Intrinsic::nvvm_rcp_rn_d:
+  case Intrinsic::nvvm_rcp_rn_f:
+  case Intrinsic::nvvm_rcp_rn_ftz_f:
+  case Intrinsic::nvvm_rcp_rp_d:
+  case Intrinsic::nvvm_rcp_rp_f:
+  case Intrinsic::nvvm_rcp_rp_ftz_f:
+  case Intrinsic::nvvm_rcp_rz_d:
+  case Intrinsic::nvvm_rcp_rz_f:
+  case Intrinsic::nvvm_rcp_rz_ftz_f:
+  case Intrinsic::nvvm_rcp_approx_ftz_d:
+  case Intrinsic::nvvm_rcp_approx_ftz_f:
+
+  case Intrinsic::nvvm_round_d:
+  case Intrinsic::nvvm_round_f:
+  case Intrinsic::nvvm_round_ftz_f:
+
+  case Intrinsic::nvvm_rsqrt_approx_d:
+  case Intrinsic::nvvm_rsqrt_approx_f:
+  case Intrinsic::nvvm_rsqrt_approx_ftz_d:
+  case Intrinsic::nvvm_rsqrt_approx_ftz_f:
+
+  case Intrinsic::nvvm_saturate_d:
+  case Intrinsic::nvvm_saturate_f:
+  case Intrinsic::nvvm_saturate_ftz_f:
+
+  case Intrinsic::nvvm_sin_approx_f:
+  case Intrinsic::nvvm_sin_approx_ftz_f:
+
+  case Intrinsic::nvvm_sqrt_f:
+  case Intrinsic::nvvm_sqrt_rn_d:
+  case Intrinsic::nvvm_sqrt_rn_f:
+  case Intrinsic::nvvm_sqrt_rn_ftz_f:
+  case Intrinsic::nvvm_sqrt_approx_f:
+  case Intrinsic::nvvm_sqrt_approx_ftz_f:
+
   // Sign operations are actually bitwise operations, they do not raise
   // exceptions even for SNANs.
   case Intrinsic::fabs:
@@ -1791,6 +1852,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::nearbyint:
   case Intrinsic::rint:
   case Intrinsic::canonicalize:
+
   // Constrained intrinsics can be folded if FP environment is known
   // to compiler.
   case Intrinsic::experimental_constrained_fma:
@@ -1944,16 +2006,23 @@ static const APFloat FTZPreserveSign(const APFloat &V) {
   return V;
 }
 
-Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V,
-                         Type *Ty) {
+Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, Type *Ty,
+                         bool ShouldFTZPreservingSign = false) {
   llvm_fenv_clearexcept();
-  double Result = NativeFP(V.convertToDouble());
+  const APFloat Input = ShouldFTZPreservingSign ? FTZPreserveSign(V) : V;
+  double Result = NativeFP(Input.convertToDouble());
   if (llvm_fenv_testexcept()) {
     llvm_fenv_clearexcept();
     return nullptr;
   }
 
-  return GetConstantFoldFPValue(Result, Ty);
+  Constant *Output = GetConstantFoldFPValue(Result, Ty);
+  if (ShouldFTZPreservingSign) { // Apply the same flush to the folded result.
+    const auto *CFP = cast<ConstantFP>(Output); // Checked LLVM-style cast.
+    return ConstantFP::get(Ty->getContext(),
+                           FTZPreserveSign(CFP->getValueAPF()));
+  }
+  return Output;
 }
 
 #if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
@@ -2524,6 +2593,138 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
         return ConstantFoldFP(cosh, APF, Ty);
       case Intrinsic::sqrt:
         return ConstantFoldFP(sqrt, APF, Ty);
+
+      // NVVM Intrinsics:
+      case Intrinsic::nvvm_ceil_ftz_f:
+      case Intrinsic::nvvm_ceil_f:
+      case Intrinsic::nvvm_ceil_d:
+        return ConstantFoldFP(ceil, APF, Ty,
+                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+
+      case Intrinsic::nvvm_cos_approx_ftz_f:
+      case Intrinsic::nvvm_cos_approx_f:
+        return ConstantFoldFP(cos, APF, Ty,
+                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+
+      case Intrinsic::nvvm_ex2_approx_ftz_f:
+      case Intrinsic::nvvm_ex2_approx_d:
+      case Intrinsic::nvvm_ex2_approx_f:
+        return ConstantFoldFP(exp2, APF, Ty,
+                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+
+      case Intrinsic::nvvm_fabs_ftz:
+      case Intrinsic::nvvm_fabs:
+        return ConstantFoldFP(fabs, APF, Ty,
+                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+
+      case Intrinsic::nvvm_floor_ftz_f:
+      case Intrinsic::nvvm_floor_f:
+      case Intrinsic::nvvm_floor_d:
+        return ConstantFoldFP(floor, APF, Ty,
+                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+
+      case Intrinsic::nvvm_lg2_approx_ftz_f:
+      case Intrinsic::nvvm_lg2_approx_d:
+      case Intrinsic::nvvm_lg2_approx_f: {
+        if (APF.isNegative() || APF.isZero())
+          return nullptr;
+        return ConstantFoldFP(log2, APF, Ty,
+                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+      }
+
+      case Intrinsic::nvvm_rcp_rm_ftz_f:
+      case Intrinsic::nvvm_rcp_rn_ftz_f:
+      case Intrinsic::nvvm_rcp_rp_ftz_f:
+      case Intrinsic::nvvm_rcp_rz_ftz_f:
+      case Intrinsic::nvvm_rcp_approx_ftz_f:
+      case Intrinsic::nvvm_rcp_approx_ftz_d:
+      case Intrinsic::nvvm_rcp_rm_d:
+      case Intrinsic::nvvm_rcp_rm_f:
+      case Intrinsic::nvvm_rcp_rn_d:
+      case Intrinsic::nvvm_rcp_rn_f:
+      case Intrinsic::nvvm_rcp_rp_d:
+      case Intrinsic::nvvm_rcp_rp_f:
+      case Intrinsic::nvvm_rcp_rz_d:
+      case Intrinsic::nvvm_rcp_rz_f: {
+        APFloat::roundingMode RoundMode = nvvm::GetRCPRoundingMode(IntrinsicID);
+        bool IsApprox = nvvm::RCPIsApprox(IntrinsicID);
+        bool IsFTZ = nvvm::RCPShouldFTZ(IntrinsicID);
+
+        auto Denominator = IsFTZ ? FTZPreserveSign(APF) : APF;
+        if (IsApprox && Denominator.isZero()) {
+          // According to the PTX spec, approximate rcp should return infinity
+          // with the same sign as the denominator when dividing by 0.
+          APFloat Inf = APFloat::getInf(APF.getSemantics(), APF.isNegative());
+          return ConstantFP::get(Ty->getContext(), Inf);
+        }
+        APFloat Res = APFloat::getOne(APF.getSemantics());
+        APFloat::opStatus Status = Res.divide(Denominator, RoundMode);
+
+        if (Status == APFloat::opOK || Status == APFloat::opInexact) {
+          if (IsFTZ)
+            Res = FTZPreserveSign(Res);
+          return ConstantFP::get(Ty->getContext(), Res);
+        }
+        return nullptr;
+      }
+
+      case Intrinsic::nvvm_round_ftz_f:
+      case Intrinsic::nvvm_round_f:
+      case Intrinsic::nvvm_round_d:
+        return ConstantFoldFP(round, APF, Ty,
+                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+
+      case Intrinsic::nvvm_rsqrt_approx_ftz_d:
+      case Intrinsic::nvvm_rsqrt_approx_ftz_f:
+      case Intrinsic::nvvm_rsqrt_approx_d:
+      case Intrinsic::nvvm_rsqrt_approx_f: {
+        bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID);
+        auto V = IsFTZ ? FTZPreserveSign(APF) : APF;
+        APFloat SqrtV(sqrt(V.convertToDouble()));
+
+        bool lost;
+        SqrtV.convert(APF.getSemantics(), APFloat::rmNearestTiesToEven, &lost);
+
+        APFloat Res = APFloat::getOne(APF.getSemantics());
+        Res.divide(SqrtV, APFloat::rmNearestTiesToEven);
+
+        // We do not need to flush the output for ftz because it is impossible
+        // for 1/sqrt(x) to be a denormal value. If x is the largest fp value,
+        // sqrt(x) will be a number with the exponent approximately halved and
+        // the reciprocal of that number can't be small enough to be denormal.
+        return ConstantFP::get(Ty->getContext(), Res);
+      }
+
+      case Intrinsic::nvvm_saturate_ftz_f:
+      case Intrinsic::nvvm_saturate_d:
+      case Intrinsic::nvvm_saturate_f: {
+        bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID);
+        auto V = IsFTZ ? FTZPreserveSign(APF) : APF;
+        if (V.isNegative() || V.isZero() || V.isNaN())
+          return ConstantFP::getZero(Ty);
+        APFloat One = APFloat::getOne(APF.getSemantics());
+        if (V > One)
+          return ConstantFP::get(Ty->getContext(), One);
+        return ConstantFP::get(Ty->getContext(), APF);
+      }
+
+      case Intrinsic::nvvm_sin_approx_ftz_f:
+      case Intrinsic::nvvm_sin_approx_f:
+        return ConstantFoldFP(sin, APF, Ty,
+                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+
+      case Intrinsic::nvvm_sqrt_rn_ftz_f:
+      case Intrinsic::nvvm_sqrt_approx_ftz_f:
+      case Intrinsic::nvvm_sqrt_f:
+      case Intrinsic::nvvm_sqrt_rn_d:
+      case Intrinsic::nvvm_sqrt_rn_f:
+      case Intrinsic::nvvm_sqrt_approx_f:
+        if (APF.isNegative())
+          return nullptr;
+        return ConstantFoldFP(sqrt, APF, Ty,
+                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+
+      // AMDGCN Intrinsics:
       case Intrinsic::amdgcn_cos:
       case Intrinsic::amdgcn_sin: {
         double V = getValueAsDouble(Op);
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll
new file mode 100644
index 0000000000000..ef71ff95129ab
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll
@@ -0,0 +1,1003 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; Test constant-folding for various NVVM unary arithmetic intrinsics.
+
+;###############################################################
+;#                          Ceil                               #
+;###############################################################
+
+define double @test_ceil_d_1_25() {
+; CHECK-LABEL: define double @test_ceil_d_1_25() {
+; CHECK-NEXT:    ret double 2.000000e+00
+;
+  %res = call double @llvm.nvvm.ceil.d(double 1.25)
+  ret double %res
+}
+
+define float @test_ceil_f_1_25() {
+; CHECK-LABEL: define float @test_ceil_f_1_25() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.ceil.f(float 1.25)
+  ret float %res
+}
+
+define float @test_ceil_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_ceil_ftz_f_1_25() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.ceil.ftz.f(float 1.25)
+  ret float %res
+}
+
+define double @test_ceil_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_ceil_d_pos_subnorm() {
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %res = call double @llvm.nvvm.ceil.d(double 0x380FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_ceil_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_ceil_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %res = call float @llvm.nvvm.ceil.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_ceil_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_ceil_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %res = call float @llvm.nvvm.ceil.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+;###############################################################
+;#                        Cos Approx                           #
+;###############################################################
+
+define float @test_cos_approx_f_1_25() {
+; CHECK-LABEL: define float @test_cos_approx_f_1_25() {
+; CHECK-NEXT:    ret float 0x3FD42E3DE0000000
+;
+  %res = call float @llvm.nvvm.cos.approx.f(float 1.25)
+  ret float %res
+}
+
+define float @test_cos_approx_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_cos_approx_ftz_f_1_25() {
+; CHECK-NEXT:    ret float 0x3FD42E3DE0000000
+;
+  %res = call float @llvm.nvvm.cos.approx.ftz.f(float 1.25)
+  ret float %res
+}
+
+define float @test_cos_approx_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_cos_approx_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %res = call float @llvm.nvvm.cos.approx.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_cos_approx_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_cos_approx_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %res = call float @llvm.nvvm.cos.approx.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+;###############################################################
+;#                        Ex2 Approx                           #
+;###############################################################
+
+define double @test_ex2_approx_d_1_25() {
+; CHECK-LABEL: define double @test_ex2_approx_d_1_25() {
+; CHECK-NEXT:    ret double 0x400306FE0A31B715
+;
+  %res = call double @llvm.nvvm.ex2.approx.d(double 1.25)
+  ret double %res
+}
+
+define float @test_ex2_approx_f_1_25() {
+; CHECK-LABEL: define float @test_ex2_approx_f_1_25() {
+; CHECK-NEXT:    ret float 0x400306FE00000000
+;
+  %res = call float @llvm.nvvm.ex2.approx.f(float 1.25)
+  ret float %res
+}
+
+define float @test_ex2_approx_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_ex2_approx_ftz_f_1_25() {
+; CHECK-NEXT:    ret float 0x400306FE00000000
+;
+  %res = call float @llvm.nvvm.ex2.approx.ftz.f(float 1.25)
+  ret float %res
+}
+
+define double @test_ex2_approx_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_ex2_approx_d_pos_subnorm() {
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %res = call double @llvm.nvvm.ex2.approx.d(double 0x380FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_ex2_approx_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_ex2_approx_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %res = call float @llvm.nvvm.ex2.approx.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_ex2_approx_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_ex2_approx_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %res = call float @llvm.nvvm.ex2.approx.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+;###############################################################
+;#                          FAbs                               #
+;###############################################################
+
+define float @test_fabs_neg_1_5() {
+; CHECK-LABEL: define float @test_fabs_neg_1_5() {
+; CHECK-NEXT:    ret float 1.500000e+00
+;
+  %res = call float @llvm.nvvm.fabs(float -1.5)
+  ret float %res
+}
+
+define float @test_fabs_ftz_neg_1_5() {
+; CHECK-LABEL: define float @test_fabs_ftz_neg_1_5() {
+; CHECK-NEXT:    ret float 1.500000e+00
+;
+  %res = call float @llvm.nvvm.fabs.ftz(float -1.5)
+  ret float %res
+}
+
+define float @test_fabs_1_25() {
+; CHECK-LABEL: define float @test_fabs_1_25() {
+; CHECK-NEXT:    ret float 1.250000e+00
+;
+  %res = call float @llvm.nvvm.fabs(float 1.25)
+  ret float %res
+}
+
+define float @test_fabs_ftz_1_25() {
+; CHECK-LABEL: define float @test_fabs_ftz_1_25() {
+; CHECK-NEXT:    ret float 1.250000e+00
+;
+  %res = call float @llvm.nvvm.fabs.ftz(float 1.25)
+  ret float %res
+}
+
+define float @test_fabs_neg_subnorm() {
+; CHECK-LABEL: define float @test_fabs_neg_subnorm() {
+; CHECK-NEXT:    ret float 0x380FFFFFC0000000
+;
+  %res = call float @llvm.nvvm.fabs(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_fabs_ftz_neg_subnorm() {
+; CHECK-LABEL: define float @test_fabs_ftz_neg_subnorm() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %res = call float @llvm.nvvm.fabs.ftz(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_fabs_pos_subnorm() {
+; CHECK-LABEL: define float @test_fabs_pos_subnorm() {
+; CHECK-NEXT:    ret float 0x380FFFFFC0000000
+;
+  %res = call float @llvm.nvvm.fabs(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_fabs_ftz_pos_subnorm() {
+; CHECK-LABEL: define float @test_fabs_ftz_pos_subnorm() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %res = call float @llvm.nvvm.fabs.ftz(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+
+;###############################################################
+;#                          Floor                              #
+;###############################################################
+
+define double @test_floor_d_1_25() {
+; CHECK-LABEL: define double @test_floor_d_1_25() {
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %res = call double @llvm.nvvm.floor.d(double 1.25)
+  ret double %res
+}
+
+define float @test_floor_f_1_25() {
+; CHECK-LABEL: define float @test_floor_f_1_25() {
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %res = call float @llvm.nvvm.floor.f(float 1.25)
+  ret float %res
+}
+
+define float @test_floor_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_floor_ftz_f_1_25() {
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %res = call float @llvm.nvvm.floor.ftz.f(float 1.25)
+  ret float %res
+}
+
+define double @test_floor_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_floor_d_neg_subnorm() {
+; CHECK-NEXT:    ret double -1.000000e+00
+;
+  %res = call double @llvm.nvvm.floor.d(double 0xB80FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_floor_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_floor_f_neg_subnorm() {
+; CHECK-NEXT:    ret float -1.000000e+00
+;
+  %res = call float @llvm.nvvm.floor.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_floor_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_floor_ftz_f_neg_subnorm() {
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %res = call float @llvm.nvvm.floor.ftz.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+;###############################################################
+;#                        Lg2 Approx                           #
+;###############################################################
+
+define double @test_lg2_approx_d_1_25() {
+; CHECK-LABEL: define double @test_lg2_approx_d_1_25() {
+; CHECK-NEXT:    ret double 0x3FD49A784BCD1B8B
+;
+  %res = call double @llvm.nvvm.lg2.approx.d(double 1.25)
+  ret double %res
+}
+
+define float @test_lg2_approx_f_1_25() {
+; CHECK-LABEL: define float @test_lg2_approx_f_1_25() {
+; CHECK-NEXT:    ret float 0x3FD49A7840000000
+;
+  %res = call float @llvm.nvvm.lg2.approx.f(float 1.25)
+  ret float %res
+}
+
+define float @test_lg2_approx_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_lg2_approx_ftz_f_1_25() {
+; CHECK-NEXT:    ret float 0x3FD49A7840000000
+;
+  %res = call float @llvm.nvvm.lg2.approx.ftz.f(float 1.25)
+  ret float %res
+}
+
+define double @test_lg2_approx_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_lg2_approx_d_pos_subnorm() {
+; CHECK-NEXT:    ret double 0xC05F800000B8AA3C
+;
+  %res = call double @llvm.nvvm.lg2.approx.d(double 0x380FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_lg2_approx_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_lg2_approx_f_pos_subnorm() {
+; CHECK-NEXT:    ret float -1.260000e+02
+;
+  %res = call float @llvm.nvvm.lg2.approx.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_lg2_approx_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_lg2_approx_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.nvvm.lg2.approx.ftz.f(float 0x380FFFFFC0000000)
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %res = call float @llvm.nvvm.lg2.approx.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+;###############################################################
+;#                            Rcp                              #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                       rcp_rm                                |
+;+-------------------------------------------------------------+
+define double @test_rcp_rm_d_0_5() {
+; CHECK-LABEL: define double @test_rcp_rm_d_0_5() {
+; CHECK-NEXT:    ret double 2.000000e+00
+;
+  %res = call double @llvm.nvvm.rcp.rm.d(double 0.5)
+  ret double %res
+}
+
+define float @test_rcp_rm_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rm_f_0_5() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.rcp.rm.f(float 0.5)
+  ret float %res
+}
+
+define float @test_rcp_rm_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rm_ftz_f_0_5() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.rcp.rm.ftz.f(float 0.5)
+  ret float %res
+}
+
+define double @test_rcp_rm_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_rcp_rm_d_neg_subnorm() {
+; CHECK-NEXT:    ret double 0xC7D0000020000041
+;
+  %res = call double @llvm.nvvm.rcp.rm.d(double 0xB80FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_rcp_rm_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rm_f_neg_subnorm() {
+; CHECK-NEXT:    ret float 0xC7D0000040000000
+;
+  %res = call float @llvm.nvvm.rcp.rm.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_rcp_rm_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rm_ftz_f_neg_subnorm() {
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.nvvm.rcp.rm.ftz.f(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %res = call float @llvm.nvvm.rcp.rm.ftz.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+;+-------------------------------------------------------------+
+;|                       rcp_rn                                |
+;+-------------------------------------------------------------+
+define double @test_rcp_rn_d_0_5() {
+; CHECK-LABEL: define double @test_rcp_rn_d_0_5() {
+; CHECK-NEXT:    ret double 2.000000e+00
+;
+  %res = call double @llvm.nvvm.rcp.rn.d(double 0.5)
+  ret double %res
+}
+
+define float @test_rcp_rn_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rn_f_0_5() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.rcp.rn.f(float 0.5)
+  ret float %res
+}
+
+define float @test_rcp_rn_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rn_ftz_f_0_5() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.rcp.rn.ftz.f(float 0.5)
+  ret float %res
+}
+
+define double @test_rcp_rn_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_rcp_rn_d_neg_subnorm() {
+; CHECK-NEXT:    ret double 0xC7D0000020000040
+;
+  %res = call double @llvm.nvvm.rcp.rn.d(double 0xB80FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_rcp_rn_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rn_f_neg_subnorm() {
+; CHECK-NEXT:    ret float 0xC7D0000020000000
+;
+  %res = call float @llvm.nvvm.rcp.rn.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_rcp_rn_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rn_ftz_f_neg_subnorm() {
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.nvvm.rcp.rn.ftz.f(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %res = call float @llvm.nvvm.rcp.rn.ftz.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+;+-------------------------------------------------------------+
+;|                       rcp_rp                                |
+;+-------------------------------------------------------------+
+define double @test_rcp_rp_d_0_5() {
+; CHECK-LABEL: define double @test_rcp_rp_d_0_5() {
+; CHECK-NEXT:    ret double 2.000000e+00
+;
+  %res = call double @llvm.nvvm.rcp.rp.d(double 0.5)
+  ret double %res
+}
+
+define float @test_rcp_rp_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rp_f_0_5() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.rcp.rp.f(float 0.5)
+  ret float %res
+}
+
+define float @test_rcp_rp_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rp_ftz_f_0_5() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.rcp.rp.ftz.f(float 0.5)
+  ret float %res
+}
+
+define double @test_rcp_rp_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_rcp_rp_d_neg_subnorm() {
+; CHECK-NEXT:    ret double 0xC7D0000020000040
+;
+  %res = call double @llvm.nvvm.rcp.rp.d(double 0xB80FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_rcp_rp_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rp_f_neg_subnorm() {
+; CHECK-NEXT:    ret float 0xC7D0000020000000
+;
+  %res = call float @llvm.nvvm.rcp.rp.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_rcp_rp_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rp_ftz_f_neg_subnorm() {
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.nvvm.rcp.rp.ftz.f(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %res = call float @llvm.nvvm.rcp.rp.ftz.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+;+-------------------------------------------------------------+
+;|                       rcp_rz                                |
+;+-------------------------------------------------------------+
+define double @test_rcp_rz_d_0_5() {
+; CHECK-LABEL: define double @test_rcp_rz_d_0_5() {
+; CHECK-NEXT:    ret double 2.000000e+00
+;
+  %res = call double @llvm.nvvm.rcp.rz.d(double 0.5)
+  ret double %res
+}
+
+define float @test_rcp_rz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rz_f_0_5() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.rcp.rz.f(float 0.5)
+  ret float %res
+}
+
+define float @test_rcp_rz_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_rz_ftz_f_0_5() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.rcp.rz.ftz.f(float 0.5)
+  ret float %res
+}
+
+define double @test_rcp_rz_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_rcp_rz_d_neg_subnorm() {
+; CHECK-NEXT:    ret double 0xC7D0000020000040
+;
+  %res = call double @llvm.nvvm.rcp.rz.d(double 0xB80FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_rcp_rz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rz_f_neg_subnorm() {
+; CHECK-NEXT:    ret float 0xC7D0000020000000
+;
+  %res = call float @llvm.nvvm.rcp.rz.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_rcp_rz_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_rz_ftz_f_neg_subnorm() {
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.nvvm.rcp.rz.ftz.f(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %res = call float @llvm.nvvm.rcp.rz.ftz.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+;+-------------------------------------------------------------+
+;|                     rcp_approx_ftz                          |
+;+-------------------------------------------------------------+
+
+define double @test_rcp_approx_ftz_d_0_5() {
+; CHECK-LABEL: define double @test_rcp_approx_ftz_d_0_5() {
+; CHECK-NEXT:    ret double 2.000000e+00
+;
+  %res = call double @llvm.nvvm.rcp.approx.ftz.d(double 0.5)
+  ret double %res
+}
+
+define float @test_rcp_approx_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_rcp_approx_ftz_f_0_5() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.rcp.approx.ftz.f(float 0.5)
+  ret float %res
+}
+
+define double @test_rcp_approx_ftz_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_rcp_approx_ftz_d_neg_subnorm() {
+; CHECK-NEXT:    ret double 0xC7D0000020000040
+;
+  %res = call double @llvm.nvvm.rcp.approx.ftz.d(double 0xB80FFFFFC0000000)
+  ret double %res
+}
+
+
+define float @test_rcp_approx_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_rcp_approx_ftz_f_neg_subnorm() {
+; CHECK-NEXT:    ret float 0xFFF0000000000000
+;
+  %res = call float @llvm.nvvm.rcp.approx.ftz.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+define double @test_rcp_approx_ftz_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_rcp_approx_ftz_d_pos_subnorm() {
+; CHECK-NEXT:    ret double 0x47D0000020000040
+;
+  %res = call double @llvm.nvvm.rcp.approx.ftz.d(double 0x380FFFFFC0000000)
+  ret double %res
+}
+
+
+define float @test_rcp_approx_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_rcp_approx_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0x7FF0000000000000
+;
+  %res = call float @llvm.nvvm.rcp.approx.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+
+;###############################################################
+;#                          Round                              #
+;###############################################################
+
+define double @test_round_d_neg_1_5() {
+; CHECK-LABEL: define double @test_round_d_neg_1_5() {
+; CHECK-NEXT:    ret double -2.000000e+00
+;
+  %res = call double @llvm.nvvm.round.d(double -1.5)
+  ret double %res
+}
+
+define float @test_round_f_neg_1_5() {
+; CHECK-LABEL: define float @test_round_f_neg_1_5() {
+; CHECK-NEXT:    ret float -2.000000e+00
+;
+  %res = call float @llvm.nvvm.round.f(float -1.5)
+  ret float %res
+}
+
+define float @test_round_ftz_f_neg_1_5() {
+; CHECK-LABEL: define float @test_round_ftz_f_neg_1_5() {
+; CHECK-NEXT:    ret float -2.000000e+00
+;
+  %res = call float @llvm.nvvm.round.ftz.f(float -1.5)
+  ret float %res
+}
+
+define double @test_round_d_neg_subnorm() {
+; CHECK-LABEL: define double @test_round_d_neg_subnorm() {
+; CHECK-NEXT:    ret double -0.000000e+00
+;
+  %res = call double @llvm.nvvm.round.d(double 0xB80FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_round_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_round_f_neg_subnorm() {
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %res = call float @llvm.nvvm.round.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_round_ftz_f_neg_subnorm() {
+; CHECK-LABEL: define float @test_round_ftz_f_neg_subnorm() {
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %res = call float @llvm.nvvm.round.ftz.f(float 0xB80FFFFFC0000000)
+  ret float %res
+}
+
+;###############################################################
+;#                     RSqrt Approx                            #
+;###############################################################
+
+define double @test_rsqrt_approx_d_1_25() {
+; CHECK-LABEL: define double @test_rsqrt_approx_d_1_25() {
+; CHECK-NEXT:    ret double 0x3FEC9F25C5BFEDD9
+;
+  %res = call double @llvm.nvvm.rsqrt.approx.d(double 1.25)
+  ret double %res
+}
+
+define float @test_rsqrt_approx_f_1_25() {
+; CHECK-LABEL: define float @test_rsqrt_approx_f_1_25() {
+; CHECK-NEXT:    ret float 0x3FEC9F25C0000000
+;
+  %res = call float @llvm.nvvm.rsqrt.approx.f(float 1.25)
+  ret float %res
+}
+
+define double @test_rsqrt_approx_ftz_d_1_25() {
+; CHECK-LABEL: define double @test_rsqrt_approx_ftz_d_1_25() {
+; CHECK-NEXT:    ret double 0x3FEC9F25C5BFEDD9
+;
+  %res = call double @llvm.nvvm.rsqrt.approx.ftz.d(double 1.25)
+  ret double %res
+}
+
+define float @test_rsqrt_approx_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_rsqrt_approx_ftz_f_1_25() {
+; CHECK-NEXT:    ret float 0x3FEC9F25C0000000
+;
+  %res = call float @llvm.nvvm.rsqrt.approx.ftz.f(float 1.25)
+  ret float %res
+}
+
+define double @test_rsqrt_approx_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_rsqrt_approx_d_pos_subnorm() {
+; CHECK-NEXT:    ret double 0x43E0000010000018
+;
+  %res = call double @llvm.nvvm.rsqrt.approx.d(double 0x380FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_rsqrt_approx_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_rsqrt_approx_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0x43E0000020000000
+;
+  %res = call float @llvm.nvvm.rsqrt.approx.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define double @test_rsqrt_approx_ftz_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_rsqrt_approx_ftz_d_pos_subnorm() {
+; CHECK-NEXT:    ret double 0x43E0000010000018
+;
+  %res = call double @llvm.nvvm.rsqrt.approx.ftz.d(double 0x380FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_rsqrt_approx_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_rsqrt_approx_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0x7FF0000000000000
+;
+  %res = call float @llvm.nvvm.rsqrt.approx.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+;###############################################################
+;#                        Saturate                             #
+;###############################################################
+
+define double @test_saturate_d_1_25() {
+; CHECK-LABEL: define double @test_saturate_d_1_25() {
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %res = call double @llvm.nvvm.saturate.d(double 1.25)
+  ret double %res
+}
+
+define float @test_saturate_f_1_25() {
+; CHECK-LABEL: define float @test_saturate_f_1_25() {
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %res = call float @llvm.nvvm.saturate.f(float 1.25)
+  ret float %res
+}
+
+define float @test_saturate_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_saturate_ftz_f_1_25() {
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %res = call float @llvm.nvvm.saturate.ftz.f(float 1.25)
+  ret float %res
+}
+
+define double @test_saturate_d_neg_1_25() {
+; CHECK-LABEL: define double @test_saturate_d_neg_1_25() {
+; CHECK-NEXT:    ret double 0.000000e+00
+;
+  %res = call double @llvm.nvvm.saturate.d(double -1.25)
+  ret double %res
+}
+
+define float @test_saturate_f_neg_1_25() {
+; CHECK-LABEL: define float @test_saturate_f_neg_1_25() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %res = call float @llvm.nvvm.saturate.f(float -1.25)
+  ret float %res
+}
+
+define float @test_saturate_ftz_f_neg_1_25() {
+; CHECK-LABEL: define float @test_saturate_ftz_f_neg_1_25() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %res = call float @llvm.nvvm.saturate.ftz.f(float -1.25)
+  ret float %res
+}
+
+define double @test_saturate_d_0_5() {
+; CHECK-LABEL: define double @test_saturate_d_0_5() {
+; CHECK-NEXT:    ret double 5.000000e-01
+;
+  %res = call double @llvm.nvvm.saturate.d(double 0.5)
+  ret double %res
+}
+
+define float @test_saturate_f_0_5() {
+; CHECK-LABEL: define float @test_saturate_f_0_5() {
+; CHECK-NEXT:    ret float 5.000000e-01
+;
+  %res = call float @llvm.nvvm.saturate.f(float 0.5)
+  ret float %res
+}
+
+define float @test_saturate_ftz_f_0_5() {
+; CHECK-LABEL: define float @test_saturate_ftz_f_0_5() {
+; CHECK-NEXT:    ret float 5.000000e-01
+;
+  %res = call float @llvm.nvvm.saturate.ftz.f(float 0.5)
+  ret float %res
+}
+
+define double @test_saturate_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_saturate_d_pos_subnorm() {
+; CHECK-NEXT:    ret double 0x380FFFFFC0000000
+;
+  %res = call double @llvm.nvvm.saturate.d(double 0x380FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_saturate_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_saturate_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0x380FFFFFC0000000
+;
+  %res = call float @llvm.nvvm.saturate.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_saturate_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_saturate_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %res = call float @llvm.nvvm.saturate.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+
+;###############################################################
+;#                        Sin Approx                           #
+;###############################################################
+
+define float @test_sin_approx_f_1_25() {
+; CHECK-LABEL: define float @test_sin_approx_f_1_25() {
+; CHECK-NEXT:    ret float 0x3FEE5E1500000000
+;
+  %res = call float @llvm.nvvm.sin.approx.f(float 1.25)
+  ret float %res
+}
+
+define float @test_sin_approx_ftz_f_1_25() {
+; CHECK-LABEL: define float @test_sin_approx_ftz_f_1_25() {
+; CHECK-NEXT:    ret float 0x3FEE5E1500000000
+;
+  %res = call float @llvm.nvvm.sin.approx.ftz.f(float 1.25)
+  ret float %res
+}
+
+define float @test_sin_approx_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sin_approx_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0x380FFFFFC0000000
+;
+  %res = call float @llvm.nvvm.sin.approx.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_sin_approx_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sin_approx_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %res = call float @llvm.nvvm.sin.approx.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+;###############################################################
+;#                          Sqrt                               #
+;###############################################################
+
+define float @test_sqrt_f_4() {
+; CHECK-LABEL: define float @test_sqrt_f_4() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.sqrt.f(float 4.0)
+  ret float %res
+}
+
+define float @test_sqrt_rn_f_4() {
+; CHECK-LABEL: define float @test_sqrt_rn_f_4() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.sqrt.rn.f(float 4.0)
+  ret float %res
+}
+
+define double @test_sqrt_rn_d_4() {
+; CHECK-LABEL: define double @test_sqrt_rn_d_4() {
+; CHECK-NEXT:    ret double 2.000000e+00
+;
+  %res = call double @llvm.nvvm.sqrt.rn.d(double 4.0)
+  ret double %res
+}
+
+define float @test_sqrt_rn_ftz_f_4() {
+; CHECK-LABEL: define float @test_sqrt_rn_ftz_f_4() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.sqrt.rn.ftz.f(float 4.0)
+  ret float %res
+}
+
+define float @test_sqrt_approx_f_4() {
+; CHECK-LABEL: define float @test_sqrt_approx_f_4() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.sqrt.approx.f(float 4.0)
+  ret float %res
+}
+
+define float @test_sqrt_approx_ftz_f_4() {
+; CHECK-LABEL: define float @test_sqrt_approx_ftz_f_4() {
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %res = call float @llvm.nvvm.sqrt.approx.ftz.f(float 4.0)
+  ret float %res
+}
+
+define float @test_sqrt_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sqrt_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0x3BFFFFFFE0000000
+;
+  %res = call float @llvm.nvvm.sqrt.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_sqrt_rn_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sqrt_rn_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0x3BFFFFFFE0000000
+;
+  %res = call float @llvm.nvvm.sqrt.rn.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define double @test_sqrt_rn_d_pos_subnorm() {
+; CHECK-LABEL: define double @test_sqrt_rn_d_pos_subnorm() {
+; CHECK-NEXT:    ret double 0x3BFFFFFFDFFFFFF0
+;
+  %res = call double @llvm.nvvm.sqrt.rn.d(double 0x380FFFFFC0000000)
+  ret double %res
+}
+
+define float @test_sqrt_rn_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sqrt_rn_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %res = call float @llvm.nvvm.sqrt.rn.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_sqrt_approx_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sqrt_approx_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0x3BFFFFFFE0000000
+;
+  %res = call float @llvm.nvvm.sqrt.approx.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+define float @test_sqrt_approx_ftz_f_pos_subnorm() {
+; CHECK-LABEL: define float @test_sqrt_approx_ftz_f_pos_subnorm() {
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %res = call float @llvm.nvvm.sqrt.approx.ftz.f(float 0x380FFFFFC0000000)
+  ret float %res
+}
+
+
+declare double @llvm.nvvm.ceil.d(double)
+declare float @llvm.nvvm.ceil.f(float)
+declare float @llvm.nvvm.ceil.ftz.f(float)
+
+declare float @llvm.nvvm.cos.approx.f(float)
+declare float @llvm.nvvm.cos.approx.ftz.f(float)
+
+declare double @llvm.nvvm.ex2.approx.d(double)
+declare float @llvm.nvvm.ex2.approx.f(float)
+declare float @llvm.nvvm.ex2.approx.ftz.f(float)
+
+declare float @llvm.nvvm.fabs(float)
+declare float @llvm.nvvm.fabs.ftz(float)
+
+declare double @llvm.nvvm.floor.d(double)
+declare float @llvm.nvvm.floor.f(float)
+declare float @llvm.nvvm.floor.ftz.f(float)
+
+declare double @llvm.nvvm.lg2.approx.d(double)
+declare float @llvm.nvvm.lg2.approx.f(float)
+declare float @llvm.nvvm.lg2.approx.ftz.f(float)
+
+declare double @llvm.nvvm.rcp.rm.d(double)
+declare float @llvm.nvvm.rcp.rm.f(float)
+declare float @llvm.nvvm.rcp.rm.ftz.f(float)
+declare double @llvm.nvvm.rcp.rn.d(double)
+declare float @llvm.nvvm.rcp.rn.f(float)
+declare float @llvm.nvvm.rcp.rn.ftz.f(float)
+declare double @llvm.nvvm.rcp.rp.d(double)
+declare float @llvm.nvvm.rcp.rp.f(float)
+declare float @llvm.nvvm.rcp.rp.ftz.f(float)
+declare double @llvm.nvvm.rcp.rz.d(double)
+declare float @llvm.nvvm.rcp.rz.f(float)
+declare float @llvm.nvvm.rcp.rz.ftz.f(float)
+declare double @llvm.nvvm.rcp.approx.ftz.d(double)
+declare float @llvm.nvvm.rcp.approx.ftz.f(float)
+
+declare double @llvm.nvvm.round.d(double)
+declare float @llvm.nvvm.round.f(float)
+declare float @llvm.nvvm.round.ftz.f(float)
+
+declare double @llvm.nvvm.rsqrt.approx.d(double)
+declare float @llvm.nvvm.rsqrt.approx.f(float)
+declare double @llvm.nvvm.rsqrt.approx.ftz.d(double)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float)
+
+declare double @llvm.nvvm.saturate.d(double)
+declare float @llvm.nvvm.saturate.f(float)
+declare float @llvm.nvvm.saturate.ftz.f(float)
+
+declare float @llvm.nvvm.sin.approx.f(float)
+declare float @llvm.nvvm.sin.approx.ftz.f(float)
+
+declare float @llvm.nvvm.sqrt.f(float)
+declare double @llvm.nvvm.sqrt.rn.d(double)
+declare float @llvm.nvvm.sqrt.rn.f(float)
+declare float @llvm.nvvm.sqrt.rn.ftz.f(float)
+declare float @llvm.nvvm.sqrt.approx.f(float)
+declare float @llvm.nvvm.sqrt.approx.ftz.f(float)

>From d8188fb8c1edd04a34c2299c5c29d3b92037c956 Mon Sep 17 00:00:00 2001
From: Lewis Crawford <lcrawford at nvidia.com>
Date: Fri, 30 May 2025 11:58:17 +0000
Subject: [PATCH 2/5] Fix _approx_ftz_d implementations for rcp + rsqrt

Fix the rcp_approx_ftz_d and rsqrt_approx_ftz_d implementations
to better match the PTX spec, which states that the lower 32 bits
of the mantissa should be zeroed for both the inputs and the outputs.
---
 llvm/lib/Analysis/ConstantFolding.cpp         | 27 +++++++++++++++++--
 .../const-fold-nvvm-unary-arithmetic.ll       |  8 +++---
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 75f860a28818e..f1c69381473d3 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2006,6 +2006,15 @@ static const APFloat FTZPreserveSign(const APFloat &V) {
   return V;
 }
 
+// Get only the upper word of the input double in 1.11.20 format
+// by making the lower 32-bits of the mantissa all 0.
+static const APFloat ZeroLower32Bits(const APFloat &V) {
+  assert(V.getSizeInBits(V.getSemantics()) == 64);
+  uint64_t DoubleBits = V.bitcastToAPInt().getZExtValue();
+  DoubleBits &= 0xffffffff00000000;
+  return APFloat(V.getSemantics(), APInt(64, DoubleBits, false, false));
+}
+
 Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, Type *Ty,
                          bool ShouldFTZPreservingSign = false) {
   llvm_fenv_clearexcept();
@@ -2651,6 +2660,8 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
         bool IsFTZ = nvvm::RCPShouldFTZ(IntrinsicID);
 
         auto Denominator = IsFTZ ? FTZPreserveSign(APF) : APF;
+        if (IntrinsicID == Intrinsic::nvvm_rcp_approx_ftz_d)
+          Denominator = ZeroLower32Bits(Denominator);
         if (IsApprox && Denominator.isZero()) {
           // According to the PTX spec, approximate rcp should return infinity
           // with the same sign as the denominator when dividing by 0.
@@ -2663,6 +2674,8 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
         if (Status == APFloat::opOK || Status == APFloat::opInexact) {
           if (IsFTZ)
             Res = FTZPreserveSign(Res);
+          if (IntrinsicID == Intrinsic::nvvm_rcp_approx_ftz_d)
+            Res = ZeroLower32Bits(Res);
           return ConstantFP::get(Ty->getContext(), Res);
         }
         return nullptr;
@@ -2680,14 +2693,24 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       case Intrinsic::nvvm_rsqrt_approx_f: {
         bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID);
         auto V = IsFTZ ? FTZPreserveSign(APF) : APF;
+
+        if (IntrinsicID == Intrinsic::nvvm_rsqrt_approx_ftz_d)
+          V = ZeroLower32Bits(V);
+
         APFloat SqrtV(sqrt(V.convertToDouble()));
 
-        bool lost;
-        SqrtV.convert(APF.getSemantics(), APFloat::rmNearestTiesToEven, &lost);
+        if (Ty->isFloatTy()) {
+          bool lost;
+          SqrtV.convert(APF.getSemantics(), APFloat::rmNearestTiesToEven,
+                        &lost);
+        }
 
         APFloat Res = APFloat::getOne(APF.getSemantics());
         Res.divide(SqrtV, APFloat::rmNearestTiesToEven);
 
+        if (IntrinsicID == Intrinsic::nvvm_rsqrt_approx_ftz_d)
+          Res = ZeroLower32Bits(Res);
+
         // We do not need to flush the output for ftz because it is impossible
         // for 1/sqrt(x) to be a denormal value. If x is the largest fp value,
         // sqrt(x) will be a number with the exponent approximately halved and
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll
index ef71ff95129ab..c6a51b345650f 100644
--- a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-unary-arithmetic.ll
@@ -551,7 +551,7 @@ define float @test_rcp_approx_ftz_f_0_5() {
 
 define double @test_rcp_approx_ftz_d_neg_subnorm() {
 ; CHECK-LABEL: define double @test_rcp_approx_ftz_d_neg_subnorm() {
-; CHECK-NEXT:    ret double 0xC7D0000020000040
+; CHECK-NEXT:    ret double 0xC7D0000000000000
 ;
   %res = call double @llvm.nvvm.rcp.approx.ftz.d(double 0xB80FFFFFC0000000)
   ret double %res
@@ -568,7 +568,7 @@ define float @test_rcp_approx_ftz_f_neg_subnorm() {
 
 define double @test_rcp_approx_ftz_d_pos_subnorm() {
 ; CHECK-LABEL: define double @test_rcp_approx_ftz_d_pos_subnorm() {
-; CHECK-NEXT:    ret double 0x47D0000020000040
+; CHECK-NEXT:    ret double 0x47D0000000000000
 ;
   %res = call double @llvm.nvvm.rcp.approx.ftz.d(double 0x380FFFFFC0000000)
   ret double %res
@@ -658,7 +658,7 @@ define float @test_rsqrt_approx_f_1_25() {
 
 define double @test_rsqrt_approx_ftz_d_1_25() {
 ; CHECK-LABEL: define double @test_rsqrt_approx_ftz_d_1_25() {
-; CHECK-NEXT:    ret double 0x3FEC9F25C5BFEDD9
+; CHECK-NEXT:    ret double 0x3FEC9F2500000000
 ;
   %res = call double @llvm.nvvm.rsqrt.approx.ftz.d(double 1.25)
   ret double %res
@@ -690,7 +690,7 @@ define float @test_rsqrt_approx_f_pos_subnorm() {
 
 define double @test_rsqrt_approx_ftz_d_pos_subnorm() {
 ; CHECK-LABEL: define double @test_rsqrt_approx_ftz_d_pos_subnorm() {
-; CHECK-NEXT:    ret double 0x43E0000010000018
+; CHECK-NEXT:    ret double 0x43E0000000000000
 ;
   %res = call double @llvm.nvvm.rsqrt.approx.ftz.d(double 0x380FFFFFC0000000)
   ret double %res

>From 54654e840ef236efd30ab7fba6c3d7769fd7f556 Mon Sep 17 00:00:00 2001
From: Lewis Crawford <lcrawford at nvidia.com>
Date: Mon, 30 Jun 2025 11:49:44 +0000
Subject: [PATCH 3/5] Fix rcp.rp rounding modes

rcp.rp was incorrectly folded with round-to-nearest-even (i.e. as
rcp.rn) instead of round-toward-positive, due to a bug in
GetRCPRoundingMode in NVVMIntrinsicUtils.h.
---
 llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
index 8f0aa07fff3f0..139e1d4fa48ab 100644
--- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -426,7 +426,7 @@ inline APFloat::roundingMode GetRCPRoundingMode(Intrinsic::ID IntrinsicID) {
   case Intrinsic::nvvm_rcp_rp_f:
   case Intrinsic::nvvm_rcp_rp_d:
   case Intrinsic::nvvm_rcp_rp_ftz_f:
-    return APFloat::rmNearestTiesToEven;
+    return APFloat::rmTowardPositive;
 
   case Intrinsic::nvvm_rcp_rz_f:
   case Intrinsic::nvvm_rcp_rz_d:

>From 75eb448b653d68b61fdfc1eff17d796a2c3df1a4 Mon Sep 17 00:00:00 2001
From: Lewis Crawford <lcrawford at nvidia.com>
Date: Tue, 1 Jul 2025 13:43:53 +0000
Subject: [PATCH 4/5] Remove returns after llvm_unreachable

Remove "return" statements providing default values
after llvm_unreachable calls in NVVMIntrinsicUtils.h.
---
 llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
index 139e1d4fa48ab..046e60af3192f 100644
--- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -112,7 +112,6 @@ inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
     return false;
   }
   llvm_unreachable("Checking FTZ flag for invalid f2i/d2i intrinsic");
-  return false;
 }
 
 inline bool FPToIntegerIntrinsicResultIsSigned(Intrinsic::ID IntrinsicID) {
@@ -179,7 +178,6 @@ inline bool FPToIntegerIntrinsicResultIsSigned(Intrinsic::ID IntrinsicID) {
   }
   llvm_unreachable(
       "Checking invalid f2i/d2i intrinsic for signed int conversion");
-  return false;
 }
 
 inline APFloat::roundingMode
@@ -250,7 +248,6 @@ GetFPToIntegerRoundingMode(Intrinsic::ID IntrinsicID) {
     return APFloat::rmTowardZero;
   }
   llvm_unreachable("Checking rounding mode for invalid f2i/d2i intrinsic");
-  return APFloat::roundingMode::Invalid;
 }
 
 inline bool FMinFMaxShouldFTZ(Intrinsic::ID IntrinsicID) {
@@ -280,7 +277,6 @@ inline bool FMinFMaxShouldFTZ(Intrinsic::ID IntrinsicID) {
     return false;
   }
   llvm_unreachable("Checking FTZ flag for invalid fmin/fmax intrinsic");
-  return false;
 }
 
 inline bool FMinFMaxPropagatesNaNs(Intrinsic::ID IntrinsicID) {
@@ -310,7 +306,6 @@ inline bool FMinFMaxPropagatesNaNs(Intrinsic::ID IntrinsicID) {
     return false;
   }
   llvm_unreachable("Checking NaN flag for invalid fmin/fmax intrinsic");
-  return false;
 }
 
 inline bool FMinFMaxIsXorSignAbs(Intrinsic::ID IntrinsicID) {
@@ -340,7 +335,6 @@ inline bool FMinFMaxIsXorSignAbs(Intrinsic::ID IntrinsicID) {
     return false;
   }
   llvm_unreachable("Checking XorSignAbs flag for invalid fmin/fmax intrinsic");
-  return false;
 }
 
 inline bool UnaryMathIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
@@ -383,7 +377,6 @@ inline bool UnaryMathIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
     return false;
   }
   llvm_unreachable("Checking FTZ flag for invalid unary intrinsic");
-  return false;
 }
 
 inline bool RCPShouldFTZ(Intrinsic::ID IntrinsicID) {
@@ -406,7 +399,6 @@ inline bool RCPShouldFTZ(Intrinsic::ID IntrinsicID) {
     return false;
   }
   llvm_unreachable("Checking FTZ flag for invalid rcp intrinsic");
-  return false;
 }
 
 inline APFloat::roundingMode GetRCPRoundingMode(Intrinsic::ID IntrinsicID) {
@@ -434,7 +426,6 @@ inline APFloat::roundingMode GetRCPRoundingMode(Intrinsic::ID IntrinsicID) {
     return APFloat::rmTowardZero;
   }
   llvm_unreachable("Checking rounding mode for invalid rcp intrinsic");
-  return APFloat::roundingMode::Invalid;
 }
 
 inline bool RCPIsApprox(Intrinsic::ID IntrinsicID) {
@@ -461,7 +452,6 @@ inline bool RCPIsApprox(Intrinsic::ID IntrinsicID) {
     return false;
   }
   llvm_unreachable("Checking approx flag for invalid rcp intrinsic");
-  return false;
 }
 
 } // namespace nvvm

>From 73783177450a8e71494f292333b714745be0d243 Mon Sep 17 00:00:00 2001
From: Lewis Crawford <lcrawford at nvidia.com>
Date: Wed, 2 Jul 2025 08:07:10 +0000
Subject: [PATCH 5/5] Handle arbitrary denormal in/out modes

Change ConstantFoldFP to accept an optional DenormalMode
parameter, which specifies how denormal inputs and outputs
are handled.

Do not fold when the input or output mode is invalid or dynamic,
and allow denormals to be flushed to either positive or
sign-preserving zero (the input and output modes may differ).

Allow the NVVM intrinsics to be folded with either the
IEEE mode (preserves subnormals), or the sign-preserving-zero
mode depending on whether the FTZ variant of the intrinsic is used.
---
 llvm/include/llvm/IR/NVVMIntrinsicUtils.h |   6 ++
 llvm/lib/Analysis/ConstantFolding.cpp     | 105 +++++++++++++++-------
 2 files changed, 80 insertions(+), 31 deletions(-)

diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
index 046e60af3192f..f71e5c8c517fb 100644
--- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -454,6 +454,12 @@ inline bool RCPIsApprox(Intrinsic::ID IntrinsicID) {
   llvm_unreachable("Checking approx flag for invalid rcp intrinsic");
 }
 
+inline DenormalMode GetNVVMDenromMode(bool ShouldFTZ) {
+  if (ShouldFTZ)
+    return DenormalMode::getPreserveSign();
+  return DenormalMode::getIEEE();
+}
+
 } // namespace nvvm
 } // namespace llvm
 #endif // LLVM_IR_NVVMINTRINSICUTILS_H
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index c28905ac29ece..4e97690e298b9 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2025,12 +2025,6 @@ inline bool llvm_fenv_testexcept() {
   return false;
 }
 
-static const APFloat FTZPreserveSign(const APFloat &V) {
-  if (V.isDenormal())
-    return APFloat::getZero(V.getSemantics(), V.isNegative());
-  return V;
-}
-
 // Get only the upper word of the input double in 1.11.20 format
 // by making the lower 32-bits of the mantissa all 0.
 static const APFloat ZeroLower32Bits(const APFloat &V) {
@@ -2040,10 +2034,44 @@ static const APFloat ZeroLower32Bits(const APFloat &V) {
   return APFloat(V.getSemantics(), APInt(64, DoubleBits, false, false));
 }
 
+static const APFloat FTZPreserveSign(const APFloat &V) {
+  if (V.isDenormal())
+    return APFloat::getZero(V.getSemantics(), V.isNegative());
+  return V;
+}
+
+static const APFloat FlushToPositiveZero(const APFloat &V) {
+  if (V.isDenormal())
+    return APFloat::getZero(V.getSemantics(), false);
+  return V;
+}
+
+static const APFloat
+FlushWithDenormKind(const APFloat &V,
+                    DenormalMode::DenormalModeKind DenormKind) {
+  assert(DenormKind != DenormalMode::DenormalModeKind::Invalid &&
+         DenormKind != DenormalMode::DenormalModeKind::Dynamic);
+  switch (DenormKind) {
+  case DenormalMode::DenormalModeKind::IEEE:
+    return V;
+  case DenormalMode::DenormalModeKind::PreserveSign:
+    return FTZPreserveSign(V);
+  case DenormalMode::DenormalModeKind::PositiveZero:
+    return FlushToPositiveZero(V);
+  default:
+    llvm_unreachable("Invalid denormal mode!");
+  }
+}
+
 Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, Type *Ty,
-                         bool ShouldFTZPreservingSign = false) {
+                         DenormalMode DenormMode = DenormalMode::getIEEE()) {
+  if (!DenormMode.isValid() ||
+      DenormMode.Input == DenormalMode::DenormalModeKind::Dynamic ||
+      DenormMode.Output == DenormalMode::DenormalModeKind::Dynamic)
+    return nullptr;
+
   llvm_fenv_clearexcept();
-  auto Input = ShouldFTZPreservingSign ? FTZPreserveSign(V) : V;
+  auto Input = FlushWithDenormKind(V, DenormMode.Input);
   double Result = NativeFP(Input.convertToDouble());
   if (llvm_fenv_testexcept()) {
     llvm_fenv_clearexcept();
@@ -2051,12 +2079,11 @@ Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, Type *Ty,
   }
 
   Constant *Output = GetConstantFoldFPValue(Result, Ty);
-  if (ShouldFTZPreservingSign) {
-    const auto *CFP = static_cast<ConstantFP *>(Output);
-    return ConstantFP::get(Ty->getContext(),
-                           FTZPreserveSign(CFP->getValueAPF()));
-  }
-  return Output;
+  if (DenormMode.Output == DenormalMode::DenormalModeKind::IEEE)
+    return Output;
+  const auto *CFP = static_cast<ConstantFP *>(Output);
+  const auto Res = FlushWithDenormKind(CFP->getValueAPF(), DenormMode.Output);
+  return ConstantFP::get(Ty->getContext(), Res);
 }
 
 #if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
@@ -2642,38 +2669,48 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       case Intrinsic::nvvm_ceil_ftz_f:
       case Intrinsic::nvvm_ceil_f:
       case Intrinsic::nvvm_ceil_d:
-        return ConstantFoldFP(ceil, APF, Ty,
-                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+        return ConstantFoldFP(
+            ceil, APF, Ty,
+            nvvm::GetNVVMDenromMode(
+                nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
 
       case Intrinsic::nvvm_cos_approx_ftz_f:
       case Intrinsic::nvvm_cos_approx_f:
-        return ConstantFoldFP(cos, APF, Ty,
-                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+        return ConstantFoldFP(
+            cos, APF, Ty,
+            nvvm::GetNVVMDenromMode(
+                nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
 
       case Intrinsic::nvvm_ex2_approx_ftz_f:
       case Intrinsic::nvvm_ex2_approx_d:
       case Intrinsic::nvvm_ex2_approx_f:
-        return ConstantFoldFP(exp2, APF, Ty,
-                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+        return ConstantFoldFP(
+            exp2, APF, Ty,
+            nvvm::GetNVVMDenromMode(
+                (nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))));
 
       case Intrinsic::nvvm_fabs_ftz:
       case Intrinsic::nvvm_fabs:
         return ConstantFoldFP(fabs, APF, Ty,
-                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+                              nvvm::GetNVVMDenromMode(nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
 
       case Intrinsic::nvvm_floor_ftz_f:
       case Intrinsic::nvvm_floor_f:
       case Intrinsic::nvvm_floor_d:
-        return ConstantFoldFP(floor, APF, Ty,
-                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+        return ConstantFoldFP(
+            floor, APF, Ty,
+            nvvm::GetNVVMDenromMode(
+                nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
 
       case Intrinsic::nvvm_lg2_approx_ftz_f:
       case Intrinsic::nvvm_lg2_approx_d:
       case Intrinsic::nvvm_lg2_approx_f: {
         if (APF.isNegative() || APF.isZero())
           return nullptr;
-        return ConstantFoldFP(log2, APF, Ty,
-                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+        return ConstantFoldFP(
+            log2, APF, Ty,
+            nvvm::GetNVVMDenromMode(
+                nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
       }
 
       case Intrinsic::nvvm_rcp_rm_ftz_f:
@@ -2719,8 +2756,10 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       case Intrinsic::nvvm_round_ftz_f:
       case Intrinsic::nvvm_round_f:
       case Intrinsic::nvvm_round_d:
-        return ConstantFoldFP(round, APF, Ty,
-                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+        return ConstantFoldFP(
+            round, APF, Ty,
+            nvvm::GetNVVMDenromMode(
+                nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
 
       case Intrinsic::nvvm_rsqrt_approx_ftz_d:
       case Intrinsic::nvvm_rsqrt_approx_ftz_f:
@@ -2768,8 +2807,10 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
 
       case Intrinsic::nvvm_sin_approx_ftz_f:
       case Intrinsic::nvvm_sin_approx_f:
-        return ConstantFoldFP(sin, APF, Ty,
-                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+        return ConstantFoldFP(
+            sin, APF, Ty,
+            nvvm::GetNVVMDenromMode(
+                nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
 
       case Intrinsic::nvvm_sqrt_rn_ftz_f:
       case Intrinsic::nvvm_sqrt_approx_ftz_f:
@@ -2779,8 +2820,10 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       case Intrinsic::nvvm_sqrt_approx_f:
         if (APF.isNegative())
           return nullptr;
-        return ConstantFoldFP(sqrt, APF, Ty,
-                              nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID));
+        return ConstantFoldFP(
+            sqrt, APF, Ty,
+            nvvm::GetNVVMDenromMode(
+                nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID)));
 
       // AMDGCN Intrinsics:
       case Intrinsic::amdgcn_cos: