[libclc] Refine __clc_fp*_subnormals_supported and __clc_flush_denormal_if_not_supported (PR #157633)

Fri Oct 3 00:04:23 PDT 2025

https://github.com/wenju-he updated https://github.com/llvm/llvm-project/pull/157633

>From 7e2d210d9c6cd20c342562a44c2e4d2cb238e229 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Tue, 9 Sep 2025 11:05:02 +0200
Subject: [PATCH 1/8] [libclc] Refine __clc_fp*_subnormals_supported and
 __clc_flush_denormal_if_not_supported

Remove the dependency on the libclc build-time configuration for
__clc_fp*_subnormals_supported. The check is now implemented with LLVM
intrinsics so it can be resolved during target lowering or at runtime.

Improve the __clc_flush_denormal_if_not_supported implementation as well.
It no longer calls __clc_fp*_subnormals_supported, which relies on
llvm.canonicalize and therefore quiets sNaN; as a result, the new
implementation is easier to constant-fold.

Remove cmake option ENABLE_RUNTIME_SUBNORMAL and related code.

Resolves #153148

Co-authored-by: Matt Arsenault <Matthew.Arsenault at amd.com>
---
 libclc/CMakeLists.txt                         | 18 --------
 .../include/clc/math/clc_subnormal_config.h   |  1 -
 libclc/clc/include/clc/math/math.h            | 13 ++----
 libclc/clc/lib/generic/SOURCES                |  1 +
 libclc/clc/lib/generic/math/clc_exp10.cl      |  1 -
 libclc/clc/lib/generic/math/clc_hypot.cl      |  1 -
 libclc/clc/lib/generic/math/clc_pow.cl        |  1 -
 libclc/clc/lib/generic/math/clc_pown.cl       |  1 -
 libclc/clc/lib/generic/math/clc_powr.cl       |  1 -
 libclc/clc/lib/generic/math/clc_remquo.cl     |  1 -
 .../lib/generic/math/clc_subnormal_config.cl  | 46 +++++++++++++++++++
 libclc/opencl/lib/clspv/SOURCES               |  1 -
 libclc/opencl/lib/clspv/subnormal_config.cl   | 16 -------
 libclc/opencl/lib/generic/SOURCES             |  2 -
 libclc/opencl/lib/generic/subnormal_config.cl | 18 --------
 .../opencl/lib/generic/subnormal_disable.ll   |  9 ----
 .../lib/generic/subnormal_helper_func.ll      | 16 -------
 .../lib/generic/subnormal_use_default.ll      |  9 ----
 libclc/opencl/lib/spirv/SOURCES               |  1 -
 libclc/opencl/lib/spirv/subnormal_config.cl   | 16 -------
 20 files changed, 52 insertions(+), 121 deletions(-)
 create mode 100644 libclc/clc/lib/generic/math/clc_subnormal_config.cl
 delete mode 100644 libclc/opencl/lib/clspv/subnormal_config.cl
 delete mode 100644 libclc/opencl/lib/generic/subnormal_config.cl
 delete mode 100644 libclc/opencl/lib/generic/subnormal_disable.ll
 delete mode 100644 libclc/opencl/lib/generic/subnormal_helper_func.ll
 delete mode 100644 libclc/opencl/lib/generic/subnormal_use_default.ll
 delete mode 100644 libclc/opencl/lib/spirv/subnormal_config.cl

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index c75f450d8d3ad..572556034e66c 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -41,8 +41,6 @@ set( LIBCLC_MIN_LLVM 3.9.0 )
 set( LIBCLC_TARGETS_TO_BUILD "all"
     CACHE STRING "Semicolon-separated list of libclc targets to build, or 'all'." )
 
-option( ENABLE_RUNTIME_SUBNORMAL "Enable runtime linking of subnormal support." OFF )
-
 option(
   LIBCLC_USE_SPIRV_BACKEND "Build SPIR-V targets with the SPIR-V backend." OFF
 )
@@ -231,19 +229,6 @@ set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii
 configure_file( libclc.pc.in libclc.pc @ONLY )
 install( FILES ${CMAKE_CURRENT_BINARY_DIR}/libclc.pc DESTINATION "${CMAKE_INSTALL_DATADIR}/pkgconfig" )
 
-if( ENABLE_RUNTIME_SUBNORMAL )
-  foreach( file IN ITEMS subnormal_use_default subnormal_disable )
-    link_bc(
-       TARGET ${file}
-       INPUTS ${CMAKE_CURRENT_SOURCE_DIR}/opencl/lib/generic/${file}.ll
-    )
-    install(
-      FILES $<TARGET_PROPERTY:${file},TARGET_FILE>
-      DESTINATION "${CMAKE_INSTALL_DATADIR}/clc"
-    )
-  endforeach()
-endif()
-
 find_package( Python3 REQUIRED COMPONENTS Interpreter )
 file( TO_CMAKE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/utils/gen_convert.py script_loc )
 add_custom_command(
@@ -371,9 +356,6 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
       list( APPEND opencl_gen_files clspv-convert.cl )
     else()
       list( APPEND opencl_gen_files convert.cl )
-      if ( NOT ENABLE_RUNTIME_SUBNORMAL )
-        list( APPEND opencl_lib_files opencl/lib/generic/subnormal_use_default.ll )
-      endif()
     endif()
   endif()
 
diff --git a/libclc/clc/include/clc/math/clc_subnormal_config.h b/libclc/clc/include/clc/math/clc_subnormal_config.h
index 14693ed01e033..e44ec1958b101 100644
--- a/libclc/clc/include/clc/math/clc_subnormal_config.h
+++ b/libclc/clc/include/clc/math/clc_subnormal_config.h
@@ -10,7 +10,6 @@
 
 #include <clc/clcfunc.h>
 
-_CLC_DECL bool __clc_subnormals_disabled();
 _CLC_DECL bool __clc_fp16_subnormals_supported();
 _CLC_DECL bool __clc_fp32_subnormals_supported();
 _CLC_DECL bool __clc_fp64_subnormals_supported();
diff --git a/libclc/clc/include/clc/math/math.h b/libclc/clc/include/clc/math/math.h
index c2647f66b4006..2db5d187c88ce 100644
--- a/libclc/clc/include/clc/math/math.h
+++ b/libclc/clc/include/clc/math/math.h
@@ -11,7 +11,6 @@
 
 #include <clc/clc_as_type.h>
 #include <clc/clcfunc.h>
-#include <clc/math/clc_subnormal_config.h>
 
 #define SNAN 0x001
 #define QNAN 0x002
@@ -66,13 +65,11 @@ bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void);
 #define LOG_MAGIC_NUM_SP32 (1 + NUMEXPBITS_SP32 - EXPBIAS_SP32)
 
 _CLC_OVERLOAD _CLC_INLINE float __clc_flush_denormal_if_not_supported(float x) {
-  int ix = __clc_as_int(x);
-  if (!__clc_fp32_subnormals_supported() && ((ix & EXPBITS_SP32) == 0) &&
-      ((ix & MANTBITS_SP32) != 0)) {
-    ix &= SIGNBIT_SP32;
-    x = __clc_as_float(ix);
-  }
-  return x;
+  // Avoid calling __clc_fp32_subnormals_supported here: it uses
+  // llvm.canonicalize, which quiets sNaN.
+  return __builtin_fabsf(x) < 0x1p-149f
+             ? __builtin_elementwise_copysign(0.0f, x)
+             : x;
 }
 
 #ifdef cl_khr_fp64
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index ee4f771799e8e..4a6dadc702033 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -137,6 +137,7 @@ math/clc_sincos_helpers.cl
 math/clc_sinh.cl
 math/clc_sinpi.cl
 math/clc_sqrt.cl
+math/clc_subnormal_config.cl
 math/clc_sw_fma.cl
 math/clc_tables.cl
 math/clc_tan.cl
diff --git a/libclc/clc/lib/generic/math/clc_exp10.cl b/libclc/clc/lib/generic/math/clc_exp10.cl
index 0c394ee19475a..fb33367851fda 100644
--- a/libclc/clc/lib/generic/math/clc_exp10.cl
+++ b/libclc/clc/lib/generic/math/clc_exp10.cl
@@ -11,7 +11,6 @@
 #include <clc/math/clc_fma.h>
 #include <clc/math/clc_ldexp.h>
 #include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
 #include <clc/math/tables.h>
 #include <clc/relational/clc_isnan.h>
diff --git a/libclc/clc/lib/generic/math/clc_hypot.cl b/libclc/clc/lib/generic/math/clc_hypot.cl
index c934ab29da91b..fd046bccaed51 100644
--- a/libclc/clc/lib/generic/math/clc_hypot.cl
+++ b/libclc/clc/lib/generic/math/clc_hypot.cl
@@ -12,7 +12,6 @@
 #include <clc/math/clc_fma.h>
 #include <clc/math/clc_mad.h>
 #include <clc/math/clc_sqrt.h>
-#include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
 #include <clc/relational/clc_isnan.h>
 #include <clc/shared/clc_clamp.h>
diff --git a/libclc/clc/lib/generic/math/clc_pow.cl b/libclc/clc/lib/generic/math/clc_pow.cl
index 70d3d614a8d36..c20d3829ea076 100644
--- a/libclc/clc/lib/generic/math/clc_pow.cl
+++ b/libclc/clc/lib/generic/math/clc_pow.cl
@@ -12,7 +12,6 @@
 #include <clc/math/clc_fma.h>
 #include <clc/math/clc_ldexp.h>
 #include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
 #include <clc/math/tables.h>
 #include <clc/relational/clc_select.h>
diff --git a/libclc/clc/lib/generic/math/clc_pown.cl b/libclc/clc/lib/generic/math/clc_pown.cl
index 5aa9560174b99..cfc415753fd1a 100644
--- a/libclc/clc/lib/generic/math/clc_pown.cl
+++ b/libclc/clc/lib/generic/math/clc_pown.cl
@@ -12,7 +12,6 @@
 #include <clc/math/clc_fma.h>
 #include <clc/math/clc_ldexp.h>
 #include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
 #include <clc/math/tables.h>
 #include <clc/relational/clc_select.h>
diff --git a/libclc/clc/lib/generic/math/clc_powr.cl b/libclc/clc/lib/generic/math/clc_powr.cl
index 0556ec97d6f3c..c35a3e2c382c5 100644
--- a/libclc/clc/lib/generic/math/clc_powr.cl
+++ b/libclc/clc/lib/generic/math/clc_powr.cl
@@ -12,7 +12,6 @@
 #include <clc/math/clc_fma.h>
 #include <clc/math/clc_ldexp.h>
 #include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
 #include <clc/math/math.h>
 #include <clc/math/tables.h>
 #include <clc/relational/clc_select.h>
diff --git a/libclc/clc/lib/generic/math/clc_remquo.cl b/libclc/clc/lib/generic/math/clc_remquo.cl
index fd83ead06d89a..cdebe4922baa0 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.cl
+++ b/libclc/clc/lib/generic/math/clc_remquo.cl
@@ -12,7 +12,6 @@
 #include <clc/math/clc_floor.h>
 #include <clc/math/clc_fma.h>
 #include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_subnormal_config.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/math/math.h>
 #include <clc/shared/clc_max.h>
diff --git a/libclc/clc/lib/generic/math/clc_subnormal_config.cl b/libclc/clc/lib/generic/math/clc_subnormal_config.cl
new file mode 100644
index 0000000000000..6be6eb44ae4f4
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_subnormal_config.cl
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/internal/clc.h>
+#include <clc/math/clc_subnormal_config.h>
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+_CLC_DEF bool __clc_fp16_subnormals_supported() {
+#ifdef CLC_SPIRV
+  // SPIR-V doesn't support llvm.canonicalize for now.
+  return false;
+#else
+  return !__builtin_isfpclass(__builtin_canonicalizef(0x1p-24h),
+                              __FPCLASS_POSZERO);
+#endif
+}
+#endif // cl_khr_fp16
+
+_CLC_DEF bool __clc_fp32_subnormals_supported() {
+#ifdef CLC_SPIRV
+  // SPIR-V doesn't support llvm.canonicalize for now.
+  return false;
+#else
+  return !__builtin_isfpclass(__builtin_canonicalizef(0x1p-149f),
+                              __FPCLASS_POSZERO);
+#endif
+}
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+_CLC_DEF bool __clc_fp64_subnormals_supported() {
+#ifdef CLC_SPIRV
+  // SPIR-V doesn't support llvm.canonicalize for now.
+  return false;
+#else
+  return !__builtin_isfpclass(__builtin_canonicalizef(0x1p-1074),
+                              __FPCLASS_POSZERO);
+#endif
+}
+#endif // cl_khr_fp64
diff --git a/libclc/opencl/lib/clspv/SOURCES b/libclc/opencl/lib/clspv/SOURCES
index 0a142ed3e6043..3d9f871ff57ca 100644
--- a/libclc/opencl/lib/clspv/SOURCES
+++ b/libclc/opencl/lib/clspv/SOURCES
@@ -1,6 +1,5 @@
 math/fma.cl
 shared/vstore_half.cl
-subnormal_config.cl
 ../generic/geometric/distance.cl
 ../generic/geometric/length.cl
 ../generic/math/acos.cl
diff --git a/libclc/opencl/lib/clspv/subnormal_config.cl b/libclc/opencl/lib/clspv/subnormal_config.cl
deleted file mode 100644
index 114aabb2e9435..0000000000000
--- a/libclc/opencl/lib/clspv/subnormal_config.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/opencl/opencl-base.h>
-
-_CLC_DEF bool __clc_fp16_subnormals_supported() { return false; }
-
-_CLC_DEF bool __clc_fp32_subnormals_supported() { return false; }
-
-_CLC_DEF bool __clc_fp64_subnormals_supported() { return false; }
diff --git a/libclc/opencl/lib/generic/SOURCES b/libclc/opencl/lib/generic/SOURCES
index 61757efbcaad7..410fbdee2c71f 100644
--- a/libclc/opencl/lib/generic/SOURCES
+++ b/libclc/opencl/lib/generic/SOURCES
@@ -1,5 +1,3 @@
-subnormal_config.cl
-subnormal_helper_func.ll
 async/async_work_group_copy.cl
 async/async_work_group_strided_copy.cl
 async/prefetch.cl
diff --git a/libclc/opencl/lib/generic/subnormal_config.cl b/libclc/opencl/lib/generic/subnormal_config.cl
deleted file mode 100644
index aa2e30935e5f0..0000000000000
--- a/libclc/opencl/lib/generic/subnormal_config.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/opencl/opencl-base.h>
-
-_CLC_DEF bool __clc_fp16_subnormals_supported() { return false; }
-
-_CLC_DEF bool __clc_fp32_subnormals_supported() { return false; }
-
-_CLC_DEF bool __clc_fp64_subnormals_supported() {
-  return !__clc_subnormals_disabled();
-}
diff --git a/libclc/opencl/lib/generic/subnormal_disable.ll b/libclc/opencl/lib/generic/subnormal_disable.ll
deleted file mode 100644
index 732d09ff09ab4..0000000000000
--- a/libclc/opencl/lib/generic/subnormal_disable.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-;;===----------------------------------------------------------------------===;;
-;
-; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-; See https://llvm.org/LICENSE.txt for license information.
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-;
-;;===----------------------------------------------------------------------===;;
-
-@__CLC_SUBNORMAL_DISABLE = unnamed_addr constant i1 true
diff --git a/libclc/opencl/lib/generic/subnormal_helper_func.ll b/libclc/opencl/lib/generic/subnormal_helper_func.ll
deleted file mode 100644
index 03beecf979260..0000000000000
--- a/libclc/opencl/lib/generic/subnormal_helper_func.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-;;===----------------------------------------------------------------------===;;
-;
-; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-; See https://llvm.org/LICENSE.txt for license information.
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-;
-;;===----------------------------------------------------------------------===;;
-
-@__CLC_SUBNORMAL_DISABLE = external global i1
-
-define i1 @__clc_subnormals_disabled() #0 {
-  %disable = load i1, i1* @__CLC_SUBNORMAL_DISABLE
-  ret i1 %disable
-}
-
-attributes #0 = { alwaysinline }
diff --git a/libclc/opencl/lib/generic/subnormal_use_default.ll b/libclc/opencl/lib/generic/subnormal_use_default.ll
deleted file mode 100644
index c648cc0a8aded..0000000000000
--- a/libclc/opencl/lib/generic/subnormal_use_default.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-;;===----------------------------------------------------------------------===;;
-;
-; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-; See https://llvm.org/LICENSE.txt for license information.
-; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-;
-;;===----------------------------------------------------------------------===;;
-
-@__CLC_SUBNORMAL_DISABLE = unnamed_addr constant i1 false
diff --git a/libclc/opencl/lib/spirv/SOURCES b/libclc/opencl/lib/spirv/SOURCES
index 0aa923978e9f1..aa7fcee0c4f4a 100644
--- a/libclc/opencl/lib/spirv/SOURCES
+++ b/libclc/opencl/lib/spirv/SOURCES
@@ -1,4 +1,3 @@
-subnormal_config.cl
 ../generic/async/async_work_group_strided_copy.cl
 ../generic/async/wait_group_events.cl
 ../generic/common/degrees.cl
diff --git a/libclc/opencl/lib/spirv/subnormal_config.cl b/libclc/opencl/lib/spirv/subnormal_config.cl
deleted file mode 100644
index 114aabb2e9435..0000000000000
--- a/libclc/opencl/lib/spirv/subnormal_config.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/opencl/opencl-base.h>
-
-_CLC_DEF bool __clc_fp16_subnormals_supported() { return false; }
-
-_CLC_DEF bool __clc_fp32_subnormals_supported() { return false; }
-
-_CLC_DEF bool __clc_fp64_subnormals_supported() { return false; }

>From 96ec9dc17b214839a8f0fcc58b7dae14f369b839 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Tue, 9 Sep 2025 17:18:40 +0800
Subject: [PATCH 2/8] Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
 libclc/clc/lib/generic/math/clc_subnormal_config.cl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libclc/clc/lib/generic/math/clc_subnormal_config.cl b/libclc/clc/lib/generic/math/clc_subnormal_config.cl
index 6be6eb44ae4f4..dd019789a99cc 100644
--- a/libclc/clc/lib/generic/math/clc_subnormal_config.cl
+++ b/libclc/clc/lib/generic/math/clc_subnormal_config.cl
@@ -16,7 +16,7 @@ _CLC_DEF bool __clc_fp16_subnormals_supported() {
   // SPIR-V doesn't support llvm.canonicalize for now.
   return false;
 #else
-  return !__builtin_isfpclass(__builtin_canonicalizef(0x1p-24h),
+  return !__builtin_isfpclass(__builtin_canonicalizef((float)0x1p-24h),
                               __FPCLASS_POSZERO);
 #endif
 }
@@ -39,7 +39,7 @@ _CLC_DEF bool __clc_fp64_subnormals_supported() {
   // SPIR-V doesn't support llvm.canonicalize for now.
   return false;
 #else
-  return !__builtin_isfpclass(__builtin_canonicalizef(0x1p-1074),
+  return !__builtin_isfpclass(__builtin_canonicalize(0x1p-1074),
                               __FPCLASS_POSZERO);
 #endif
 }

>From d52fcdb027f8b45019bc7904aca78d0ff90bf48d Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Tue, 9 Sep 2025 12:25:27 +0200
Subject: [PATCH 3/8] use __builtin_elementwise_canonicalize

---
 libclc/clc/lib/generic/math/clc_subnormal_config.cl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libclc/clc/lib/generic/math/clc_subnormal_config.cl b/libclc/clc/lib/generic/math/clc_subnormal_config.cl
index dd019789a99cc..55e28a35f6ce8 100644
--- a/libclc/clc/lib/generic/math/clc_subnormal_config.cl
+++ b/libclc/clc/lib/generic/math/clc_subnormal_config.cl
@@ -16,7 +16,7 @@ _CLC_DEF bool __clc_fp16_subnormals_supported() {
   // SPIR-V doesn't support llvm.canonicalize for now.
   return false;
 #else
-  return !__builtin_isfpclass(__builtin_canonicalizef((float)0x1p-24h),
+  return !__builtin_isfpclass(__builtin_elementwise_canonicalize(0x1p-24h),
                               __FPCLASS_POSZERO);
 #endif
 }
@@ -27,7 +27,7 @@ _CLC_DEF bool __clc_fp32_subnormals_supported() {
   // SPIR-V doesn't support llvm.canonicalize for now.
   return false;
 #else
-  return !__builtin_isfpclass(__builtin_canonicalizef(0x1p-149f),
+  return !__builtin_isfpclass(__builtin_elementwise_canonicalize(0x1p-149f),
                               __FPCLASS_POSZERO);
 #endif
 }
@@ -39,7 +39,7 @@ _CLC_DEF bool __clc_fp64_subnormals_supported() {
   // SPIR-V doesn't support llvm.canonicalize for now.
   return false;
 #else
-  return !__builtin_isfpclass(__builtin_canonicalize(0x1p-1074),
+  return !__builtin_isfpclass(__builtin_elementwise_canonicalize(0x1p-1074),
                               __FPCLASS_POSZERO);
 #endif
 }

>From 4608f77d4bc81ec584e5c3c2d19b446ffc919a2f Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Tue, 9 Sep 2025 12:40:49 +0200
Subject: [PATCH 4/8] set -fdenormal-fp-math-f32=dynamic build flag globally

---
 libclc/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 572556034e66c..2447d97d2b624 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -412,6 +412,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
       # Error on undefined macros
       -Werror=undef
       -fdiscard-value-names
+      -Xclang -fdenormal-fp-math-f32=dynamic
     )
 
     if( NOT "${cpu}" STREQUAL "" )

>From 3f665ce5b145ca5ab38a95e6cb145064e8c9dbfe Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Thu, 11 Sep 2025 01:02:12 +0200
Subject: [PATCH 5/8] rename __clc_flush_denormal_if_not_supported to
 __clc_soft_flush_denormal

---
 libclc/clc/include/clc/math/math.h         | 4 ++--
 libclc/clc/lib/clspv/math/clc_sw_fma.cl    | 6 +++---
 libclc/clc/lib/generic/math/clc_remquo.inc | 4 ++--
 libclc/clc/lib/generic/math/clc_sw_fma.cl  | 6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/libclc/clc/include/clc/math/math.h b/libclc/clc/include/clc/math/math.h
index 2db5d187c88ce..b43f6a0f4c993 100644
--- a/libclc/clc/include/clc/math/math.h
+++ b/libclc/clc/include/clc/math/math.h
@@ -64,10 +64,10 @@ bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void);
 
 #define LOG_MAGIC_NUM_SP32 (1 + NUMEXPBITS_SP32 - EXPBIAS_SP32)
 
-_CLC_OVERLOAD _CLC_INLINE float __clc_flush_denormal_if_not_supported(float x) {
+_CLC_OVERLOAD _CLC_INLINE float __clc_soft_flush_denormal(float x) {
   // Avoid calling __clc_fp32_subnormals_supported here: it uses
   // llvm.canonicalize, which quiets sNaN.
-  return __builtin_fabsf(x) < 0x1p-149f
+  return __builtin_elementwise_abs(x) < 0x1p-149f
              ? __builtin_elementwise_copysign(0.0f, x)
              : x;
 }
diff --git a/libclc/clc/lib/clspv/math/clc_sw_fma.cl b/libclc/clc/lib/clspv/math/clc_sw_fma.cl
index c28b9441b05ff..e67456b7f7ebc 100644
--- a/libclc/clc/lib/clspv/math/clc_sw_fma.cl
+++ b/libclc/clc/lib/clspv/math/clc_sw_fma.cl
@@ -127,9 +127,9 @@ _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
     return c;
   }
 
-  a = __clc_flush_denormal_if_not_supported(a);
-  b = __clc_flush_denormal_if_not_supported(b);
-  c = __clc_flush_denormal_if_not_supported(c);
+  a = __clc_soft_flush_denormal(a);
+  b = __clc_soft_flush_denormal(b);
+  c = __clc_soft_flush_denormal(c);
 
   if (a == 0.0f || b == 0.0f) {
     return c;
diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc
index 3a76ffed7f039..681020f501d65 100644
--- a/libclc/clc/lib/generic/math/clc_remquo.inc
+++ b/libclc/clc/lib/generic/math/clc_remquo.inc
@@ -8,8 +8,8 @@
 
 _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
                                           __CLC_ADDRESS_SPACE int *quo) {
-  x = __clc_flush_denormal_if_not_supported(x);
-  y = __clc_flush_denormal_if_not_supported(y);
+  x = __clc_soft_flush_denormal(x);
+  y = __clc_soft_flush_denormal(y);
   int ux = __clc_as_int(x);
   int ax = ux & EXSIGNBIT_SP32;
   float xa = __clc_as_float(ax);
diff --git a/libclc/clc/lib/generic/math/clc_sw_fma.cl b/libclc/clc/lib/generic/math/clc_sw_fma.cl
index 606e4df320a89..e8bf673b50f36 100644
--- a/libclc/clc/lib/generic/math/clc_sw_fma.cl
+++ b/libclc/clc/lib/generic/math/clc_sw_fma.cl
@@ -36,9 +36,9 @@ _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
     return c;
   }
 
-  a = __clc_flush_denormal_if_not_supported(a);
-  b = __clc_flush_denormal_if_not_supported(b);
-  c = __clc_flush_denormal_if_not_supported(c);
+  a = __clc_soft_flush_denormal(a);
+  b = __clc_soft_flush_denormal(b);
+  c = __clc_soft_flush_denormal(c);
 
   if (c == 0) {
     return a * b;

>From 7b290a24e3662f203160e8df33421eb3928dd475 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Fri, 3 Oct 2025 07:46:43 +0200
Subject: [PATCH 6/8] delete clc_sw_fma

---
 .../include/clc/internal/math/clc_sw_fma.h    |  19 --
 libclc/clc/include/clc/math/math.h            |   9 -
 libclc/clc/lib/clspv/SOURCES                  |   1 -
 libclc/clc/lib/clspv/math/clc_sw_fma.cl       | 274 ------------------
 libclc/clc/lib/generic/SOURCES                |   1 -
 libclc/clc/lib/generic/math/clc_fma.inc       |   4 -
 .../lib/generic/math/clc_sincos_helpers.inc   |  32 +-
 libclc/clc/lib/generic/math/clc_sw_fma.cl     | 165 -----------
 libclc/clc/lib/spirv/SOURCES                  |   1 -
 .../spirv/math/clc_runtime_has_hw_fma32.cl    |   9 -
 libclc/opencl/lib/clspv/math/fma.cl           |   2 +-
 libclc/opencl/lib/spirv/math/fma.cl           |   2 +-
 12 files changed, 7 insertions(+), 512 deletions(-)
 delete mode 100644 libclc/clc/include/clc/internal/math/clc_sw_fma.h
 delete mode 100644 libclc/clc/lib/clspv/math/clc_sw_fma.cl
 delete mode 100644 libclc/clc/lib/generic/math/clc_sw_fma.cl
 delete mode 100644 libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl

diff --git a/libclc/clc/include/clc/internal/math/clc_sw_fma.h b/libclc/clc/include/clc/internal/math/clc_sw_fma.h
deleted file mode 100644
index 5d6c76879ceb9..0000000000000
--- a/libclc/clc/include/clc/internal/math/clc_sw_fma.h
+++ /dev/null
@@ -1,19 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
-#define __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
-
-#define __CLC_FUNCTION __clc_sw_fma
-#define __CLC_BODY <clc/shared/ternary_decl.inc>
-
-#include <clc/math/gentype.inc>
-
-#undef __CLC_FUNCTION
-
-#endif // __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
diff --git a/libclc/clc/include/clc/math/math.h b/libclc/clc/include/clc/math/math.h
index b43f6a0f4c993..cc4eb4ce3ec76 100644
--- a/libclc/clc/include/clc/math/math.h
+++ b/libclc/clc/include/clc/math/math.h
@@ -23,15 +23,6 @@
 #define PNOR 0x100
 #define PINF 0x200
 
-#if (defined __AMDGCN__ || defined __R600__) && !defined __HAS_FMAF__
-#define __CLC_HAVE_HW_FMA32() (0)
-#elif defined(CLC_SPIRV)
-bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void);
-#define __CLC_HAVE_HW_FMA32() __clc_runtime_has_hw_fma32()
-#else
-#define __CLC_HAVE_HW_FMA32() (1)
-#endif
-
 #define HAVE_BITALIGN() (0)
 #define HAVE_FAST_FMA32() (0)
 
diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES
index b91b0e70a397d..2faea79cbc0bf 100644
--- a/libclc/clc/lib/clspv/SOURCES
+++ b/libclc/clc/lib/clspv/SOURCES
@@ -1,2 +1 @@
-math/clc_sw_fma.cl
 integer/clc_mul_hi.cl
diff --git a/libclc/clc/lib/clspv/math/clc_sw_fma.cl b/libclc/clc/lib/clspv/math/clc_sw_fma.cl
deleted file mode 100644
index e67456b7f7ebc..0000000000000
--- a/libclc/clc/lib/clspv/math/clc_sw_fma.cl
+++ /dev/null
@@ -1,274 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// This version is derived from the generic fma software implementation
-// (__clc_sw_fma), but avoids the use of ulong in favor of uint2. The logic has
-// been updated as appropriate.
-
-#include <clc/clc_as_type.h>
-#include <clc/float/definitions.h>
-#include <clc/integer/clc_abs.h>
-#include <clc/integer/clc_clz.h>
-#include <clc/integer/clc_hadd.h>
-#include <clc/integer/clc_mul_hi.h>
-#include <clc/integer/definitions.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/math.h>
-#include <clc/relational/clc_isinf.h>
-#include <clc/relational/clc_isnan.h>
-#include <clc/shared/clc_max.h>
-
-struct fp {
-  uint2 mantissa;
-  int exponent;
-  uint sign;
-};
-
-static uint2 u2_set(uint hi, uint lo) {
-  uint2 res;
-  res.lo = lo;
-  res.hi = hi;
-  return res;
-}
-
-static uint2 u2_set_u(uint val) { return u2_set(0, val); }
-
-static uint2 u2_mul(uint a, uint b) {
-  uint2 res;
-  res.hi = __clc_mul_hi(a, b);
-  res.lo = a * b;
-  return res;
-}
-
-static uint2 u2_sll(uint2 val, uint shift) {
-  if (shift == 0)
-    return val;
-  if (shift < 32) {
-    val.hi <<= shift;
-    val.hi |= val.lo >> (32 - shift);
-    val.lo <<= shift;
-  } else {
-    val.hi = val.lo << (shift - 32);
-    val.lo = 0;
-  }
-  return val;
-}
-
-static uint2 u2_srl(uint2 val, uint shift) {
-  if (shift == 0)
-    return val;
-  if (shift < 32) {
-    val.lo >>= shift;
-    val.lo |= val.hi << (32 - shift);
-    val.hi >>= shift;
-  } else {
-    val.lo = val.hi >> (shift - 32);
-    val.hi = 0;
-  }
-  return val;
-}
-
-static uint2 u2_or(uint2 a, uint b) {
-  a.lo |= b;
-  return a;
-}
-
-static uint2 u2_and(uint2 a, uint2 b) {
-  a.lo &= b.lo;
-  a.hi &= b.hi;
-  return a;
-}
-
-static uint2 u2_add(uint2 a, uint2 b) {
-  uint carry = (__clc_hadd(a.lo, b.lo) >> 31) & 0x1;
-  a.lo += b.lo;
-  a.hi += b.hi + carry;
-  return a;
-}
-
-static uint2 u2_add_u(uint2 a, uint b) { return u2_add(a, u2_set_u(b)); }
-
-static uint2 u2_inv(uint2 a) {
-  a.lo = ~a.lo;
-  a.hi = ~a.hi;
-  return u2_add_u(a, 1);
-}
-
-static uint u2_clz(uint2 a) {
-  uint leading_zeroes = __clc_clz(a.hi);
-  if (leading_zeroes == 32) {
-    leading_zeroes += __clc_clz(a.lo);
-  }
-  return leading_zeroes;
-}
-
-static bool u2_eq(uint2 a, uint2 b) { return a.lo == b.lo && a.hi == b.hi; }
-
-static bool u2_zero(uint2 a) { return u2_eq(a, u2_set_u(0)); }
-
-static bool u2_gt(uint2 a, uint2 b) {
-  return a.hi > b.hi || (a.hi == b.hi && a.lo > b.lo);
-}
-
-_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
-  /* special cases */
-  if (__clc_isnan(a) || __clc_isnan(b) || __clc_isnan(c) || __clc_isinf(a) ||
-      __clc_isinf(b)) {
-    return __clc_mad(a, b, c);
-  }
-
-  /* If only c is inf, and both a,b are regular numbers, the result is c*/
-  if (__clc_isinf(c)) {
-    return c;
-  }
-
-  a = __clc_soft_flush_denormal(a);
-  b = __clc_soft_flush_denormal(b);
-  c = __clc_soft_flush_denormal(c);
-
-  if (a == 0.0f || b == 0.0f) {
-    return c;
-  }
-
-  if (c == 0) {
-    return a * b;
-  }
-
-  struct fp st_a, st_b, st_c;
-
-  st_a.exponent = a == .0f ? 0 : ((__clc_as_uint(a) & 0x7f800000) >> 23) - 127;
-  st_b.exponent = b == .0f ? 0 : ((__clc_as_uint(b) & 0x7f800000) >> 23) - 127;
-  st_c.exponent = c == .0f ? 0 : ((__clc_as_uint(c) & 0x7f800000) >> 23) - 127;
-
-  st_a.mantissa =
-      u2_set_u(a == .0f ? 0 : (__clc_as_uint(a) & 0x7fffff) | 0x800000);
-  st_b.mantissa =
-      u2_set_u(b == .0f ? 0 : (__clc_as_uint(b) & 0x7fffff) | 0x800000);
-  st_c.mantissa =
-      u2_set_u(c == .0f ? 0 : (__clc_as_uint(c) & 0x7fffff) | 0x800000);
-
-  st_a.sign = __clc_as_uint(a) & 0x80000000;
-  st_b.sign = __clc_as_uint(b) & 0x80000000;
-  st_c.sign = __clc_as_uint(c) & 0x80000000;
-
-  // Multiplication.
-  // Move the product to the highest bits to maximize precision
-  // mantissa is 24 bits => product is 48 bits, 2bits non-fraction.
-  // Add one bit for future addition overflow,
-  // add another bit to detect subtraction underflow
-  struct fp st_mul;
-  st_mul.sign = st_a.sign ^ st_b.sign;
-  st_mul.mantissa = u2_sll(u2_mul(st_a.mantissa.lo, st_b.mantissa.lo), 14);
-  st_mul.exponent =
-      !u2_zero(st_mul.mantissa) ? st_a.exponent + st_b.exponent : 0;
-
-  // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
-  if (st_mul.exponent == 0 && u2_zero(st_mul.mantissa))
-    return c;
-
-// Mantissa is 23 fractional bits, shift it the same way as product mantissa
-#define C_ADJUST 37ul
-
-  // both exponents are bias adjusted
-  int exp_diff = st_mul.exponent - st_c.exponent;
-
-  st_c.mantissa = u2_sll(st_c.mantissa, C_ADJUST);
-  uint2 cutoff_bits = u2_set_u(0);
-  uint2 cutoff_mask = u2_add(u2_sll(u2_set_u(1), __clc_abs(exp_diff)),
-                             u2_set(0xffffffff, 0xffffffff));
-  if (exp_diff > 0) {
-    cutoff_bits =
-        exp_diff >= 64 ? st_c.mantissa : u2_and(st_c.mantissa, cutoff_mask);
-    st_c.mantissa =
-        exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_c.mantissa, exp_diff);
-  } else {
-    cutoff_bits = -exp_diff >= 64 ? st_mul.mantissa
-                                  : u2_and(st_mul.mantissa, cutoff_mask);
-    st_mul.mantissa =
-        -exp_diff >= 64 ? u2_set_u(0) : u2_srl(st_mul.mantissa, -exp_diff);
-  }
-
-  struct fp st_fma;
-  st_fma.sign = st_mul.sign;
-  st_fma.exponent = __clc_max(st_mul.exponent, st_c.exponent);
-  if (st_c.sign == st_mul.sign) {
-    st_fma.mantissa = u2_add(st_mul.mantissa, st_c.mantissa);
-  } else {
-    // cutoff bits borrow one
-    st_fma.mantissa =
-        u2_add(u2_add(st_mul.mantissa, u2_inv(st_c.mantissa)),
-               (!u2_zero(cutoff_bits) && (st_mul.exponent > st_c.exponent)
-                    ? u2_set(0xffffffff, 0xffffffff)
-                    : u2_set_u(0)));
-  }
-
-  // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
-  if (u2_gt(st_fma.mantissa, u2_set(0x7fffffff, 0xffffffff))) {
-    st_fma.mantissa = u2_inv(st_fma.mantissa);
-    st_fma.sign = st_mul.sign ^ 0x80000000;
-  }
-
-  // detect overflow/underflow
-  int overflow_bits = 3 - u2_clz(st_fma.mantissa);
-
-  // adjust exponent
-  st_fma.exponent += overflow_bits;
-
-  // handle underflow
-  if (overflow_bits < 0) {
-    st_fma.mantissa = u2_sll(st_fma.mantissa, -overflow_bits);
-    overflow_bits = 0;
-  }
-
-  // rounding
-  uint2 trunc_mask = u2_add(u2_sll(u2_set_u(1), C_ADJUST + overflow_bits),
-                            u2_set(0xffffffff, 0xffffffff));
-  uint2 trunc_bits =
-      u2_or(u2_and(st_fma.mantissa, trunc_mask), !u2_zero(cutoff_bits));
-  uint2 last_bit =
-      u2_and(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
-  uint2 grs_bits = u2_sll(u2_set_u(4), C_ADJUST - 3 + overflow_bits);
-
-  // round to nearest even
-  if (u2_gt(trunc_bits, grs_bits) ||
-      (u2_eq(trunc_bits, grs_bits) && !u2_zero(last_bit))) {
-    st_fma.mantissa =
-        u2_add(st_fma.mantissa, u2_sll(u2_set_u(1), C_ADJUST + overflow_bits));
-  }
-
-  // Shift mantissa back to bit 23
-  st_fma.mantissa = u2_srl(st_fma.mantissa, C_ADJUST + overflow_bits);
-
-  // Detect rounding overflow
-  if (u2_gt(st_fma.mantissa, u2_set_u(0xffffff))) {
-    ++st_fma.exponent;
-    st_fma.mantissa = u2_srl(st_fma.mantissa, 1);
-  }
-
-  if (u2_zero(st_fma.mantissa)) {
-    return 0.0f;
-  }
-
-  // Flating point range limit
-  if (st_fma.exponent > 127) {
-    return __clc_as_float(__clc_as_uint(INFINITY) | st_fma.sign);
-  }
-
-  // Flush denormals
-  if (st_fma.exponent <= -127) {
-    return __clc_as_float(st_fma.sign);
-  }
-
-  return __clc_as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
-                        ((uint)st_fma.mantissa.lo & 0x7fffff));
-}
-
-#define __CLC_FLOAT_ONLY
-#define __CLC_FUNCTION __clc_sw_fma
-#define __CLC_BODY <clc/shared/ternary_def_scalarize.inc>
-#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index 4a6dadc702033..ef35c43ce443e 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -138,7 +138,6 @@ math/clc_sinh.cl
 math/clc_sinpi.cl
 math/clc_sqrt.cl
 math/clc_subnormal_config.cl
-math/clc_sw_fma.cl
 math/clc_tables.cl
 math/clc_tan.cl
 math/clc_tanh.cl
diff --git a/libclc/clc/lib/generic/math/clc_fma.inc b/libclc/clc/lib/generic/math/clc_fma.inc
index b23b6433d2922..a55e9c0f9b2b7 100644
--- a/libclc/clc/lib/generic/math/clc_fma.inc
+++ b/libclc/clc/lib/generic/math/clc_fma.inc
@@ -8,9 +8,5 @@
 
 _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_fma(__CLC_GENTYPE a, __CLC_GENTYPE b,
                                                __CLC_GENTYPE c) {
-#if __CLC_FPSIZE == 32
-  if (!__CLC_HAVE_HW_FMA32())
-    return __clc_sw_fma(a, b, c);
-#endif
   return __builtin_elementwise_fma(a, b, c);
 }
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
index bddc0998cf950..e902bf3830626 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
@@ -97,19 +97,9 @@ _CLC_DEF _CLC_OVERLOAD void __clc_fullMulS(private __CLC_FLOATN *hi,
                                            private __CLC_FLOATN *lo,
                                            __CLC_FLOATN a, __CLC_FLOATN b,
                                            __CLC_FLOATN bh, __CLC_FLOATN bt) {
-  if (__CLC_HAVE_HW_FMA32()) {
-    __CLC_FLOATN ph = a * b;
-    *hi = ph;
-    *lo = __clc_fma(a, b, -ph);
-  } else {
-    __CLC_FLOATN ah = __CLC_AS_FLOATN(__CLC_AS_UINTN(a) & 0xfffff000U);
-    __CLC_FLOATN at = a - ah;
-    __CLC_FLOATN ph = a * b;
-    __CLC_FLOATN pt = __clc_mad(
-        at, bt, __clc_mad(at, bh, __clc_mad(ah, bt, __clc_mad(ah, bh, -ph))));
-    *hi = ph;
-    *lo = pt;
-  }
+  __CLC_FLOATN ph = a * b;
+  *hi = ph;
+  *lo = __clc_fma(a, b, -ph);
 }
 
 _CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_removePi2S(private __CLC_FLOATN *hi,
@@ -280,20 +270,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
   const __CLC_FLOATN pio2t = (__CLC_FLOATN)0xa22168 / 0x1.0p+47f;
 
   __CLC_FLOATN rh, rt;
-
-  if (__CLC_HAVE_HW_FMA32()) {
-    rh = q1 * pio2h;
-    rt = __clc_fma(q0, pio2h, __clc_fma(q1, pio2t, __clc_fma(q1, pio2h, -rh)));
-  } else {
-    __CLC_FLOATN q1h = __CLC_AS_FLOATN(__CLC_AS_UINTN(q1) & 0xfffff000);
-    __CLC_FLOATN q1t = q1 - q1h;
-    rh = q1 * pio2h;
-    rt = __clc_mad(
-        q1t, pio2ht,
-        __clc_mad(q1t, pio2hh,
-                  __clc_mad(q1h, pio2ht, __clc_mad(q1h, pio2hh, -rh))));
-    rt = __clc_mad(q0, pio2h, __clc_mad(q1, pio2t, rt));
-  }
+  rh = q1 * pio2h;
+  rt = __clc_fma(q0, pio2h, __clc_fma(q1, pio2t, __clc_fma(q1, pio2h, -rh)));
 
   __CLC_FLOATN t = rh + rt;
   rt = rt - (t - rh);
diff --git a/libclc/clc/lib/generic/math/clc_sw_fma.cl b/libclc/clc/lib/generic/math/clc_sw_fma.cl
deleted file mode 100644
index e8bf673b50f36..0000000000000
--- a/libclc/clc/lib/generic/math/clc_sw_fma.cl
+++ /dev/null
@@ -1,165 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <clc/clc_as_type.h>
-#include <clc/float/definitions.h>
-#include <clc/integer/clc_abs.h>
-#include <clc/integer/clc_clz.h>
-#include <clc/integer/definitions.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/math.h>
-#include <clc/relational/clc_isinf.h>
-#include <clc/relational/clc_isnan.h>
-#include <clc/shared/clc_max.h>
-
-struct fp {
-  ulong mantissa;
-  int exponent;
-  uint sign;
-};
-
-_CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
-  /* special cases */
-  if (__clc_isnan(a) || __clc_isnan(b) || __clc_isnan(c) || __clc_isinf(a) ||
-      __clc_isinf(b)) {
-    return __clc_mad(a, b, c);
-  }
-
-  /* If only c is inf, and both a,b are regular numbers, the result is c*/
-  if (__clc_isinf(c)) {
-    return c;
-  }
-
-  a = __clc_soft_flush_denormal(a);
-  b = __clc_soft_flush_denormal(b);
-  c = __clc_soft_flush_denormal(c);
-
-  if (c == 0) {
-    return a * b;
-  }
-
-  struct fp st_a, st_b, st_c;
-
-  st_a.exponent = a == .0f ? 0 : ((__clc_as_uint(a) & 0x7f800000) >> 23) - 127;
-  st_b.exponent = b == .0f ? 0 : ((__clc_as_uint(b) & 0x7f800000) >> 23) - 127;
-  st_c.exponent = c == .0f ? 0 : ((__clc_as_uint(c) & 0x7f800000) >> 23) - 127;
-
-  st_a.mantissa = a == .0f ? 0 : (__clc_as_uint(a) & 0x7fffff) | 0x800000;
-  st_b.mantissa = b == .0f ? 0 : (__clc_as_uint(b) & 0x7fffff) | 0x800000;
-  st_c.mantissa = c == .0f ? 0 : (__clc_as_uint(c) & 0x7fffff) | 0x800000;
-
-  st_a.sign = __clc_as_uint(a) & 0x80000000;
-  st_b.sign = __clc_as_uint(b) & 0x80000000;
-  st_c.sign = __clc_as_uint(c) & 0x80000000;
-
-  // Multiplication.
-  // Move the product to the highest bits to maximize precision
-  // mantissa is 24 bits => product is 48 bits, 2bits non-fraction.
-  // Add one bit for future addition overflow,
-  // add another bit to detect subtraction underflow
-  struct fp st_mul;
-  st_mul.sign = st_a.sign ^ st_b.sign;
-  st_mul.mantissa = (st_a.mantissa * st_b.mantissa) << 14ul;
-  st_mul.exponent = st_mul.mantissa ? st_a.exponent + st_b.exponent : 0;
-
-  // FIXME: Detecting a == 0 || b == 0 above crashed GCN isel
-  if (st_mul.exponent == 0 && st_mul.mantissa == 0)
-    return c;
-
-// Mantissa is 23 fractional bits, shift it the same way as product mantissa
-#define C_ADJUST 37ul
-
-  // both exponents are bias adjusted
-  int exp_diff = st_mul.exponent - st_c.exponent;
-
-  st_c.mantissa <<= C_ADJUST;
-  ulong cutoff_bits = 0;
-  ulong cutoff_mask = (1ul << __clc_abs(exp_diff)) - 1ul;
-  if (exp_diff > 0) {
-    cutoff_bits =
-        exp_diff >= 64 ? st_c.mantissa : (st_c.mantissa & cutoff_mask);
-    st_c.mantissa = exp_diff >= 64 ? 0 : (st_c.mantissa >> exp_diff);
-  } else {
-    cutoff_bits =
-        -exp_diff >= 64 ? st_mul.mantissa : (st_mul.mantissa & cutoff_mask);
-    st_mul.mantissa = -exp_diff >= 64 ? 0 : (st_mul.mantissa >> -exp_diff);
-  }
-
-  struct fp st_fma;
-  st_fma.sign = st_mul.sign;
-  st_fma.exponent = __clc_max(st_mul.exponent, st_c.exponent);
-  if (st_c.sign == st_mul.sign) {
-    st_fma.mantissa = st_mul.mantissa + st_c.mantissa;
-  } else {
-    // cutoff bits borrow one
-    st_fma.mantissa =
-        st_mul.mantissa - st_c.mantissa -
-        (cutoff_bits && (st_mul.exponent > st_c.exponent) ? 1 : 0);
-  }
-
-  // underflow: st_c.sign != st_mul.sign, and magnitude switches the sign
-  if (st_fma.mantissa > LONG_MAX) {
-    st_fma.mantissa = 0 - st_fma.mantissa;
-    st_fma.sign = st_mul.sign ^ 0x80000000;
-  }
-
-  // detect overflow/underflow
-  int overflow_bits = 3 - __clc_clz(st_fma.mantissa);
-
-  // adjust exponent
-  st_fma.exponent += overflow_bits;
-
-  // handle underflow
-  if (overflow_bits < 0) {
-    st_fma.mantissa <<= -overflow_bits;
-    overflow_bits = 0;
-  }
-
-  // rounding
-  ulong trunc_mask = (1ul << (C_ADJUST + overflow_bits)) - 1;
-  ulong trunc_bits = (st_fma.mantissa & trunc_mask) | (cutoff_bits != 0);
-  ulong last_bit = st_fma.mantissa & (1ul << (C_ADJUST + overflow_bits));
-  ulong grs_bits = (0x4ul << (C_ADJUST - 3 + overflow_bits));
-
-  // round to nearest even
-  if ((trunc_bits > grs_bits) || (trunc_bits == grs_bits && last_bit != 0)) {
-    st_fma.mantissa += (1ul << (C_ADJUST + overflow_bits));
-  }
-
-  // Shift mantissa back to bit 23
-  st_fma.mantissa = (st_fma.mantissa >> (C_ADJUST + overflow_bits));
-
-  // Detect rounding overflow
-  if (st_fma.mantissa > 0xffffff) {
-    ++st_fma.exponent;
-    st_fma.mantissa >>= 1;
-  }
-
-  if (st_fma.mantissa == 0) {
-    return .0f;
-  }
-
-  // Flating point range limit
-  if (st_fma.exponent > 127) {
-    return __clc_as_float(__clc_as_uint(INFINITY) | st_fma.sign);
-  }
-
-  // Flush denormals
-  if (st_fma.exponent <= -127) {
-    return __clc_as_float(st_fma.sign);
-  }
-
-  return __clc_as_float(st_fma.sign | ((st_fma.exponent + 127) << 23) |
-                        ((uint)st_fma.mantissa & 0x7fffff));
-}
-
-#define __CLC_FLOAT_ONLY
-#define __CLC_FUNCTION __clc_sw_fma
-#define __CLC_BODY <clc/shared/ternary_def_scalarize.inc>
-#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES
index 07bc7aaead8e8..ed63fe6b7c529 100644
--- a/libclc/clc/lib/spirv/SOURCES
+++ b/libclc/clc/lib/spirv/SOURCES
@@ -1,3 +1,2 @@
 math/clc_fmax.cl
 math/clc_fmin.cl
-math/clc_runtime_has_hw_fma32.cl
diff --git a/libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl b/libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl
deleted file mode 100644
index 2f6ad2c5175dd..0000000000000
--- a/libclc/clc/lib/spirv/math/clc_runtime_has_hw_fma32.cl
+++ /dev/null
@@ -1,9 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-bool __clc_runtime_has_hw_fma32() { return false; }
diff --git a/libclc/opencl/lib/clspv/math/fma.cl b/libclc/opencl/lib/clspv/math/fma.cl
index 0e328903ba263..1ea6f034b0d1f 100644
--- a/libclc/opencl/lib/clspv/math/fma.cl
+++ b/libclc/opencl/lib/clspv/math/fma.cl
@@ -11,7 +11,7 @@
 
 #define __CLC_FLOAT_ONLY
 #define __CLC_FUNCTION fma
-#define __CLC_IMPL_FUNCTION(x) __clc_sw_fma
+#define __CLC_IMPL_FUNCTION(x) __clc_fma
 #define __CLC_BODY <clc/shared/ternary_def.inc>
 
 #include <clc/math/gentype.inc>
diff --git a/libclc/opencl/lib/spirv/math/fma.cl b/libclc/opencl/lib/spirv/math/fma.cl
index 0e328903ba263..1ea6f034b0d1f 100644
--- a/libclc/opencl/lib/spirv/math/fma.cl
+++ b/libclc/opencl/lib/spirv/math/fma.cl
@@ -11,7 +11,7 @@
 
 #define __CLC_FLOAT_ONLY
 #define __CLC_FUNCTION fma
-#define __CLC_IMPL_FUNCTION(x) __clc_sw_fma
+#define __CLC_IMPL_FUNCTION(x) __clc_fma
 #define __CLC_BODY <clc/shared/ternary_def.inc>
 
 #include <clc/math/gentype.inc>

>From 3da9705621c6e3bbce01baba3b2a7e1c3aac6a0c Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Fri, 3 Oct 2025 07:50:42 +0200
Subject: [PATCH 7/8] -fdenormal-fp-math-f32 -> -fdenormal-fp-math

---
 libclc/CMakeLists.txt                  | 2 +-
 libclc/clc/lib/generic/math/clc_fma.cl | 2 +-
 libclc/opencl/lib/clspv/math/fma.cl    | 2 +-
 libclc/opencl/lib/spirv/math/fma.cl    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 2447d97d2b624..bc6dc416193fb 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -412,7 +412,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
       # Error on undefined macros
       -Werror=undef
       -fdiscard-value-names
-      -Xclang -fdenormal-fp-math-f32=dynamic
+      -Xclang -fdenormal-fp-math=dynamic
     )
 
     if( NOT "${cpu}" STREQUAL "" )
diff --git a/libclc/clc/lib/generic/math/clc_fma.cl b/libclc/clc/lib/generic/math/clc_fma.cl
index e69ef614e780f..27ea962af398d 100644
--- a/libclc/clc/lib/generic/math/clc_fma.cl
+++ b/libclc/clc/lib/generic/math/clc_fma.cl
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include <clc/internal/clc.h>
-#include <clc/internal/math/clc_sw_fma.h>
+#include <clc/math/clc_fma.h>
 #include <clc/math/math.h>
 
 #define __CLC_BODY <clc_fma.inc>
diff --git a/libclc/opencl/lib/clspv/math/fma.cl b/libclc/opencl/lib/clspv/math/fma.cl
index 1ea6f034b0d1f..5b5b13d81cf68 100644
--- a/libclc/opencl/lib/clspv/math/fma.cl
+++ b/libclc/opencl/lib/clspv/math/fma.cl
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/internal/math/clc_sw_fma.h>
+#include <clc/math/clc_fma.h>
 #include <clc/opencl/math/fma.h>
 
 #define __CLC_FLOAT_ONLY
diff --git a/libclc/opencl/lib/spirv/math/fma.cl b/libclc/opencl/lib/spirv/math/fma.cl
index 1ea6f034b0d1f..5b5b13d81cf68 100644
--- a/libclc/opencl/lib/spirv/math/fma.cl
+++ b/libclc/opencl/lib/spirv/math/fma.cl
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/internal/math/clc_sw_fma.h>
+#include <clc/math/clc_fma.h>
 #include <clc/opencl/math/fma.h>
 
 #define __CLC_FLOAT_ONLY

>From 7d21a1a82f05735c1bcfdf23ef8ac0171be85bfc Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Fri, 3 Oct 2025 09:03:43 +0200
Subject: [PATCH 8/8] remove -Xclang before -fdenormal-fp-math=dynamic

---
 libclc/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index bc6dc416193fb..97896715e2712 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -412,7 +412,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
       # Error on undefined macros
       -Werror=undef
       -fdiscard-value-names
-      -Xclang -fdenormal-fp-math=dynamic
+      -fdenormal-fp-math=dynamic
     )
 
     if( NOT "${cpu}" STREQUAL "" )