[clang] [llvm] [X86] Honor MXCSR for AVX-512 CUR_DIRECTION FP intrinsics in strictfp funcs (PR #203521)

Fri Jun 12 06:02:07 PDT 2026

llvmorg-github-actions[bot] wrote:




@llvm/pr-subscribers-backend-x86

Author: Rohit Aggarwal (rohitaggarwal007)

<details>
<summary>Changes</summary>

This PR is based on an issue reported by @jlebar in the SemiAnalysisAI/FuzzX repository.

Bug: [059-avx512-cur-direction-mxcsr](https://github.com/SemiAnalysisAI/FuzzX/blob/master/x86/bugs/059-avx512-cur-direction-mxcsr/NOTES.md)

AVX-512 arithmetic intrinsics with an explicit rounding operand accept
_MM_FROUND_CUR_DIRECTION (4), meaning "use the live MXCSR rounding mode".

Both InstCombine and X86 lowering currently rewrite these into plain
fadd/fsub/fmul/fdiv, which carry default round-to-nearest-even (RNE)
semantics. That lets the operation be constant-folded under RNE and silently
drops any non-default rounding mode the program installed (fesetround /
ldmxcsr), producing wrong results.

Use the strictfp attribute as the signal that the FP environment is live:
  - InstCombine: skip the CUR_DIRECTION -> plain-FP fold in strictfp
    functions, preserving the intrinsic.
  - X86ISelLowering: in strictfp functions lower CUR_DIRECTION to the strict
    node (STRICT_FADD/FSUB/FMUL/FDIV), which is not constant-folded and
    lowers to a real MXCSR-reading instruction.

Non-strictfp code is unchanged (RNE remains a valid assumption there). The
change covers the packed add/sub/mul/div {ps,pd}.512 and masked scalar {ss,sd} round forms.

Adds InstCombine, CodeGen, and clang CodeGen tests.

---

Patch is 23.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/203521.diff


5 Files Affected:

- (added) clang/test/CodeGen/X86/avx512f-cur-direction-rounding.c (+134) 
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+24) 
- (modified) llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp (+16-2) 
- (added) llvm/test/CodeGen/X86/avx512-cur-direction-rounding.ll (+61) 
- (added) llvm/test/Transforms/InstCombine/X86/x86-avx512-cur-direction-rounding.ll (+168) 


``````````diff

diff --git a/clang/test/CodeGen/X86/avx512f-cur-direction-rounding.c b/clang/test/CodeGen/X86/avx512f-cur-direction-rounding.c
new file mode 100644
index 0000000000000..fbbb0efdbebf2
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx512f-cur-direction-rounding.c
@@ -0,0 +1,134 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -emit-llvm -o - -Wall -Werror | FileCheck --check-prefixes=COMMON,UNCONSTRAINED %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -ffp-exception-behavior=strict -emit-llvm -o - -Wall -Werror | FileCheck --check-prefixes=COMMON,STRICT %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -S -o - -Wall -Werror | FileCheck --check-prefix=CHECK-ASM %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -ffp-exception-behavior=strict -S -o - -Wall -Werror | FileCheck --check-prefix=CHECK-ASM %s
+//
+// At -O2 the default (non-strictfp) intrinsic is folded to a plain fadd, while
+// the strictfp form is preserved (see test_mm512_add_round_ps_fold below).
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -O2 -emit-llvm -o - -Wall -Werror | FileCheck --check-prefix=FOLD %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -ffp-exception-behavior=strict -O2 -emit-llvm -o - -Wall -Werror | FileCheck --check-prefix=KEEP %s
+
+// The packed add/sub/mul/div "_round" builtins with _MM_FROUND_CUR_DIRECTION
+// lower to the unmasked x86 intrinsic with rounding operand 4.
+//
+// Without -ffp-exception-behavior=strict the call is a plain (non-strictfp)
+// intrinsic call: under the default FP environment the optimizer is free to
+// fold it to round-to-nearest IR.
+//
+// With -ffp-exception-behavior=strict the enclosing function and the call are
+// marked "strictfp". That attribute is what makes the rest of the pipeline
+// (InstCombine and the X86 SelectionDAG lowering) preserve the operation so it
+// honors the live MXCSR rounding mode instead of constant-folding it.
+
+#include <immintrin.h>
+
+__m512 test_mm512_add_round_ps(__m512 a, __m512 b) {
+  // COMMON-LABEL: test_mm512_add_round_ps
+  // UNCONSTRAINED: call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
+  // STRICT: call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) #[[ATTR:[0-9]+]]
+  // CHECK-ASM: vaddps
+  return _mm512_add_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_sub_round_ps(__m512 a, __m512 b) {
+  // COMMON-LABEL: test_mm512_sub_round_ps
+  // UNCONSTRAINED: call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
+  // STRICT: call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) #[[ATTR]]
+  // CHECK-ASM: vsubps
+  return _mm512_sub_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mul_round_ps(__m512 a, __m512 b) {
+  // COMMON-LABEL: test_mm512_mul_round_ps
+  // UNCONSTRAINED: call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
+  // STRICT: call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) #[[ATTR]]
+  // CHECK-ASM: vmulps
+  return _mm512_mul_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_div_round_ps(__m512 a, __m512 b) {
+  // COMMON-LABEL: test_mm512_div_round_ps
+  // UNCONSTRAINED: call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
+  // STRICT: call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) #[[ATTR]]
+  // CHECK-ASM: vdivps
+  return _mm512_div_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+// Optimized (-O2) view of the same operation, equivalent to the InstCombine
+// test add_ps_512_cur_direction: without strictfp the rounding-mode operand is
+// dropped and the call becomes a plain fadd that no longer carries any MXCSR
+// dependence; with strictfp the intrinsic (and its MXCSR dependence) survives.
+__m512 test_mm512_add_round_ps_fold(__m512 a, __m512 b) {
+  // FOLD-LABEL: @test_mm512_add_round_ps_fold(
+  // FOLD: fadd <16 x float> %{{.*}}, %{{.*}}
+  // KEEP-LABEL: @test_mm512_add_round_ps_fold(
+  // KEEP: call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
+  return _mm512_add_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_sub_round_ps_fold(__m512 a, __m512 b) {
+  // FOLD-LABEL: @test_mm512_sub_round_ps_fold(
+  // FOLD: fsub <16 x float> %{{.*}}, %{{.*}}
+  // KEEP-LABEL: @test_mm512_sub_round_ps_fold(
+  // KEEP: call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
+  return _mm512_sub_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mul_round_ps_fold(__m512 a, __m512 b) {
+  // FOLD-LABEL: @test_mm512_mul_round_ps_fold(
+  // FOLD: fmul <16 x float> %{{.*}}, %{{.*}}
+  // KEEP-LABEL: @test_mm512_mul_round_ps_fold(
+  // KEEP: call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
+  return _mm512_mul_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_div_round_ps_fold(__m512 a, __m512 b) {
+  // FOLD-LABEL: @test_mm512_div_round_ps_fold(
+  // FOLD: fdiv <16 x float> %{{.*}}, %{{.*}}
+  // KEEP-LABEL: @test_mm512_div_round_ps_fold(
+  // KEEP: call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
+  return _mm512_div_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_add_round_pd_fold(__m512d a, __m512d b) {
+  // FOLD-LABEL: @test_mm512_add_round_pd_fold(
+  // FOLD: fadd <8 x double> %{{.*}}, %{{.*}}
+  // KEEP-LABEL: @test_mm512_add_round_pd_fold(
+  // KEEP: call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4)
+  return _mm512_add_round_pd(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_sub_round_pd_fold(__m512d a, __m512d b) {
+  // FOLD-LABEL: @test_mm512_sub_round_pd_fold(
+  // FOLD: fsub <8 x double> %{{.*}}, %{{.*}}
+  // KEEP-LABEL: @test_mm512_sub_round_pd_fold(
+  // KEEP: call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4)
+  return _mm512_sub_round_pd(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_mul_round_pd_fold(__m512d a, __m512d b) {
+  // FOLD-LABEL: @test_mm512_mul_round_pd_fold(
+  // FOLD: fmul <8 x double> %{{.*}}, %{{.*}}
+  // KEEP-LABEL: @test_mm512_mul_round_pd_fold(
+  // KEEP: call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4)
+  return _mm512_mul_round_pd(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_div_round_pd_fold(__m512d a, __m512d b) {
+  // FOLD-LABEL: @test_mm512_div_round_pd_fold(
+  // FOLD: fdiv <8 x double> %{{.*}}, %{{.*}}
+  // KEEP-LABEL: @test_mm512_div_round_pd_fold(
+  // KEEP: call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4)
+  return _mm512_div_round_pd(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_add_round_pd(__m512d a, __m512d b) {
+  // COMMON-LABEL: test_mm512_add_round_pd
+  // UNCONSTRAINED: call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4)
+  // STRICT: call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4) #[[ATTR]]
+  // CHECK-ASM: vaddpd
+  return _mm512_add_round_pd(a, b, _MM_FROUND_CUR_DIRECTION);
+}
+
+// STRICT: attributes #[[ATTR]] = { strictfp }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8bb44e55d713f..1b902d3af8345 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27093,6 +27093,30 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                              DAG.getTargetConstant(RC, dl, MVT::i32));
         if (!isRoundModeCurDirection(Rnd))
           return SDValue();
+
+        // CUR_DIRECTION means "use the current MXCSR rounding mode". In a
+        // function that accesses the FP environment (strictfp), lowering to a
+        // plain FADD/FSUB/FMUL/FDIV would let the DAG constant-fold the
+        // operation under round-to-nearest-even, discarding the live MXCSR
+        // rounding mode. Emit the corresponding strict node instead: it is not
+        // constant-folded and lowers to a real instruction that reads MXCSR.
+        if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+                Attribute::StrictFP)) {
+          unsigned StrictOpc = 0;
+          switch (IntrData->Opc0) {
+          case ISD::FADD: StrictOpc = ISD::STRICT_FADD; break;
+          case ISD::FSUB: StrictOpc = ISD::STRICT_FSUB; break;
+          case ISD::FMUL: StrictOpc = ISD::STRICT_FMUL; break;
+          case ISD::FDIV: StrictOpc = ISD::STRICT_FDIV; break;
+          default: break;
+          }
+          if (StrictOpc) {
+            SDValue StrictNode = DAG.getNode(
+                StrictOpc, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
+                {DAG.getEntryNode(), Op.getOperand(1), Src2});
+            return StrictNode.getValue(0);
+          }
+        }
       }
 
       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 932b4a416a8d3..2e48406a3599c 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2451,7 +2451,14 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   case Intrinsic::x86_avx512_mul_pd_512:
   case Intrinsic::x86_avx512_sub_pd_512:
     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
-    // IR operations.
+    // IR operations. A plain fadd/fsub/fmul/fdiv is unconstrained FP and
+    // assumes the default rounding mode (round-to-nearest-even), whereas
+    // CUR_DIRECTION must honor whatever rounding the live MXCSR selects. Only
+    // fold when the function does not access the FP environment; inside a
+    // strictfp function MXCSR may have been changed (e.g. via fesetround), so
+    // the intrinsic must be preserved.
+    if (II.getFunction()->getAttributes().hasFnAttr(Attribute::StrictFP))
+      break;
     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
       if (R->getValue() == 4) {
         Value *Arg0 = II.getArgOperand(0);
@@ -2493,7 +2500,14 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   case Intrinsic::x86_avx512_mask_mul_sd_round:
   case Intrinsic::x86_avx512_mask_sub_sd_round:
     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
-    // IR operations.
+    // IR operations. A plain fadd/fsub/fmul/fdiv is unconstrained FP and
+    // assumes the default rounding mode (round-to-nearest-even), whereas
+    // CUR_DIRECTION must honor whatever rounding the live MXCSR selects. Only
+    // fold when the function does not access the FP environment; inside a
+    // strictfp function MXCSR may have been changed (e.g. via fesetround), so
+    // the intrinsic must be preserved.
+    if (II.getFunction()->getAttributes().hasFnAttr(Attribute::StrictFP))
+      break;
     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
       if (R->getValue() == 4) {
         // Extract the element as scalars.
diff --git a/llvm/test/CodeGen/X86/avx512-cur-direction-rounding.ll b/llvm/test/CodeGen/X86/avx512-cur-direction-rounding.ll
new file mode 100644
index 0000000000000..5c13f2196da52
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-cur-direction-rounding.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s
+
+; Backend behavior for the AVX-512 packed add/sub/mul/div "_round" intrinsics
+; with rounding-mode operand == 4 (_MM_FROUND_CUR_DIRECTION, "use current
+; MXCSR").
+;
+; - With symbolic operands the intrinsic lowers to a plain vaddps that reads the
+;   live MXCSR rounding mode at run time (correct), including under strictfp.
+; - With constant operands in a non-strictfp function the SelectionDAG
+;   constant-folds the result using round-to-nearest-even (0x3F800001 ==
+;   1.00000012f). This is allowed under the default-FP-environment contract.
+; - With constant operands in a strictfp function the lowering emits a strict
+;   node, which is NOT constant-folded, so a real runtime vaddps is produced
+;   that honors the live MXCSR rounding mode (no round-to-nearest value is
+;   baked in).
+
+declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
+
+define <16 x float> @add_ps_512_symbolic(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: add_ps_512_symbolic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  ret <16 x float> %r
+}
+
+define <16 x float> @add_ps_512_symbolic_strictfp(<16 x float> %a, <16 x float> %b) strictfp {
+; CHECK-LABEL: add_ps_512_symbolic_strictfp:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4) strictfp
+  ret <16 x float> %r
+}
+
+define <16 x float> @add_ps_512_constant() {
+; CHECK-LABEL: add_ps_512_constant:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0]
+; CHECK-NEXT:    retq
+  %r = call <16 x float> @llvm.x86.avx512.add.ps.512(
+        <16 x float> splat (float 1.0),
+        <16 x float> splat (float 0x3E78000000000000),
+        i32 4)
+  ret <16 x float> %r
+}
+
+define <16 x float> @add_ps_512_constant_strictfp() strictfp {
+; CHECK-LABEL: add_ps_512_constant_strictfp:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %r = call <16 x float> @llvm.x86.avx512.add.ps.512(
+        <16 x float> splat (float 1.0),
+        <16 x float> splat (float 0x3E78000000000000),
+        i32 4) strictfp
+  ret <16 x float> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512-cur-direction-rounding.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512-cur-direction-rounding.ll
new file mode 100644
index 0000000000000..26d8ec1e6c57a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512-cur-direction-rounding.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; The AVX512 packed/scalar "_round" arithmetic intrinsics take a rounding-mode
+; immediate.  A value of 4 (_MM_FROUND_CUR_DIRECTION) means "use whatever the
+; MXCSR register currently selects".  Outside strictfp functions, InstCombine
+; rewrites these to plain unconstrained FP operations, which are semantically
+; round-to-nearest-even and are then free to be constant-folded.
+;
+; These tests document that behavior.  The 'constfold' tests are the dangerous
+; case: with constant operands chosen so that round-to-nearest and
+; round-toward-zero disagree, the result is baked in as the round-to-nearest
+; value (0x3F800001 == 1.00000012f), even though a caller that set MXCSR to
+; round-toward-zero before the call expects 1.0 (0x3F800000).
+
+declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
+declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
+declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
+declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double>, <8 x double>, i32)
+declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+; In a strictfp function the FP environment may have been changed (e.g. via
+; fesetround), so CUR_DIRECTION must NOT be folded to an unconstrained fadd:
+; the intrinsic has to be preserved so the live MXCSR rounding mode is honored.
+define <16 x float> @add_ps_512_cur_direction_strictfp(<16 x float> %a, <16 x float> %b) strictfp {
+; CHECK-LABEL: @add_ps_512_cur_direction_strictfp(
+; CHECK-NEXT:    [[R:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 4) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    ret <16 x float> [[R]]
+;
+  %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4) strictfp
+  ret <16 x float> %r
+}
+
+; Same constants as the non-strictfp constfold test in this file, but strictfp:
+; must stay an intrinsic instead of constant-folding to the round-to-nearest value.
+define <16 x float> @add_ps_512_cur_direction_constfold_strictfp() strictfp {
+; CHECK-LABEL: @add_ps_512_cur_direction_constfold_strictfp(
+; CHECK-NEXT:    [[R:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> splat (float 1.000000e+00), <16 x float> splat (float f0x33C00000), i32 4) #[[ATTR1]]
+; CHECK-NEXT:    ret <16 x float> [[R]]
+;
+  %r = call <16 x float> @llvm.x86.avx512.add.ps.512(
+  <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0,
+  float 1.0, float 1.0, float 1.0, float 1.0,
+  float 1.0, float 1.0, float 1.0, float 1.0,
+  float 1.0, float 1.0, float 1.0, float 1.0>,
+  <16 x float> <float 0x3E78000000000000, float 0x3E78000000000000,
+  float 0x3E78000000000000, float 0x3E78000000000000,
+  float 0x3E78000000000000, float 0x3E78000000000000,
+  float 0x3E78000000000000, float 0x3E78000000000000,
+  float 0x3E78000000000000, float 0x3E78000000000000,
+  float 0x3E78000000000000, float 0x3E78000000000000,
+  float 0x3E78000000000000, float 0x3E78000000000000,
+  float 0x3E78000000000000, float 0x3E78000000000000>,
+  i32 4) strictfp
+  ret <16 x float> %r
+}
+
+; Masked scalar variant must likewise be preserved under strictfp.
+define <4 x float> @mask_add_ss_round_cur_direction_strictfp(<4 x float> %a, <4 x float> %b, <4 x float> %c) strictfp {
+; CHECK-LABEL: @mask_add_ss_round_cur_direction_strictfp(
+; CHECK-NEXT:    [[R:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 -1, i32 4) #[[ATTR1]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %r = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 -1, i32 4) strictfp
+  ret <4 x float> %r
+}
+
+; The rounding-mode operand is dropped: CUR_DIRECTION folds to a plain fadd that
+; no longer carries any MXCSR dependence.
+define <16 x float> @add_ps_512_cur_direction(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @add_ps_512_cur_direction(
+; CHECK-NEXT:    [[R:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[R]]
+;
+  %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  ret <16 x float> %r
+}
+
+define <16 x float> @sub_ps_512_cur_direction(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @sub_ps_512_cur_direction(
+; CHECK-NEXT:    [[R:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[R]]
+;
+  %r = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  ret <16 x float> %r
+}
+
+define <16 x float> @mul_ps_512_cur_direction(<16 x float> %a, <16 x...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/203521