[clang] [llvm] Implement operand bundles for floating-point operations (PR #109798)
Serge Pavlov via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 24 06:26:07 PDT 2024
https://github.com/spavloff created https://github.com/llvm/llvm-project/pull/109798
Currently, floating-point operations in their general form (beyond the default mode) are always represented by calls to constrained intrinsics. In addition to having side effects, these calls carry extra information in the form of metadata arguments. This scheme is not efficient in the case of intrinsic function calls, as was noted in
https://discourse.llvm.org/t/thought-on-strictfp-support/71453, because it requires defining a separate intrinsic for the same operation when it is used in a non-default FP environment. The solution proposed in the discussion was "to move the complexity about the environment tracking from the intrinsics themselves to the call instruction".
The approach implemented in this change is to use operand bundles (https://llvm.org/docs/LangRef.html#operand-bundles). This approach was tried previously (https://reviews.llvm.org/D93455) but was not finished.
This change does not add any new functionality; it only adds a new way of keeping FP-related information in LLVM IR. Metadata arguments of constrained intrinsics are preserved, but they are no longer used by queries like `getRoundingMode` or `getExceptionBehavior`.
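To illustrate, here is a minimal sketch of the new representation, modeled on the tests updated in this patch: the constrained intrinsic keeps its metadata arguments, and the same information is duplicated in operand bundles, where "fpe.round"(i32 7) encodes RoundingMode::Dynamic and "fpe.except"(i32 2) encodes fpexcept.strict:

  %r = call float @llvm.experimental.constrained.fmul.f32(
           float %a, float %b,
           metadata !"round.dynamic", metadata !"fpexcept.strict")
           strictfp [ "fpe.round"(i32 7), "fpe.except"(i32 2) ]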
From 671fbc1e4c4f8ba53f78789d8708f5f79a13befd Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff at gmail.com>
Date: Thu, 22 Aug 2024 17:33:20 +0700
Subject: [PATCH] Implement operand bundles for floating-point operations
Currently, floating-point operations in their general form (beyond the
default mode) are always represented by calls to constrained intrinsics.
In addition to having side effects, these calls carry extra information
in the form of metadata arguments. This scheme is not efficient in the
case of intrinsic function calls, as was noted in
https://discourse.llvm.org/t/thought-on-strictfp-support/71453, because
it requires defining a separate intrinsic for the same operation when it
is used in a non-default FP environment. The solution proposed in the
discussion was "to move the complexity about the environment tracking
from the intrinsics themselves to the call instruction".
The approach implemented in this change is to use operand bundles
(https://llvm.org/docs/LangRef.html#operand-bundles). This approach was
tried previously (https://reviews.llvm.org/D93455) but was not finished.
This change does not add any new functionality; it only adds a new way
of keeping FP-related information in LLVM IR. Metadata arguments of
constrained intrinsics are preserved, but they are no longer used by
queries like `getRoundingMode` or `getExceptionBehavior`.
---
clang/test/CodeGen/X86/strictfp_builtins.c | 6 +-
clang/test/CodeGen/strictfp_builtins.c | 34 ++++----
.../cl20-device-side-enqueue-attributes.cl | 4 +-
llvm/docs/LangRef.rst | 23 ++++++
llvm/include/llvm/ADT/FloatingPointMode.h | 9 +++
llvm/include/llvm/IR/AutoUpgrade.h | 2 +
llvm/include/llvm/IR/FPEnv.h | 9 +++
llvm/include/llvm/IR/IRBuilder.h | 48 ++++++++++++
llvm/include/llvm/IR/InstrTypes.h | 7 ++
llvm/include/llvm/IR/IntrinsicInst.h | 2 -
llvm/include/llvm/IR/LLVMContext.h | 2 +
llvm/lib/AsmParser/LLParser.cpp | 46 +++++++++++
llvm/lib/IR/AutoUpgrade.cpp | 78 +++++++++++++++++++
llvm/lib/IR/IRBuilder.cpp | 56 ++++++++++---
llvm/lib/IR/Instructions.cpp | 18 +++++
llvm/lib/IR/IntrinsicInst.cpp | 23 ------
llvm/lib/IR/LLVMContext.cpp | 10 +++
llvm/lib/IR/Verifier.cpp | 70 ++++++++++++++++-
.../Scalar/TailRecursionElimination.cpp | 10 ++-
llvm/lib/Transforms/Utils/CloneFunction.cpp | 16 +++-
.../Bitcode/operand-bundles-bc-analyzer.ll | 2 +
.../AMDGPU/amdgpu-simplify-libcall-pown.ll | 10 +--
22 files changed, 413 insertions(+), 72 deletions(-)
diff --git a/clang/test/CodeGen/X86/strictfp_builtins.c b/clang/test/CodeGen/X86/strictfp_builtins.c
index 43e4060bef259b..75ed3a2555b3d7 100644
--- a/clang/test/CodeGen/X86/strictfp_builtins.c
+++ b/clang/test/CodeGen/X86/strictfp_builtins.c
@@ -27,7 +27,7 @@ void p(char *str, int x) {
// CHECK-NEXT: [[LD_ADDR:%.*]] = alloca x86_fp80, align 16
// CHECK-NEXT: store x86_fp80 [[LD:%.*]], ptr [[LD_ADDR]], align 16
// CHECK-NEXT: [[TMP0:%.*]] = load x86_fp80, ptr [[LD_ADDR]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[TMP0]], i32 516) #[[ATTR3]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[TMP0]], i32 516) #[[ATTR4:[0-9]+]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.1, i32 noundef [[TMP2]]) #[[ATTR3]]
// CHECK-NEXT: ret void
@@ -43,7 +43,7 @@ void test_long_double_isinf(long double ld) {
// CHECK-NEXT: [[LD_ADDR:%.*]] = alloca x86_fp80, align 16
// CHECK-NEXT: store x86_fp80 [[LD:%.*]], ptr [[LD_ADDR]], align 16
// CHECK-NEXT: [[TMP0:%.*]] = load x86_fp80, ptr [[LD_ADDR]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[TMP0]], i32 504) #[[ATTR3]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[TMP0]], i32 504) #[[ATTR4]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.2, i32 noundef [[TMP2]]) #[[ATTR3]]
// CHECK-NEXT: ret void
@@ -59,7 +59,7 @@ void test_long_double_isfinite(long double ld) {
// CHECK-NEXT: [[LD_ADDR:%.*]] = alloca x86_fp80, align 16
// CHECK-NEXT: store x86_fp80 [[LD:%.*]], ptr [[LD_ADDR]], align 16
// CHECK-NEXT: [[TMP0:%.*]] = load x86_fp80, ptr [[LD_ADDR]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[TMP0]], i32 3) #[[ATTR3]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[TMP0]], i32 3) #[[ATTR4]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.3, i32 noundef [[TMP2]]) #[[ATTR3]]
// CHECK-NEXT: ret void
diff --git a/clang/test/CodeGen/strictfp_builtins.c b/clang/test/CodeGen/strictfp_builtins.c
index 58815c7de4fa94..2e758115779711 100644
--- a/clang/test/CodeGen/strictfp_builtins.c
+++ b/clang/test/CodeGen/strictfp_builtins.c
@@ -31,21 +31,21 @@ void p(char *str, int x) {
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca double, align 8
// CHECK-NEXT: store double [[D:%.*]], ptr [[D_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[D_ADDR]], align 8
-// CHECK-NEXT: [[ISZERO:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR4]]
+// CHECK-NEXT: [[ISZERO:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR5:[0-9]+]] [ "fpe.except"(i32 2) ]
// CHECK-NEXT: br i1 [[ISZERO]], label [[FPCLASSIFY_END:%.*]], label [[FPCLASSIFY_NOT_ZERO:%.*]]
// CHECK: fpclassify_end:
// CHECK-NEXT: [[FPCLASSIFY_RESULT:%.*]] = phi i32 [ 4, [[ENTRY:%.*]] ], [ 0, [[FPCLASSIFY_NOT_ZERO]] ], [ 1, [[FPCLASSIFY_NOT_NAN:%.*]] ], [ [[TMP2:%.*]], [[FPCLASSIFY_NOT_INF:%.*]] ]
// CHECK-NEXT: call void @p(ptr noundef @.str.1, i32 noundef [[FPCLASSIFY_RESULT]]) #[[ATTR4]]
// CHECK-NEXT: ret void
// CHECK: fpclassify_not_zero:
-// CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double [[TMP0]], metadata !"uno", metadata !"fpexcept.strict") #[[ATTR4]]
+// CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double [[TMP0]], metadata !"uno", metadata !"fpexcept.strict") #[[ATTR5]] [ "fpe.except"(i32 2) ]
// CHECK-NEXT: br i1 [[CMP]], label [[FPCLASSIFY_END]], label [[FPCLASSIFY_NOT_NAN]]
// CHECK: fpclassify_not_nan:
-// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.fabs.f64(double [[TMP0]]) #[[ATTR5:[0-9]+]]
-// CHECK-NEXT: [[ISINF:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP1]], double 0x7FF0000000000000, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.fabs.f64(double [[TMP0]]) #[[ATTR6:[0-9]+]]
+// CHECK-NEXT: [[ISINF:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP1]], double 0x7FF0000000000000, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR5]] [ "fpe.except"(i32 2) ]
// CHECK-NEXT: br i1 [[ISINF]], label [[FPCLASSIFY_END]], label [[FPCLASSIFY_NOT_INF]]
// CHECK: fpclassify_not_inf:
-// CHECK-NEXT: [[ISNORMAL:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP1]], double 0x10000000000000, metadata !"uge", metadata !"fpexcept.strict") #[[ATTR4]]
+// CHECK-NEXT: [[ISNORMAL:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP1]], double 0x10000000000000, metadata !"uge", metadata !"fpexcept.strict") #[[ATTR5]] [ "fpe.except"(i32 2) ]
// CHECK-NEXT: [[TMP2]] = select i1 [[ISNORMAL]], i32 2, i32 3
// CHECK-NEXT: br label [[FPCLASSIFY_END]]
//
@@ -60,7 +60,7 @@ void test_fpclassify(double d) {
// CHECK-NEXT: [[H_ADDR:%.*]] = alloca half, align 2
// CHECK-NEXT: store half [[H:%.*]], ptr [[H_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[H_ADDR]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f16(half [[TMP0]], i32 516) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f16(half [[TMP0]], i32 516) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.2, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
@@ -76,7 +76,7 @@ void test_fp16_isinf(_Float16 h) {
// CHECK-NEXT: [[F_ADDR:%.*]] = alloca float, align 4
// CHECK-NEXT: store float [[F:%.*]], ptr [[F_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F_ADDR]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP0]], i32 516) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP0]], i32 516) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.3, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
@@ -92,7 +92,7 @@ void test_float_isinf(float f) {
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca double, align 8
// CHECK-NEXT: store double [[D:%.*]], ptr [[D_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[D_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP0]], i32 516) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP0]], i32 516) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.4, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
@@ -108,7 +108,7 @@ void test_double_isinf(double d) {
// CHECK-NEXT: [[H_ADDR:%.*]] = alloca half, align 2
// CHECK-NEXT: store half [[H:%.*]], ptr [[H_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[H_ADDR]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f16(half [[TMP0]], i32 504) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f16(half [[TMP0]], i32 504) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.5, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
@@ -124,7 +124,7 @@ void test_fp16_isfinite(_Float16 h) {
// CHECK-NEXT: [[F_ADDR:%.*]] = alloca float, align 4
// CHECK-NEXT: store float [[F:%.*]], ptr [[F_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F_ADDR]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP0]], i32 504) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP0]], i32 504) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.6, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
@@ -140,7 +140,7 @@ void test_float_isfinite(float f) {
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca double, align 8
// CHECK-NEXT: store double [[D:%.*]], ptr [[D_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[D_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP0]], i32 504) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP0]], i32 504) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.7, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
@@ -156,8 +156,8 @@ void test_double_isfinite(double d) {
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca double, align 8
// CHECK-NEXT: store double [[D:%.*]], ptr [[D_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[D_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.fabs.f64(double [[TMP0]]) #[[ATTR5]]
-// CHECK-NEXT: [[ISINF:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP1]], double 0x7FF0000000000000, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.fabs.f64(double [[TMP0]]) #[[ATTR6]]
+// CHECK-NEXT: [[ISINF:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP1]], double 0x7FF0000000000000, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR5]] [ "fpe.except"(i32 2) ]
// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP0]] to i64
// CHECK-NEXT: [[TMP3:%.*]] = icmp slt i64 [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 -1, i32 1
@@ -176,7 +176,7 @@ void test_isinf_sign(double d) {
// CHECK-NEXT: [[H_ADDR:%.*]] = alloca half, align 2
// CHECK-NEXT: store half [[H:%.*]], ptr [[H_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[H_ADDR]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f16(half [[TMP0]], i32 3) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f16(half [[TMP0]], i32 3) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.9, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
@@ -192,7 +192,7 @@ void test_fp16_isnan(_Float16 h) {
// CHECK-NEXT: [[F_ADDR:%.*]] = alloca float, align 4
// CHECK-NEXT: store float [[F:%.*]], ptr [[F_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F_ADDR]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP0]], i32 3) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP0]], i32 3) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.10, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
@@ -208,7 +208,7 @@ void test_float_isnan(float f) {
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca double, align 8
// CHECK-NEXT: store double [[D:%.*]], ptr [[D_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[D_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP0]], i32 3) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP0]], i32 3) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.11, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
@@ -224,7 +224,7 @@ void test_double_isnan(double d) {
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca double, align 8
// CHECK-NEXT: store double [[D:%.*]], ptr [[D_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[D_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP0]], i32 264) #[[ATTR4]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[TMP0]], i32 264) #[[ATTR5]]
// CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
// CHECK-NEXT: call void @p(ptr noundef @.str.12, i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: ret void
diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
index 1544770dfa4f30..42b53f8725e03a 100644
--- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
+++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
@@ -144,7 +144,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) {
// STRICTFP-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[BLOCK_CAPTURE_ADDR1]], align 4
// STRICTFP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[TMP0]], i32 [[TMP1]]
// STRICTFP-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4
-// STRICTFP-NEXT: [[TMP3:%.*]] = call float @llvm.experimental.constrained.fmuladd.f32(float 4.000000e+00, float [[TMP2]], float 1.000000e+00, metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR5]]
+// STRICTFP-NEXT: [[TMP3:%.*]] = call float @llvm.experimental.constrained.fmuladd.f32(float 4.000000e+00, float [[TMP2]], float 1.000000e+00, metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR5]] [ "fpe.round"(i32 1), "fpe.except"(i32 2) ]
// STRICTFP-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
// STRICTFP-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[BLOCK_CAPTURE_ADDR2]], align 4
// STRICTFP-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
@@ -173,7 +173,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) {
// STRICTFP: attributes #[[ATTR2]] = { convergent noinline nounwind optnone strictfp "stack-protector-buffer-size"="8" }
// STRICTFP: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind strictfp willreturn memory(inaccessiblemem: readwrite) }
// STRICTFP: attributes #[[ATTR4]] = { convergent nounwind "stack-protector-buffer-size"="8" }
-// STRICTFP: attributes #[[ATTR5]] = { strictfp }
+// STRICTFP: attributes #[[ATTR5]] = { strictfp memory(inaccessiblemem: readwrite) }
//.
// SPIR32: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// SPIR32: [[META1:![0-9]+]] = !{i32 2, i32 0}
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 91c3e60bb0acb1..b1a546a2d4f808 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -2994,6 +2994,29 @@ A "convergencectrl" operand bundle is only valid on a ``convergent`` operation.
When present, the operand bundle must contain exactly one value of token type.
See the :doc:`ConvergentOperations` document for details.
+.. _ob_fpe:
+
+Floating-point Environment Operand Bundles
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These operand bundles provide details on how the operation interacts with the
+:ref:`floating-point environment <floatenv>`. There are two kinds of such
+operand bundles; they characterize the interaction with floating-point
+control modes and with status bits, respectively.
+
+An operand bundle tagged with "fpe.round" may be attached to operations that
+depend on the rounding mode. It has an integer value, which represents the
+rounding mode using the same encoding as ``llvm::RoundingMode``. If it is
+present and is not equal to ``llvm::RoundingMode::Dynamic``, it specifies the
+rounding mode that will be used for evaluating the operation. The value
+``llvm::RoundingMode::Dynamic`` indicates that the rounding mode used by the
+operation is specified in a floating-point control register.
+
+An operand bundle tagged with "fpe.except" may be attached to operations that
+may read or write floating-point exception flags. It has the same meaning and
+encoding as the corresponding argument in
+:ref:`constrained intrinsics <constrainedfp>`.
+
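+For example, an addition performed in the to-nearest rounding mode with
+strict exception semantics may be represented as:
+
+.. code-block:: llvm
+
+      %r = call float @llvm.experimental.constrained.fadd.f32(
+                     float %a, float %b,
+                     metadata !"round.tonearest", metadata !"fpexcept.strict")
+                     [ "fpe.round"(i32 1), "fpe.except"(i32 2) ]
+
+Here ``i32 1`` is the encoding of ``llvm::RoundingMode::NearestTiesToEven``
+and ``i32 2`` is the encoding of ``fp::ebStrict``.
+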
.. _moduleasm:
Module-Level Inline Assembly
diff --git a/llvm/include/llvm/ADT/FloatingPointMode.h b/llvm/include/llvm/ADT/FloatingPointMode.h
index 639d931ef88fec..970cc89093924b 100644
--- a/llvm/include/llvm/ADT/FloatingPointMode.h
+++ b/llvm/include/llvm/ADT/FloatingPointMode.h
@@ -47,6 +47,15 @@ enum class RoundingMode : int8_t {
Invalid = -1 ///< Denotes invalid value.
};
+inline bool isValidRoundingMode(int X) {
+ return X >= 0 && X <= static_cast<int>(RoundingMode::Dynamic);
+}
+
+inline RoundingMode castToRoundingMode(int X) {
+ assert(isValidRoundingMode(X));
+ return static_cast<RoundingMode>(X);
+}
+
/// Returns text representation of the given rounding mode.
inline StringRef spell(RoundingMode RM) {
switch (RM) {
diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h
index 97c3e4d7589d7b..8bd005d73fba36 100644
--- a/llvm/include/llvm/IR/AutoUpgrade.h
+++ b/llvm/include/llvm/IR/AutoUpgrade.h
@@ -107,6 +107,8 @@ namespace llvm {
/// Upgrade operand bundles (without knowing about their user instruction).
void UpgradeOperandBundles(std::vector<OperandBundleDef> &OperandBundles);
+ CallBase *upgradeConstrainedFunctionCall(CallBase *CB);
+
} // End llvm namespace
#endif
diff --git a/llvm/include/llvm/IR/FPEnv.h b/llvm/include/llvm/IR/FPEnv.h
index a0197377759daf..e4602bab6038e0 100644
--- a/llvm/include/llvm/IR/FPEnv.h
+++ b/llvm/include/llvm/IR/FPEnv.h
@@ -43,6 +43,15 @@ enum ExceptionBehavior : uint8_t {
}
+inline bool isValidExceptionBehavior(unsigned X) {
+ return X <= fp::ExceptionBehavior::ebStrict;
+}
+
+inline fp::ExceptionBehavior castToExceptionBehavior(unsigned X) {
+ assert(isValidExceptionBehavior(X));
+ return static_cast<fp::ExceptionBehavior>(X);
+}
+
/// Returns a valid RoundingMode enumerator when given a string
/// that is valid as input in constrained intrinsic rounding mode
/// metadata.
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 23fd8350a29b3d..ca732f4903ce44 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -357,6 +357,9 @@ class IRBuilderBase {
void setConstrainedFPCallAttr(CallBase *I) {
I->addFnAttr(Attribute::StrictFP);
+ MemoryEffects ME = MemoryEffects::inaccessibleMemOnly();
+ auto A = Attribute::getWithMemoryEffects(getContext(), ME);
+ I->addFnAttr(A);
}
void setDefaultOperandBundles(ArrayRef<OperandBundleDef> OpBundles) {
@@ -975,6 +978,16 @@ class IRBuilderBase {
Instruction *FMFSource = nullptr,
const Twine &Name = "");
+ /// Create a call to intrinsic \p ID with \p Args, mangled using \p Types and
+ /// with operand bundles.
+ /// If \p FMFSource is provided, copy fast-math-flags from that instruction to
+ /// the intrinsic.
+ CallInst *CreateIntrinsic(Intrinsic::ID ID, ArrayRef<Type *> Types,
+ ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> OpBundles,
+ Instruction *FMFSource = nullptr,
+ const Twine &Name = "");
+
/// Create a call to intrinsic \p ID with \p RetTy and \p Args. If
/// \p FMFSource is provided, copy fast-math-flags from that instruction to
/// the intrinsic.
@@ -1311,6 +1324,15 @@ class IRBuilderBase {
return I;
}
+ RoundingMode
+ getEffectiveRounding(std::optional<RoundingMode> Rounding = std::nullopt) {
+ RoundingMode RM = DefaultConstrainedRounding;
+
+ if (Rounding)
+ RM = *Rounding;
+ return RM;
+ }
+
Value *getConstrainedFPRounding(std::optional<RoundingMode> Rounding) {
RoundingMode UseRounding = DefaultConstrainedRounding;
@@ -1325,6 +1347,14 @@ class IRBuilderBase {
return MetadataAsValue::get(Context, RoundingMDS);
}
+ fp::ExceptionBehavior getEffectiveExceptionBehavior(
+ std::optional<fp::ExceptionBehavior> Except = std::nullopt) {
+ fp::ExceptionBehavior EB = DefaultConstrainedExcept;
+ if (Except)
+ EB = *Except;
+ return EB;
+ }
+
Value *getConstrainedFPExcept(std::optional<fp::ExceptionBehavior> Except) {
std::optional<StringRef> ExceptStr = convertExceptionBehaviorToStr(
Except.value_or(DefaultConstrainedExcept));
@@ -2475,6 +2505,10 @@ class IRBuilderBase {
Function *Callee, ArrayRef<Value *> Args, const Twine &Name = "",
std::optional<RoundingMode> Rounding = std::nullopt,
std::optional<fp::ExceptionBehavior> Except = std::nullopt);
+ CallInst *CreateConstrainedFPCall(
+ Intrinsic::ID ID, ArrayRef<Value *> Args, const Twine &Name = "",
+ std::optional<RoundingMode> Rounding = std::nullopt,
+ std::optional<fp::ExceptionBehavior> Except = std::nullopt);
Value *CreateSelect(Value *C, Value *True, Value *False,
const Twine &Name = "", Instruction *MDFrom = nullptr);
@@ -2671,6 +2705,20 @@ class IRBuilderBase {
CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue,
Value *Alignment,
Value *OffsetValue = nullptr);
+
+ void
+ createFPRoundingBundle(SmallVectorImpl<OperandBundleDef> &Bundles,
+ std::optional<RoundingMode> Rounding = std::nullopt) {
+ int RM = static_cast<int32_t>(getEffectiveRounding(Rounding));
+ Bundles.emplace_back("fpe.round", getInt32(RM));
+ }
+
+ void createFPExceptionBundle(
+ SmallVectorImpl<OperandBundleDef> &Bundles,
+ std::optional<fp::ExceptionBehavior> Except = std::nullopt) {
+ int EB = getEffectiveExceptionBehavior(Except);
+ Bundles.emplace_back("fpe.except", getInt32(EB));
+ }
};
/// This provides a uniform API for creating instructions and inserting
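As a usage sketch (not part of the patch): a front end that wants these bundles emitted can configure the builder's constrained-FP state and use the existing helpers; `InsertPt`, `L` and `R` below are placeholders.

  IRBuilder<> B(InsertPt);
  B.setIsFPConstrained(true);
  B.setDefaultConstrainedRounding(RoundingMode::Dynamic);
  B.setDefaultConstrainedExcept(fp::ebStrict);
  // With this patch, the call below carries the metadata arguments plus
  // the [ "fpe.round"(i32 7), "fpe.except"(i32 2) ] operand bundles.
  Value *Sum = B.CreateConstrainedFPBinOp(
      Intrinsic::experimental_constrained_fadd, L, R);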
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 4852f64d0977fb..f1e6672c90dc46 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -25,6 +25,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/FMF.h"
+#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
@@ -2139,6 +2140,12 @@ class CallBase : public Instruction {
return false;
}
+ /// Return rounding mode specified by operand bundles.
+ std::optional<RoundingMode> getRoundingMode() const;
+
+ /// Return exception behavior specified by operand bundles.
+ std::optional<fp::ExceptionBehavior> getExceptionBehavior() const;
+
/// Used to keep track of an operand bundle. See the main comment on
/// OperandBundleUser above.
struct BundleOpInfo {
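A sketch of how client code might read the bundles back through these accessors (hypothetical pass code; `I` is an instruction being visited and the `handle*` helpers are placeholders):

  if (const auto *CB = dyn_cast<CallBase>(&I)) {
    // Present only if the call carries an "fpe.round" bundle.
    if (std::optional<RoundingMode> RM = CB->getRoundingMode())
      handleRounding(*RM);
    // Present only if the call carries an "fpe.except" bundle.
    if (std::optional<fp::ExceptionBehavior> EB = CB->getExceptionBehavior())
      handleExcept(*EB);
  }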
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 4458126ffa759d..8e782897a5f2c5 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -723,8 +723,6 @@ class VPBinOpIntrinsic : public VPIntrinsic {
class ConstrainedFPIntrinsic : public IntrinsicInst {
public:
unsigned getNonMetadataArgCount() const;
- std::optional<RoundingMode> getRoundingMode() const;
- std::optional<fp::ExceptionBehavior> getExceptionBehavior() const;
bool isDefaultFPEnvironment() const;
// Methods for support type inquiry through isa, cast, and dyn_cast:
diff --git a/llvm/include/llvm/IR/LLVMContext.h b/llvm/include/llvm/IR/LLVMContext.h
index 6ffa2bdaa319a7..fd5779d1996c61 100644
--- a/llvm/include/llvm/IR/LLVMContext.h
+++ b/llvm/include/llvm/IR/LLVMContext.h
@@ -96,6 +96,8 @@ class LLVMContext {
OB_ptrauth = 7, // "ptrauth"
OB_kcfi = 8, // "kcfi"
OB_convergencectrl = 9, // "convergencectrl"
+ OB_fpe_round = 10, // "fpe.round"
+ OB_fpe_except = 11, // "fpe.except"
};
/// getMDKindID - Return a unique non-zero ID for the specified metadata kind.
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 08b917fdb260aa..094ac56651af58 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -6306,6 +6306,50 @@ bool isOldDbgFormatIntrinsic(StringRef Name) {
FnID == Intrinsic::dbg_assign;
}
+bool updateConstrainedIntrinsic(StringRef Name, ArrayRef<Value *> Args,
+ SmallVectorImpl<OperandBundleDef> &Bundles,
+ LLVMContext &C) {
+ if (Args.empty())
+ return false;
+ if (!Name.starts_with("llvm.experimental.constrained."))
+ return false;
+ for (auto &B : Bundles) {
+ if (B.getTag().starts_with("fpe."))
+ return false;
+ }
+
+ const auto getMetadataArgumentValue = [](Value *Arg) -> StringRef {
+ if (auto *MAV = dyn_cast<MetadataAsValue>(Arg)) {
+ if (const auto *MD = MAV->getMetadata()) {
+ if (auto MDStr = dyn_cast<MDString>(MD))
+ return MDStr->getString();
+ }
+ }
+ return StringRef();
+ };
+
+ if (Args.size() > 1) {
+ Value *V = Args.take_back(2).front();
+ if (StringRef VStr = getMetadataArgumentValue(V); !VStr.empty()) {
+ if (auto RM = convertStrToRoundingMode(VStr)) {
+ int RMVal = static_cast<int>(*RM);
+ Bundles.emplace_back("fpe.round",
+ ConstantInt::get(Type::getInt32Ty(C), RMVal));
+ }
+ }
+ }
+
+ Value *V = Args.back();
+ if (StringRef VStr = getMetadataArgumentValue(V); !VStr.empty()) {
+ if (auto EB = convertStrToExceptionBehavior(VStr)) {
+ Bundles.emplace_back("fpe.except",
+ ConstantInt::get(Type::getInt32Ty(C), *EB));
+ }
+ }
+
+ return true;
+}
+
/// FunctionHeader
/// ::= OptionalLinkage OptionalPreemptionSpecifier OptionalVisibility
/// OptionalCallingConv OptRetAttrs OptUnnamedAddr Type GlobalName
@@ -8067,6 +8111,8 @@ bool LLParser::parseCall(Instruction *&Inst, PerFunctionState &PFS,
AttributeList::get(Context, AttributeSet::get(Context, FnAttrs),
AttributeSet::get(Context, RetAttrs), Attrs);
+ updateConstrainedIntrinsic(CalleeID.StrVal, Args, BundleList, Context);
+
CallInst *CI = CallInst::Create(Ty, Callee, Args, BundleList);
CI->setTailCallKind(TCK);
CI->setCallingConv(CC);
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 3390d651d6c693..269e919f38afff 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -4282,6 +4282,64 @@ static void upgradeDbgIntrinsicToDbgRecord(StringRef Name, CallBase *CI) {
CI->getParent()->insertDbgRecordBefore(DR, CI->getIterator());
}
+static CallBase *upgradeConstrainedIntrinsicCall(CallBase *CB, Function *F,
+ IRBuilder<> &Builder) {
+ if (CB->getOperandBundle(LLVMContext::OB_fpe_round))
+ return nullptr;
+
+ auto *CFPI = cast<ConstrainedFPIntrinsic>(F);
+ SmallVector<OperandBundleDef, 2> NewBundles;
+ LLVMContext &C = CB->getContext();
+
+ auto RM = CFPI->getRoundingMode();
+ if (RM) {
+ auto CurrentRM = CB->getRoundingMode();
+ if (CurrentRM) {
+ assert(*RM == *CurrentRM);
+ } else {
+ int RMValue = static_cast<int>(*RM);
+ NewBundles.emplace_back("fpe.round",
+ ConstantInt::get(Type::getInt32Ty(C), RMValue));
+ }
+ }
+
+ auto EB = CFPI->getExceptionBehavior();
+ if (EB) {
+ auto CurrentEB = CB->getExceptionBehavior();
+ if (CurrentEB) {
+ assert(*EB == *CurrentEB);
+ } else {
+ NewBundles.emplace_back("fpe.except",
+ ConstantInt::get(Type::getInt32Ty(C), *EB));
+ }
+ }
+
+ CallInst *NewCB = nullptr;
+ if (!NewBundles.empty()) {
+ SmallVector<Value *, 4> Args(CB->args());
+ SmallVector<OperandBundleDef, 2> Bundles;
+ CB->getOperandBundlesAsDefs(Bundles);
+ Bundles.append(NewBundles);
+
+ Builder.SetInsertPoint(CB->getParent(), CB->getIterator());
+ MDNode *FPMath = CB->getMetadata(LLVMContext::MD_fpmath);
+ NewCB = Builder.CreateCall(F, Args, Bundles, CB->getName(), FPMath);
+
+ NewCB->copyMetadata(*CB);
+ AttributeList Attrs = CB->getAttributes();
+ NewCB->setAttributes(Attrs);
+ if (isa<FPMathOperator>(CB)) {
+ FastMathFlags FMF = CB->getFastMathFlags();
+ NewCB->setFastMathFlags(FMF);
+ }
+ MemoryEffects ME = MemoryEffects::inaccessibleMemOnly();
+ auto A = Attribute::getWithMemoryEffects(C, ME);
+ NewCB->addFnAttr(A);
+ }
+
+ return NewCB;
+}
+
/// Upgrade a call to an old intrinsic. All argument and return casting must be
/// provided to seamlessly integrate with existing context.
void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
@@ -4309,6 +4367,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
bool IsARM = Name.consume_front("arm.");
bool IsAMDGCN = Name.consume_front("amdgcn.");
bool IsDbg = Name.consume_front("dbg.");
+ bool IsConstrained = Name.starts_with("experimental.constrained.");
Value *Rep = nullptr;
if (!IsX86 && Name == "stackprotectorcheck") {
@@ -4335,6 +4394,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
} else {
upgradeDbgIntrinsicToDbgRecord(Name, CI);
}
+ } else if (IsConstrained) {
+ Rep = upgradeConstrainedIntrinsicCall(CI, F, Builder);
} else {
llvm_unreachable("Unknown function for CallBase upgrade.");
}
@@ -5577,3 +5638,20 @@ void llvm::UpgradeOperandBundles(std::vector<OperandBundleDef> &Bundles) {
OBD.inputs().empty();
});
}
+
+CallBase *llvm::upgradeConstrainedFunctionCall(CallBase *CB) {
+ Function *F = dyn_cast<Function>(CB->getCalledOperand());
+ if (!F)
+ return nullptr;
+
+ if (CB->getNumOperands() < 1)
+ return nullptr;
+
+ StringRef Name = F->getName();
+  if (!Name.starts_with("llvm.experimental.constrained."))
+ return nullptr;
+
+ LLVMContext &C = CB->getContext();
+ IRBuilder<> Builder(C);
+ return upgradeConstrainedIntrinsicCall(CB, F, Builder);
+}
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 8bf695e835c368..a6b979cb2a66d0 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -940,6 +940,17 @@ CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID,
return createCallHelper(Fn, Args, Name, FMFSource);
}
+CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID,
+ ArrayRef<Type *> Types,
+ ArrayRef<Value *> Args,
+ ArrayRef<OperandBundleDef> OpBundles,
+ Instruction *FMFSource,
+ const Twine &Name) {
+ Module *M = BB->getModule();
+ Function *Fn = Intrinsic::getDeclaration(M, ID, Types);
+ return createCallHelper(Fn, Args, Name, FMFSource, OpBundles);
+}
+
CallInst *IRBuilderBase::CreateIntrinsic(Type *RetTy, Intrinsic::ID ID,
ArrayRef<Value *> Args,
Instruction *FMFSource,
@@ -979,8 +990,11 @@ CallInst *IRBuilderBase::CreateConstrainedFPBinOp(
if (FMFSource)
UseFMF = FMFSource->getFastMathFlags();
- CallInst *C = CreateIntrinsic(ID, {L->getType()},
- {L, R, RoundingV, ExceptV}, nullptr, Name);
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ createFPRoundingBundle(OpBundles, Rounding);
+ createFPExceptionBundle(OpBundles, Except);
+ CallInst *C = CreateIntrinsic(ID, {L->getType()}, {L, R, RoundingV, ExceptV},
+ OpBundles, nullptr, Name);
setConstrainedFPCallAttr(C);
setFPAttrs(C, FPMathTag, UseFMF);
return C;
@@ -996,8 +1010,12 @@ CallInst *IRBuilderBase::CreateConstrainedFPUnroundedBinOp(
if (FMFSource)
UseFMF = FMFSource->getFastMathFlags();
- CallInst *C =
- CreateIntrinsic(ID, {L->getType()}, {L, R, ExceptV}, nullptr, Name);
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ int EB = getEffectiveExceptionBehavior(Except);
+ OpBundles.emplace_back("fpe.except", getInt32(EB));
+
+ CallInst *C = CreateIntrinsic(ID, {L->getType()}, {L, R, ExceptV}, OpBundles,
+ nullptr, Name);
setConstrainedFPCallAttr(C);
setFPAttrs(C, FPMathTag, UseFMF);
return C;
@@ -1024,19 +1042,24 @@ CallInst *IRBuilderBase::CreateConstrainedFPCast(
std::optional<RoundingMode> Rounding,
std::optional<fp::ExceptionBehavior> Except) {
Value *ExceptV = getConstrainedFPExcept(Except);
+ bool HasRounding = Intrinsic::hasConstrainedFPRoundingModeOperand(ID);
FastMathFlags UseFMF = FMF;
if (FMFSource)
UseFMF = FMFSource->getFastMathFlags();
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ createFPRoundingBundle(OpBundles, Rounding);
+ createFPExceptionBundle(OpBundles, Except);
+
CallInst *C;
- if (Intrinsic::hasConstrainedFPRoundingModeOperand(ID)) {
+ if (HasRounding) {
Value *RoundingV = getConstrainedFPRounding(Rounding);
C = CreateIntrinsic(ID, {DestTy, V->getType()}, {V, RoundingV, ExceptV},
- nullptr, Name);
+ OpBundles, nullptr, Name);
} else
- C = CreateIntrinsic(ID, {DestTy, V->getType()}, {V, ExceptV}, nullptr,
- Name);
+ C = CreateIntrinsic(ID, {DestTy, V->getType()}, {V, ExceptV}, OpBundles,
+ nullptr, Name);
setConstrainedFPCallAttr(C);
@@ -1065,8 +1088,12 @@ CallInst *IRBuilderBase::CreateConstrainedFPCmp(
Value *PredicateV = getConstrainedFPPredicate(P);
Value *ExceptV = getConstrainedFPExcept(Except);
- CallInst *C = CreateIntrinsic(ID, {L->getType()},
- {L, R, PredicateV, ExceptV}, nullptr, Name);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ int EB = getEffectiveExceptionBehavior(Except);
+ OpBundles.emplace_back("fpe.except", getInt32(EB));
+
+ CallInst *C = CreateIntrinsic(ID, {L->getType()}, {L, R, PredicateV, ExceptV},
+ OpBundles, nullptr, Name);
setConstrainedFPCallAttr(C);
return C;
}
@@ -1076,14 +1103,19 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall(
std::optional<RoundingMode> Rounding,
std::optional<fp::ExceptionBehavior> Except) {
llvm::SmallVector<Value *, 6> UseArgs;
+ SmallVector<OperandBundleDef, 2> OpBundles;
append_range(UseArgs, Args);
- if (Intrinsic::hasConstrainedFPRoundingModeOperand(Callee->getIntrinsicID()))
+ if (Intrinsic::hasConstrainedFPRoundingModeOperand(
+ Callee->getIntrinsicID())) {
UseArgs.push_back(getConstrainedFPRounding(Rounding));
+ createFPRoundingBundle(OpBundles, Rounding);
+ }
UseArgs.push_back(getConstrainedFPExcept(Except));
+ createFPExceptionBundle(OpBundles, Except);
- CallInst *C = CreateCall(Callee, UseArgs, Name);
+ CallInst *C = CreateCall(Callee, UseArgs, OpBundles, Name);
setConstrainedFPCallAttr(C);
return C;
}
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index e95b98a6404432..7fc33ea156ef82 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -602,6 +602,24 @@ bool CallBase::hasClobberingOperandBundles() const {
getIntrinsicID() != Intrinsic::assume;
}
+std::optional<RoundingMode> CallBase::getRoundingMode() const {
+ if (auto RoundingBundle = getOperandBundle(LLVMContext::OB_fpe_round)) {
+ uint64_t RM =
+ cast<ConstantInt>(RoundingBundle->Inputs.front())->getSExtValue();
+ return castToRoundingMode(RM);
+ }
+ return std::nullopt;
+}
+
+std::optional<fp::ExceptionBehavior> CallBase::getExceptionBehavior() const {
+ if (auto ExceptionBundle = getOperandBundle(LLVMContext::OB_fpe_except)) {
+ uint64_t EB =
+ cast<ConstantInt>(ExceptionBundle->Inputs.front())->getZExtValue();
+ return castToExceptionBehavior(EB);
+ }
+ return std::nullopt;
+}
+
MemoryEffects CallBase::getMemoryEffects() const {
MemoryEffects ME = getAttributes().getMemoryEffects();
if (auto *Fn = dyn_cast<Function>(getCalledOperand())) {
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 7ed82c2ece464a..49bf41404e0acd 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -310,29 +310,6 @@ void InstrProfCallsite::setCallee(Value *Callee) {
setArgOperand(4, Callee);
}
-std::optional<RoundingMode> ConstrainedFPIntrinsic::getRoundingMode() const {
- unsigned NumOperands = arg_size();
- Metadata *MD = nullptr;
- auto *MAV = dyn_cast<MetadataAsValue>(getArgOperand(NumOperands - 2));
- if (MAV)
- MD = MAV->getMetadata();
- if (!MD || !isa<MDString>(MD))
- return std::nullopt;
- return convertStrToRoundingMode(cast<MDString>(MD)->getString());
-}
-
-std::optional<fp::ExceptionBehavior>
-ConstrainedFPIntrinsic::getExceptionBehavior() const {
- unsigned NumOperands = arg_size();
- Metadata *MD = nullptr;
- auto *MAV = dyn_cast<MetadataAsValue>(getArgOperand(NumOperands - 1));
- if (MAV)
- MD = MAV->getMetadata();
- if (!MD || !isa<MDString>(MD))
- return std::nullopt;
- return convertStrToExceptionBehavior(cast<MDString>(MD)->getString());
-}
-
bool ConstrainedFPIntrinsic::isDefaultFPEnvironment() const {
std::optional<fp::ExceptionBehavior> Except = getExceptionBehavior();
if (Except) {
diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp
index c0fee93a233808..446de64ed7f2d4 100644
--- a/llvm/lib/IR/LLVMContext.cpp
+++ b/llvm/lib/IR/LLVMContext.cpp
@@ -97,6 +97,16 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
"convergencectrl operand bundle id drifted!");
(void)ConvergenceCtrlEntry;
+ auto *RoundingEntry = pImpl->getOrInsertBundleTag("fpe.round");
+ assert(RoundingEntry->second == LLVMContext::OB_fpe_round &&
+ "fpe.round operand bundle id drifted!");
+ (void)RoundingEntry;
+
+ auto *ExceptionEntry = pImpl->getOrInsertBundleTag("fpe.except");
+ assert(ExceptionEntry->second == LLVMContext::OB_fpe_except &&
+ "fpe.except operand bundle id drifted!");
+ (void)ExceptionEntry;
+
SyncScope::ID SingleThreadSSID =
pImpl->getOrInsertSyncScopeID("singlethread");
assert(SingleThreadSSID == SyncScope::SingleThread &&
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 1cd5eb36c4ab69..100073ba8b3eb1 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -651,6 +651,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// Verify the llvm.experimental.noalias.scope.decl declarations
void verifyNoAliasScopeDecl();
+
+  /// Verify a call to a constrained intrinsic.
+  void verifyConstrainedIntrinsicCall(const CallBase &CB);
};
} // end anonymous namespace
@@ -3694,7 +3697,9 @@ void Verifier::visitCallBase(CallBase &Call) {
FoundGCTransitionBundle = false, FoundCFGuardTargetBundle = false,
FoundPreallocatedBundle = false, FoundGCLiveBundle = false,
FoundPtrauthBundle = false, FoundKCFIBundle = false,
- FoundAttachedCallBundle = false;
+ FoundAttachedCallBundle = false, FoundFpeRoundBundle = false,
+ FoundFpeExceptBundle = false;
+
for (unsigned i = 0, e = Call.getNumOperandBundles(); i < e; ++i) {
OperandBundleUse BU = Call.getOperandBundleAt(i);
uint32_t Tag = BU.getTagID();
@@ -3757,9 +3762,31 @@ void Verifier::visitCallBase(CallBase &Call) {
"Multiple \"clang.arc.attachedcall\" operand bundles", Call);
FoundAttachedCallBundle = true;
verifyAttachedCallBundle(Call, BU);
+ } else if (Tag == LLVMContext::OB_fpe_round) {
+ Check(!FoundFpeRoundBundle, "Multiple fpe.round operand bundles", Call);
+ Check(BU.Inputs.size() == 1,
+ "Expected exactly one fpe.round bundle operand", Call);
+ auto RM = dyn_cast<ConstantInt>(BU.Inputs.front());
+ Check(RM, "Value of fpe.round bundle operand must be an integer", Call);
+ Check(isValidRoundingMode(RM->getSExtValue()),
+ "Invalid value of fpe.round bundle operand", Call);
+ FoundFpeRoundBundle = true;
+ } else if (Tag == LLVMContext::OB_fpe_except) {
+ Check(!FoundFpeExceptBundle, "Multiple fpe.except operand bundles", Call);
+ Check(BU.Inputs.size() == 1,
+ "Expected exactly one fpe.except bundle operand", Call);
+ auto EB = dyn_cast<ConstantInt>(BU.Inputs.front());
+ Check(EB, "Value of fpe.except bundle operand must be an integer", Call);
+ Check(isValidExceptionBehavior(EB->getZExtValue()),
+ "Invalid value of fpe.except bundle operand", Call);
+ FoundFpeExceptBundle = true;
}
}
+  // Verify that FP options specified in constrained intrinsic arguments agree
+  // with the options specified in operand bundles.
+  verifyConstrainedIntrinsicCall(Call);
+
// Verify that callee and callsite agree on whether to use pointer auth.
Check(!(Call.getCalledFunction() && FoundPtrauthBundle),
"Direct call cannot have a ptrauth bundle", Call);
@@ -3786,6 +3813,47 @@ void Verifier::visitCallBase(CallBase &Call) {
visitInstruction(Call);
}
+void Verifier::verifyConstrainedIntrinsicCall(const CallBase &CB) {
+ const auto *CFPI = dyn_cast<ConstrainedFPIntrinsic>(&CB);
+ if (!CFPI)
+ return;
+
+ // FP metadata arguments must not conflict with the corresponding
+ // operand bundles.
+ if (std::optional<RoundingMode> RM = CFPI->getRoundingMode()) {
+ RoundingMode Rounding = *RM;
+ auto RoundingBundle = CB.getOperandBundle(LLVMContext::OB_fpe_round);
+ Check(RoundingBundle,
+ "Constrained intrinsic has a rounding argument but the call does not",
+ CB);
+ if (RoundingBundle) {
+ OperandBundleUse OBU = *RoundingBundle;
+ uint64_t BundleRM = cast<ConstantInt>(OBU.Inputs.front())->getZExtValue();
+ Check(BundleRM == static_cast<uint64_t>(Rounding),
+ "Rounding mode of the constrained intrinsic differs from that in "
+ "operand bundle",
+ CB);
+ }
+ }
+
+ if (std::optional<fp::ExceptionBehavior> EB = CFPI->getExceptionBehavior()) {
+ fp::ExceptionBehavior Excepts = *EB;
+ auto ExceptionBundle = CB.getOperandBundle(LLVMContext::OB_fpe_except);
+ Check(ExceptionBundle,
+ "Constrained intrinsic has an exception handling argument but the "
+ "call does not",
+ CB);
+ if (ExceptionBundle) {
+ OperandBundleUse OBU = *ExceptionBundle;
+ uint64_t BundleEB = cast<ConstantInt>(OBU.Inputs.front())->getZExtValue();
+ Check(BundleEB == static_cast<uint64_t>(Excepts),
+ "Exception behavior of the constrained intrinsic differs from that "
+ "in operand bundle",
+ CB);
+ }
+ }
+}
+
void Verifier::verifyTailCCMustTailAttrs(const AttrBuilder &Attrs,
StringRef Context) {
Check(!Attrs.contains(Attribute::InAlloca),
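For instance, with these checks the verifier rejects IR where the metadata argument and the bundle disagree, as in this sketch, where round.tonearest (encoded as 1) conflicts with the bundle value 7 (Dynamic):

  ; error: Rounding mode of the constrained intrinsic differs from that
  ;        in operand bundle
  %r = call double @llvm.experimental.constrained.fadd.f64(
           double %a, double %b,
           metadata !"round.tonearest", metadata !"fpexcept.strict")
           [ "fpe.round"(i32 7), "fpe.except"(i32 2) ]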
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 53e486f3dc6cda..478691d031150f 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -251,10 +251,12 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
// Special-case operand bundles "clang.arc.attachedcall", "ptrauth", and
// "kcfi".
- bool IsNoTail = CI->isNoTailCall() ||
- CI->hasOperandBundlesOtherThan(
- {LLVMContext::OB_clang_arc_attachedcall,
- LLVMContext::OB_ptrauth, LLVMContext::OB_kcfi});
+ bool IsNoTail =
+ CI->isNoTailCall() ||
+ CI->hasOperandBundlesOtherThan(
+ {LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_ptrauth,
+ LLVMContext::OB_kcfi, LLVMContext::OB_fpe_round,
+ LLVMContext::OB_fpe_except});
if (!IsNoTail && CI->doesNotAccessMemory()) {
// A call to a readnone function whose arguments are all things computed
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index dc9ca1423f3e79..3a704b1aaefe9c 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -443,13 +443,23 @@ PruningFunctionCloner::cloneInstruction(BasicBlock::const_iterator II) {
// The last arguments of a constrained intrinsic are metadata that
// represent rounding mode (absent in some intrinsics) and exception
// behavior. The inlined function uses default settings.
- if (Intrinsic::hasConstrainedFPRoundingModeOperand(CIID))
+ SmallVector<OperandBundleDef, 2> Bundles;
+ if (Intrinsic::hasConstrainedFPRoundingModeOperand(CIID)) {
Args.push_back(
MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.tonearest")));
+ Bundles.emplace_back(
+ "fpe.round",
+ ConstantInt::get(
+ Type::getInt32Ty(Ctx),
+ static_cast<int>(RoundingMode::NearestTiesToEven)));
+ }
Args.push_back(
MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore")));
-
- NewInst = CallInst::Create(IFn, Args, OldInst.getName() + ".strict");
+ Bundles.emplace_back("fpe.except",
+ ConstantInt::get(Type::getInt32Ty(Ctx),
+ fp::ExceptionBehavior::ebIgnore));
+ NewInst =
+ CallInst::Create(IFn, Args, Bundles, OldInst.getName() + ".strict");
}
}
if (!NewInst)
diff --git a/llvm/test/Bitcode/operand-bundles-bc-analyzer.ll b/llvm/test/Bitcode/operand-bundles-bc-analyzer.ll
index d860104b9cb3d9..01e5b3f6673ae5 100644
--- a/llvm/test/Bitcode/operand-bundles-bc-analyzer.ll
+++ b/llvm/test/Bitcode/operand-bundles-bc-analyzer.ll
@@ -13,6 +13,8 @@
; CHECK-NEXT: <OPERAND_BUNDLE_TAG
; CHECK-NEXT: <OPERAND_BUNDLE_TAG
; CHECK-NEXT: <OPERAND_BUNDLE_TAG
+; CHECK-NEXT: <OPERAND_BUNDLE_TAG
+; CHECK-NEXT: <OPERAND_BUNDLE_TAG
; CHECK-NEXT: </OPERAND_BUNDLE_TAGS_BLOCK
; CHECK: <FUNCTION_BLOCK
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
index bd4b86f0387666..edc3300ddffb66 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
@@ -819,11 +819,11 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 {
; CHECK-LABEL: define float @test_pown_fast_f32_strictfp
; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) #[[ATTR0]]
-; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) #[[ATTR0]]
-; CHECK-NEXT: [[POWNI2F:%.*]] = call fast float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[Y]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]]
-; CHECK-NEXT: [[__YLOGX:%.*]] = call fast float @llvm.experimental.constrained.fmul.f32(float [[POWNI2F]], float [[__LOG2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]]
-; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) #[[ATTR0]]
+; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) #[[ATTR5]]
+; CHECK-NEXT: [[POWNI2F:%.*]] = call fast float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[Y]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR5]] [ "fpe.round"(i32 7), "fpe.except"(i32 2) ]
+; CHECK-NEXT: [[__YLOGX:%.*]] = call fast float @llvm.experimental.constrained.fmul.f32(float [[POWNI2F]], float [[__LOG2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR5]] [ "fpe.round"(i32 7), "fpe.except"(i32 2) ]
+; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) #[[ATTR5]]
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]