[llvm] 8290134 - [X86] EmitX86BuiltinExpr - attempt to convert SSE41/AVX1 roundps/d/ss/sd builtins to regular rounding modes (#171227)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 23 05:44:26 PST 2025
Author: Gergo Stomfai
Date: 2025-12-23T13:44:21Z
New Revision: 8290134a7421c5a5ce3348b29f908887158b4b8a
URL: https://github.com/llvm/llvm-project/commit/8290134a7421c5a5ce3348b29f908887158b4b8a
DIFF: https://github.com/llvm/llvm-project/commit/8290134a7421c5a5ce3348b29f908887158b4b8a.diff
LOG: [X86] EmitX86BuiltinExpr - attempt to convert SSE41/AVX1 roundps/d/ss/sd builtins to regular rounding modes (#171227)
Adding clauses to `CodeGenFunction::EmitX86BuiltinExpr` to convert the
SSE4.1/AVX1 builtins `roundps/pd/ss/sd` to regular rounding modes.
We use:
1. `roundeven/floor/ceil/trunc` when neither MXCSR nor _MM_FROUND_RAISE_EXC is used and the FP mode is not strict,
2. `experimental_constrained_roundeven/floor/ceil/trunc` when neither MXCSR nor _MM_FROUND_RAISE_EXC is used and the FP mode is strict,
3. `x86_sse41/avx_round_ps/pd/ss/sd` when MXCSR or _MM_FROUND_RAISE_EXC is used.
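As a quick illustration (not part of this commit; the helper names below
are hypothetical), the rounding-control immediate decides which form is
emitted. The standard _MM_FROUND_* macros from the intrinsics headers map
directly onto the bits the new code checks:

    #include <immintrin.h>

    /* Bits 0-1 pick the rounding mode; bit 2 (_MM_FROUND_CUR_DIRECTION)
       selects MXCSR-based rounding; bit 3 (_MM_FROUND_NO_EXC) suppresses
       precision exceptions. */

    __m128 round_floor(__m128 x) {
      /* 0b1001: no MXCSR, exceptions suppressed, round toward -inf
         -> now emits @llvm.floor.v4f32 (or its constrained variant
            under strict FP). */
      return _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    }

    __m128 round_cur_direction(__m128 x) {
      /* 0b1100: MXCSR bit set -> keeps @llvm.x86.sse41.round.ps. */
      return _mm_round_ps(x, _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
    }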
Closes #170273
Added:
clang/test/CodeGen/X86/sse41-builtins-constrained.c
Modified:
clang/lib/CodeGen/TargetBuiltins/X86.cpp
clang/test/CodeGen/X86/avx-builtins-constrained.c
clang/test/CodeGen/X86/avx-builtins.c
clang/test/CodeGen/X86/sse41-builtins.c
llvm/include/llvm/IR/IntrinsicsX86.td
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index be2b7d442645e..685040fc4524f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -75,6 +75,62 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
return MaskVec;
}
+/// Emit rounding of the value \p X according to the rounding mode
+/// encoded in bits 0 and 1 of \p RoundingControl.
+static Value *emitX86RoundImmediate(CodeGenFunction &CGF, Value *X,
+ unsigned RoundingControl) {
+ unsigned RoundingMask = 0b11;
+ unsigned RoundingMode = RoundingControl & RoundingMask;
+
+ Intrinsic::ID ID = Intrinsic::not_intrinsic;
+ LLVMContext &Ctx = CGF.CGM.getLLVMContext();
+ if (CGF.Builder.getIsFPConstrained()) {
+
+ Value *ExceptMode =
+ MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore"));
+
+ switch (RoundingMode) {
+ case 0b00:
+ ID = Intrinsic::experimental_constrained_roundeven;
+ break;
+ case 0b01:
+ ID = Intrinsic::experimental_constrained_floor;
+ break;
+ case 0b10:
+ ID = Intrinsic::experimental_constrained_ceil;
+ break;
+ case 0b11:
+ ID = Intrinsic::experimental_constrained_trunc;
+ break;
+ default:
+ llvm_unreachable("Invalid rounding mode");
+ }
+
+ Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
+ return CGF.Builder.CreateCall(F, {X, ExceptMode});
+ }
+
+ switch (RoundingMode) {
+ case 0b00:
+ ID = Intrinsic::roundeven;
+ break;
+ case 0b01:
+ ID = Intrinsic::floor;
+ break;
+ case 0b10:
+ ID = Intrinsic::ceil;
+ break;
+ case 0b11:
+ ID = Intrinsic::trunc;
+ break;
+ default:
+ llvm_unreachable("Invalid rounding mode");
+ }
+
+ Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
+ return CGF.Builder.CreateCall(F, {X});
+}
+
static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
Align Alignment) {
Value *Ptr = Ops[0];
@@ -840,6 +896,76 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Ops[0]);
return Builder.CreateExtractValue(Call, 0);
}
+ case X86::BI__builtin_ia32_roundps:
+ case X86::BI__builtin_ia32_roundpd:
+ case X86::BI__builtin_ia32_roundps256:
+ case X86::BI__builtin_ia32_roundpd256: {
+ unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue();
+ unsigned MXCSRMask = 0b100;
+ unsigned FRoundNoExcMask = 0b1000;
+ unsigned UseMXCSR = MXCSRMask & M;
+ unsigned FRoundNoExc = FRoundNoExcMask & M;
+
+ if (UseMXCSR || !FRoundNoExc) {
+
+ Intrinsic::ID ID = Intrinsic::not_intrinsic;
+
+ switch (BuiltinID) {
+ case X86::BI__builtin_ia32_roundps:
+ ID = Intrinsic::x86_sse41_round_ps;
+ break;
+ case X86::BI__builtin_ia32_roundps256:
+ ID = Intrinsic::x86_avx_round_ps_256;
+ break;
+ case X86::BI__builtin_ia32_roundpd:
+ ID = Intrinsic::x86_sse41_round_pd;
+ break;
+ case X86::BI__builtin_ia32_roundpd256:
+ ID = Intrinsic::x86_avx_round_pd_256;
+ break;
+ default:
+ llvm_unreachable("must return from switch");
+ }
+
+ Function *F = CGM.getIntrinsic(ID);
+ return Builder.CreateCall(F, Ops);
+ }
+
+ return emitX86RoundImmediate(*this, Ops[0], M);
+ }
+ case X86::BI__builtin_ia32_roundss:
+ case X86::BI__builtin_ia32_roundsd: {
+ unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue();
+ unsigned MXCSRMask = 0b100;
+ unsigned FRoundNoExcMask = 0b1000;
+ unsigned UseMXCSR = MXCSRMask & M;
+ unsigned FRoundNoExc = FRoundNoExcMask & M;
+
+ if (UseMXCSR || !FRoundNoExc) {
+
+ Intrinsic::ID ID = Intrinsic::not_intrinsic;
+
+ switch (BuiltinID) {
+ case X86::BI__builtin_ia32_roundss:
+ ID = Intrinsic::x86_sse41_round_ss;
+ break;
+ case X86::BI__builtin_ia32_roundsd:
+ ID = Intrinsic::x86_sse41_round_sd;
+ break;
+ default:
+ llvm_unreachable("must return from switch");
+ }
+
+ Function *F = CGM.getIntrinsic(ID);
+ return Builder.CreateCall(F, Ops);
+ }
+
+ Value *Idx = Builder.getInt32(0);
+ Value *ValAt0 = Builder.CreateExtractElement(Ops[1], Idx);
+ Value *RoundedAt0 = emitX86RoundImmediate(*this, ValAt0, M);
+
+ return Builder.CreateInsertElement(Ops[0], RoundedAt0, Idx);
+ }
case X86::BI__builtin_ia32_lzcnt_u16:
case X86::BI__builtin_ia32_lzcnt_u32:
case X86::BI__builtin_ia32_lzcnt_u64: {
diff --git a/clang/test/CodeGen/X86/avx-builtins-constrained.c b/clang/test/CodeGen/X86/avx-builtins-constrained.c
index 428febeb1d293..357b6e1c66339 100644
--- a/clang/test/CodeGen/X86/avx-builtins-constrained.c
+++ b/clang/test/CodeGen/X86/avx-builtins-constrained.c
@@ -32,4 +32,40 @@ __m256d test_mm256_sqrt_pd(__m256d x) {
// CONSTRAINED: call {{.*}}<4 x double> @llvm.experimental.constrained.sqrt.v4f64(<4 x double> {{.*}}, metadata !{{.*}})
// CHECK-ASM: vsqrtpd %ymm{{.*}},
return _mm256_sqrt_pd(x);
-}
\ No newline at end of file
+}
+
+__m256d test_mm256_round_pd_mxcsr(__m256d x) {
+ // CONSTRAINED-LABEL: test_mm256_round_pd_mxcsr
+ // CONSTRAINED: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 12)
+ return _mm256_round_pd(x, 0b1100);
+}
+
+__m256d test_mm256_round_pd_fround_no_exc(__m256d x) {
+ // CONSTRAINED-LABEL: test_mm256_round_pd_fround_no_exc
+ // CONSTRAINED: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 0)
+ return _mm256_round_pd(x, 0b0000);
+}
+
+__m256d test_mm256_round_pd_trunc(__m256d x) {
+ // CONSTRAINED-LABEL: test_mm256_round_pd_trunc
+ // CONSTRAINED: %{{.*}} = call <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.ignore")
+ return _mm256_round_pd(x, 0b1011);
+}
+
+__m256 test_mm256_round_ps_mxcsr(__m256 x) {
+ // CONSTRAINED-LABEL: test_mm256_round_ps_mxcsr
+ // CONSTRAINED: %{{.*}} = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 12)
+ return _mm256_round_ps(x, 0b1100);
+}
+
+__m256 test_mm256_round_ps_fround_no_exc(__m256 x) {
+ // CONSTRAINED-LABEL: test_mm256_round_ps_fround_no_exc
+ // CONSTRAINED: %{{.*}} = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 0)
+ return _mm256_round_ps(x, 0b0000);
+}
+
+__m256 test_mm256_round_ps_trunc(__m256 x) {
+ // CONSTRAINED-LABEL: test_mm256_round_ps_trunc
+ // CONSTRAINED: %{{.*}} = call <8 x float> @llvm.experimental.constrained.trunc.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.ignore")
+ return _mm256_round_ps(x, 0b1011);
+}
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index d92869ff6574f..d2f8740cffbbd 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -246,7 +246,7 @@ TEST_CONSTEXPR(match_m128i(_mm256_castsi256_si128((__m256i)(__v4du){0xBFF0000000
__m256d test_mm256_ceil_pd(__m256d x) {
// CHECK-LABEL: test_mm256_ceil_pd
- // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2)
return _mm256_ceil_pd(x);
}
@@ -1526,14 +1526,38 @@ __m256 test_mm256_rcp_ps(__m256 A) {
__m256d test_mm256_round_pd(__m256d x) {
// CHECK-LABEL: test_mm256_round_pd
- // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 4)
- return _mm256_round_pd(x, 4);
+ // CHECK: %{{.*}} = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %{{.*}})
+ return _mm256_round_pd(x, 0b1000);
+}
+
+__m256d test_mm256_round_pd_mxcsr(__m256d x) {
+ // CHECK-LABEL: test_mm256_round_pd_mxcsr
+ // CHECK: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 12)
+ return _mm256_round_pd(x, 0b1100);
+}
+
+__m256d test_mm256_round_pd_fround_no_exc(__m256d x) {
+ // CHECK-LABEL: test_mm256_round_pd_fround_no_exc
+ // CHECK: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 0)
+ return _mm256_round_pd(x, 0b0000);
}
__m256 test_mm256_round_ps(__m256 x) {
// CHECK-LABEL: test_mm256_round_ps
- // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 4)
- return _mm256_round_ps(x, 4);
+ // CHECK: %{{.*}} = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %{{.*}})
+ return _mm256_round_ps(x, 0b1000);
+}
+
+__m256 test_mm256_round_ps_mxcsr(__m256 x) {
+ // CHECK-LABEL: test_mm256_round_ps_mxcsr
+ // CHECK: %{{.*}} = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 12)
+ return _mm256_round_ps(x, 0b1100);
+}
+
+__m256 test_mm256_round_ps_fround_no_exc(__m256 x) {
+ // CHECK-LABEL: test_mm256_round_ps_fround_no_exc
+ // CHECK: %{{.*}} = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 0)
+ return _mm256_round_ps(x, 0b0000);
}
__m256 test_mm256_rsqrt_ps(__m256 A) {
diff --git a/clang/test/CodeGen/X86/sse41-builtins-constrained.c b/clang/test/CodeGen/X86/sse41-builtins-constrained.c
new file mode 100644
index 0000000000000..6b25bd27af7e0
--- /dev/null
+++ b/clang/test/CodeGen/X86/sse41-builtins-constrained.c
@@ -0,0 +1,96 @@
+// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK
+
+// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+
+
+#include <immintrin.h>
+
+__m128d test_mm_round_pd_roundeven(__m128d x) {
+ // CHECK-LABEL: test_mm_round_pd_roundeven
+ // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.ignore")
+ return _mm_round_pd(x, 0b1000);
+}
+
+__m128d test_mm_round_pd_mxcsr(__m128d x) {
+ // CHECK-LABEL: test_mm_round_pd_mxcsr
+ // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 12)
+ return _mm_round_pd(x, 0b1100);
+}
+
+__m128d test_mm_round_pd_fround_no_exc(__m128d x) {
+ // CHECK-LABEL: test_mm_round_pd_fround_no_exc
+ // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 0)
+ return _mm_round_pd(x, 0b0000);
+}
+
+__m128 test_mm_round_ps_floor(__m128 x) {
+ // CHECK-LABEL: test_mm_round_ps_floor
+ // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.ignore")
+ return _mm_round_ps(x, 0b1001);
+}
+
+__m128 test_mm_round_ps_mxcsr(__m128 x) {
+ // CHECK-LABEL: test_mm_round_ps_mxcsr
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 12)
+ return _mm_round_ps(x, 0b1100);
+}
+
+__m128 test_mm_round_ps_fround_no_exc(__m128 x) {
+ // CHECK-LABEL: test_mm_round_ps_fround_no_exc
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0)
+ return _mm_round_ps(x, 0b0000);
+}
+
+__m128d test_mm_round_sd_ceil(__m128d x, __m128d y) {
+ // CHECK-LABEL: test_mm_round_sd_ceil
+ // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.ceil.f64(double %[[A:.*]], metadata !"fpexcept.ignore")
+ // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
+ return _mm_round_sd(x, y, 0b1010);
+}
+
+__m128d test_mm_round_sd_mxcsr(__m128d x, __m128d y) {
+ // CHECK-LABEL: test_mm_round_sd_mxcsr
+ // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 12)
+ return _mm_round_sd(x, y, 0b1100);
+}
+
+__m128d test_mm_round_sd_fround_no_exc(__m128d x, __m128d y) {
+ // CHECK-LABEL: test_mm_round_sd_fround_no_exc
+ // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 0)
+ return _mm_round_sd(x, y, 0b0000);
+}
+
+__m128 test_mm_round_ss_trunc(__m128 x, __m128 y) {
+ // CHECK-LABEL: test_mm_round_ss_trunc
+ // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.trunc.f32(float %[[A:.*]], metadata !"fpexcept.ignore")
+ // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
+ return _mm_round_ss(x, y, 0b1011);
+}
+
+__m128 test_mm_round_ss_mxcsr(__m128 x, __m128 y) {
+ // CHECK-LABEL: test_mm_round_ss_mxcsr
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %1, i32 12)
+ return _mm_round_ss(x, y, 0b1100);
+}
+
+__m128 test_mm_round_ss_fround_no_exc(__m128 x, __m128 y) {
+ // CHECK-LABEL: test_mm_round_ss_fround_no_exc
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %1, i32 0)
+ return _mm_round_ss(x, y, 0b0000);
+}
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 35fa65a99836b..1be1aa71de737 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -75,13 +75,13 @@ TEST_CONSTEXPR(match_m128(_mm_blendv_ps((__m128)(__v4sf){0.0f, 1.0f, 2.0f, 3.0f}
__m128d test_mm_ceil_pd(__m128d x) {
// CHECK-LABEL: test_mm_ceil_pd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
return _mm_ceil_pd(x);
}
__m128 test_mm_ceil_ps(__m128 x) {
// CHECK-LABEL: test_mm_ceil_ps
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
return _mm_ceil_ps(x);
}
@@ -430,26 +430,78 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276
__m128d test_mm_round_pd(__m128d x) {
// CHECK-LABEL: test_mm_round_pd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 4)
- return _mm_round_pd(x, 4);
+ // CHECK: %{{.*}} = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %{{.*}})
+ return _mm_round_pd(x, 0b1000);
+}
+
+__m128d test_mm_round_pd_mxcsr(__m128d x) {
+ // CHECK-LABEL: test_mm_round_pd_mxcsr
+ // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 12)
+ return _mm_round_pd(x, 0b1100);
+}
+
+__m128d test_mm_round_pd_fround_no_exc(__m128d x) {
+ // CHECK-LABEL: test_mm_round_pd_fround_no_exc
+ // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 0)
+ return _mm_round_pd(x, 0b0000);
}
__m128 test_mm_round_ps(__m128 x) {
// CHECK-LABEL: test_mm_round_ps
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4)
- return _mm_round_ps(x, 4);
+ // CHECK: %{{.*}} = call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}})
+ return _mm_round_ps(x, 0b1001);
+}
+
+__m128 test_mm_round_ps_mxcsr(__m128 x) {
+ // CHECK-LABEL: test_mm_round_ps_mxcsr
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 12)
+ return _mm_round_ps(x, 0b1100);
+}
+
+__m128 test_mm_round_ps_fround_no_exc(__m128 x) {
+ // CHECK-LABEL: test_mm_round_ps_fround_no_exc
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0)
+ return _mm_round_ps(x, 0b0000);
}
__m128d test_mm_round_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_round_sd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 4)
- return _mm_round_sd(x, y, 4);
+ // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call double @llvm.roundeven.f64(double %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
+ return _mm_round_sd(x, y, 0b1000);
+}
+
+__m128d test_mm_round_sd_mxcsr(__m128d x, __m128d y) {
+ // CHECK-LABEL: test_mm_round_sd_mxcsr
+ // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %0, <2 x double> %{{.*}}, i32 12)
+ return _mm_round_sd(x, y, 0b1100);
+}
+
+__m128d test_mm_round_sd_fround_no_exc(__m128d x, __m128d y) {
+ // CHECK-LABEL: test_mm_round_sd_fround_no_exc
+ // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %0, <2 x double> %{{.*}}, i32 0)
+ return _mm_round_sd(x, y, 0b0000);
}
__m128 test_mm_round_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_round_ss
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 4)
- return _mm_round_ss(x, y, 4);
+ // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call float @llvm.trunc.f32(float %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
+ return _mm_round_ss(x, y, 0b1011);
+}
+
+__m128 test_mm_round_ss_mxcsr(__m128 x, __m128 y) {
+ // CHECK-LABEL: test_mm_round_ss_mxcsr
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %0, <4 x float> %{{.*}}, i32 12)
+ return _mm_round_ss(x, y, 0b1100);
+}
+
+__m128 test_mm_round_ss_fround_no_exc(__m128 x, __m128 y) {
+ // CHECK-LABEL: test_mm_round_ss_fround_no_exc
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %0, <4 x float> %{{.*}}, i32 0)
+ return _mm_round_ss(x, y, 0b0000);
}
__m128i test_mm_stream_load_si128(__m128i const *a) {
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 0245611bc422b..e36187ea54d6f 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -626,18 +626,20 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// FP rounding ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_sse41_round_ss : ClangBuiltin<"__builtin_ia32_roundss">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
- def int_x86_sse41_round_ps : ClangBuiltin<"__builtin_ia32_roundps">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
- def int_x86_sse41_round_sd : ClangBuiltin<"__builtin_ia32_roundsd">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
- def int_x86_sse41_round_pd : ClangBuiltin<"__builtin_ia32_roundpd">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_sse41_round_ss
+ : DefaultAttrsIntrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+ def int_x86_sse41_round_ps
+ : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_sse41_round_sd
+ : DefaultAttrsIntrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+ def int_x86_sse41_round_pd
+ : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}
// Vector min element
@@ -921,12 +923,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_rcp_ps_256 : ClangBuiltin<"__builtin_ia32_rcpps256">,
DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
- def int_x86_avx_round_pd_256 : ClangBuiltin<"__builtin_ia32_roundpd256">,
- DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
- def int_x86_avx_round_ps_256 : ClangBuiltin<"__builtin_ia32_roundps256">,
- DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_avx_round_pd_256
+ : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_avx_round_ps_256
+ : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}
// Horizontal ops