[clang] 073cc29 - [X86][Reduce] Preserve fast math flags when changing them. NFCI

Phoebe Wang via cfe-commits cfe-commits at lists.llvm.org
Fri Dec 23 20:17:45 PST 2022


Author: Phoebe Wang
Date: 2022-12-24T11:41:17+08:00
New Revision: 073cc29e04b756cb4997bf3538c733c0938cd4ae

URL: https://github.com/llvm/llvm-project/commit/073cc29e04b756cb4997bf3538c733c0938cd4ae
DIFF: https://github.com/llvm/llvm-project/commit/073cc29e04b756cb4997bf3538c733c0938cd4ae.diff

LOG: [X86][Reduce] Preserve fast math flags when changing them. NFCI

@arsenm raised a good point that we should use a flag guard.
It turns out not to be a problem as long as the user only uses the intrinsics: https://godbolt.org/z/WoYsqqjh3
Still, the guard is nice to have.
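
To make the concern concrete, here is a minimal sketch (illustrative only, not the code in CGBuiltin.cpp; the helper name emitReduceFAdd is made up) of why the guard matters: IRBuilder's fast-math flags are sticky, so setting one for a single call would otherwise leak into every instruction the builder emits afterwards. The RAII guard keeps the change local:

  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  static Value *emitReduceFAdd(IRBuilder<> &Builder, Function *F, Value *Acc,
                               Value *Vec) {
    // Save the builder's current fast-math flags; they are restored when the
    // guard goes out of scope at the end of this function.
    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
    // Only this call site should get the reassoc flag.
    Builder.getFastMathFlags().setAllowReassoc();
    return Builder.CreateCall(F, {Acc, Vec});
  }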

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D140467

Added: 
    clang/test/CodeGen/builtins-x86-reduce.c

Modified: 
    clang/lib/CodeGen/CGBuiltin.cpp

Removed: 
    


################################################################################
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a194fc7b105cb..ca21612e442e6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14741,6 +14741,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_reduce_fadd_ph128: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
     Builder.getFastMathFlags().setAllowReassoc();
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
@@ -14751,6 +14752,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_reduce_fmul_ph128: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
     Builder.getFastMathFlags().setAllowReassoc();
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
@@ -14761,6 +14763,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_reduce_fmax_ph128: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
     Builder.getFastMathFlags().setNoNaNs();
     return Builder.CreateCall(F, {Ops[0]});
   }
@@ -14771,6 +14774,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_reduce_fmin_ph128: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
+    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
     Builder.getFastMathFlags().setNoNaNs();
     return Builder.CreateCall(F, {Ops[0]});
   }
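
The guard used in each case above is a small RAII helper from IRBuilder. Roughly (a simplified sketch of the assumed behavior, not the actual definition in llvm/IR/IRBuilder.h), it captures the builder's flags on construction and puts them back on destruction:

  #include "llvm/IR/IRBuilder.h"

  // Simplified sketch: save the fast-math flags when the guard is created and
  // restore them when it is destroyed, so setAllowReassoc()/setNoNaNs() above
  // only affect the reduce intrinsic call emitted inside the same scope.
  class FastMathFlagGuardSketch {
    llvm::IRBuilderBase &B;
    llvm::FastMathFlags Saved;

  public:
    FastMathFlagGuardSketch(llvm::IRBuilderBase &Builder)
        : B(Builder), Saved(Builder.getFastMathFlags()) {}
    ~FastMathFlagGuardSketch() { B.setFastMathFlags(Saved); }
  };

Without a guard, the flags set for the reduce builtins would stay on the CodeGenFunction's builder and end up on unrelated instructions emitted later in the same function, which is what the new test below guards against.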

diff --git a/clang/test/CodeGen/builtins-x86-reduce.c b/clang/test/CodeGen/builtins-x86-reduce.c
new file mode 100644
index 0000000000000..9e5b479df6584
--- /dev/null
+++ b/clang/test/CodeGen/builtins-x86-reduce.c
@@ -0,0 +1,92 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 %s -triple x86_64-unknown-unknown -target-feature +avx512f -target-feature +avx512vl -target-feature +avx512fp16 -emit-llvm -o - | FileCheck %s
+
+typedef double double8 __attribute__((ext_vector_type(8)));
+
+// CHECK-LABEL: @fadd1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
+// CHECK-NEXT:    store <8 x double> [[A:%.*]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store double [[B:%.*]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    ret double [[ADD]]
+//
+double fadd1(double8 a, double b) {
+  return __builtin_ia32_reduce_fadd_pd512(0.0, a) + b;
+}
+
+#pragma clang fp reassociate(on)
+// CHECK-LABEL: @fadd2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
+// CHECK-NEXT:    store <8 x double> [[A:%.*]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store double [[B:%.*]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[ADD:%.*]] = fadd reassoc double [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    ret double [[ADD]]
+//
+double fadd2(double8 a, double b) {
+  return __builtin_ia32_reduce_fadd_pd512(0.0, a) + b;
+}
+
+typedef float float16 __attribute__((ext_vector_type(16)));
+
+#pragma clang fp reassociate(off)
+// CHECK-LABEL: @fmul1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT:    store <16 x float> [[A:%.*]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store float [[B:%.*]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    ret float [[ADD]]
+//
+float fmul1(float16 a, float b) {
+  return __builtin_ia32_reduce_fmul_ps512(1.0f, a) + b;
+}
+
+typedef _Float16 half8 __attribute__((ext_vector_type(8)));
+
+// CHECK-LABEL: @fmax1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    store <8 x half> [[A:%.*]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store half [[B:%.*]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call nnan half @llvm.vector.reduce.fmax.v8f16(<8 x half> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = load half, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[ADD:%.*]] = fadd half [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    ret half [[ADD]]
+//
+_Float16 fmax1(half8 a, _Float16 b) {
+  return __builtin_ia32_reduce_fmax_ph128(a) + b;
+}
+
+typedef _Float16 half16 __attribute__((ext_vector_type(16)));
+
+// CHECK-LABEL: @fmin1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    store <16 x half> [[A:%.*]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store half [[B:%.*]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = call nnan half @llvm.vector.reduce.fmin.v16f16(<16 x half> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = load half, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[ADD:%.*]] = fadd half [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    ret half [[ADD]]
+//
+_Float16 fmin1(half16 a, _Float16 b) {
+  return __builtin_ia32_reduce_fmin_ph256(a) + b;
+}

More information about the cfe-commits mailing list