r220167 - [complex] Teach the complex math IR gen to emit direct math and

Sun Oct 19 12:13:50 PDT 2014

Author: chandlerc
Date: Sun Oct 19 14:13:49 2014
New Revision: 220167

URL: http://llvm.org/viewvc/llvm-project?rev=220167&view=rev
Log:
[complex] Teach the complex math IR gen to emit direct math and
a NaN-test prior to the call to the library function.

This should automatically make fastmath (including just non-NaNs) able to avoid
the expensive libcalls and also open the door to more advanced folding in LLVM
based on the rules for complex math.

Two important notes to remember: first is that this isn't yet a proper
limited range mode, it's still just improving the unlimited range mode.
Also, it isn't really perfecet w.r.t. what an unlimited range mode
should be doing because it isn't quite handling the flags produced by
all the operations in the way desirable for that mode, but then neither
is compiler-rt's libcall. When the compiler-rt libcall is improved to
carefully manage flags, the code emitted here should be improved
correspondingly. And it is still a long-term desirable thing to add
a limited range mode to Clang that would be able to use direct math
without library calls here.

Special thanks to Steve Canon for the careful review on this patch and
teaching me about these issues. =D

Differential Revision: http://reviews.llvm.org/D5756

Modified:
    cfe/trunk/lib/CodeGen/CGExprComplex.cpp
    cfe/trunk/test/CodeGen/complex-math.c

Modified: cfe/trunk/lib/CodeGen/CGExprComplex.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGExprComplex.cpp?rev=220167&r1=220166&r2=220167&view=diff
==============================================================================

--- cfe/trunk/lib/CodeGen/CGExprComplex.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGExprComplex.cpp Sun Oct 19 14:13:49 2014
@@ -15,9 +15,13 @@
 #include "CodeGenModule.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/StmtVisitor.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
 #include <algorithm>
 using namespace clang;
 using namespace CodeGen;
@@ -587,11 +591,31 @@ ComplexPairTy ComplexExprEmitter::EmitCo
   return CGF.EmitCall(FuncInfo, Func, ReturnValueSlot(), Args).getComplexVal();
 }
 
+/// \brief Lookup the libcall name for a given floating point type complex
+/// multiply.
+static StringRef getComplexMultiplyLibCallName(llvm::Type *Ty) {
+  switch (Ty->getTypeID()) {
+  default:
+    llvm_unreachable("Unsupported floating point type!");
+  case llvm::Type::HalfTyID:
+    return "__mulhc3";
+  case llvm::Type::FloatTyID:
+    return "__mulsc3";
+  case llvm::Type::DoubleTyID:
+    return "__muldc3";
+  case llvm::Type::PPC_FP128TyID:
+    return "__multc3";
+  case llvm::Type::X86_FP80TyID:
+    return "__mulxc3";
+  }
+}
+
 // See C11 Annex G.5.1 for the semantics of multiplicative operators on complex
 // typed values.
 ComplexPairTy ComplexExprEmitter::EmitBinMul(const BinOpInfo &Op) {
   using llvm::Value;
   Value *ResR, *ResI;
+  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
 
   if (Op.LHS.first->getType()->isFloatingPointTy()) {
     // The general formulation is:
@@ -603,23 +627,65 @@ ComplexPairTy ComplexExprEmitter::EmitBi
     // still more of this within the type system.
 
     if (Op.LHS.second && Op.RHS.second) {
-      // If both operands are complex, delegate to a libcall which works to
-      // prevent underflow and overflow.
-      StringRef LibCallName;
-      switch (Op.LHS.first->getType()->getTypeID()) {
-      default:
-        llvm_unreachable("Unsupported floating point type!");
-      case llvm::Type::HalfTyID:
-        return EmitComplexBinOpLibCall("__mulhc3", Op);
-      case llvm::Type::FloatTyID:
-        return EmitComplexBinOpLibCall("__mulsc3", Op);
-      case llvm::Type::DoubleTyID:
-        return EmitComplexBinOpLibCall("__muldc3", Op);
-      case llvm::Type::PPC_FP128TyID:
-        return EmitComplexBinOpLibCall("__multc3", Op);
-      case llvm::Type::X86_FP80TyID:
-        return EmitComplexBinOpLibCall("__mulxc3", Op);
-      }
+      // If both operands are complex, emit the core math directly, and then
+      // test for NaNs. If we find NaNs in the result, we delegate to a libcall
+      // to carefully re-compute the correct infinity representation if
+      // possible. The expectation is that the presence of NaNs here is
+      // *extremely* rare, and so the cost of the libcall is almost irrelevant.
+      // This is good, because the libcall re-computes the core multiplication
+      // exactly the same as we do here and re-tests for NaNs in order to be
+      // a generic complex*complex libcall.
+
+      // First compute the four products.
+      Value *AC = Builder.CreateFMul(Op.LHS.first, Op.RHS.first, "mul_ac");
+      Value *BD = Builder.CreateFMul(Op.LHS.second, Op.RHS.second, "mul_bd");
+      Value *AD = Builder.CreateFMul(Op.LHS.first, Op.RHS.second, "mul_ad");
+      Value *BC = Builder.CreateFMul(Op.LHS.second, Op.RHS.first, "mul_bc");
+
+      // The real part is the difference of the first two, the imaginary part is
+      // the sum of the second.
+      ResR = Builder.CreateFSub(AC, BD, "mul_r");
+      ResI = Builder.CreateFAdd(AD, BC, "mul_i");
+
+      // Emit the test for the real part becoming NaN and create a branch to
+      // handle it. We test for NaN by comparing the number to itself.
+      Value *IsRNaN = Builder.CreateFCmpUNO(ResR, ResR, "isnan_cmp");
+      llvm::BasicBlock *ContBB = CGF.createBasicBlock("complex_mul_cont");
+      llvm::BasicBlock *INaNBB = CGF.createBasicBlock("complex_mul_imag_nan");
+      llvm::Instruction *Branch = Builder.CreateCondBr(IsRNaN, INaNBB, ContBB);
+      llvm::BasicBlock *OrigBB = Branch->getParent();
+
+      // Give hint that we very much don't expect to see NaNs.
+      // Value chosen to match UR_NONTAKEN_WEIGHT, see BranchProbabilityInfo.cpp
+      llvm::MDNode *BrWeight = MDHelper.createBranchWeights(1, (1U << 20) - 1);
+      Branch->setMetadata(llvm::LLVMContext::MD_prof, BrWeight);
+
+      // Now test the imaginary part and create its branch.
+      CGF.EmitBlock(INaNBB);
+      Value *IsINaN = Builder.CreateFCmpUNO(ResI, ResI, "isnan_cmp");
+      llvm::BasicBlock *LibCallBB = CGF.createBasicBlock("complex_mul_libcall");
+      Branch = Builder.CreateCondBr(IsINaN, LibCallBB, ContBB);
+      Branch->setMetadata(llvm::LLVMContext::MD_prof, BrWeight);
+
+      // Now emit the libcall on this slowest of the slow paths.
+      CGF.EmitBlock(LibCallBB);
+      Value *LibCallR, *LibCallI;
+      std::tie(LibCallR, LibCallI) = EmitComplexBinOpLibCall(
+          getComplexMultiplyLibCallName(Op.LHS.first->getType()), Op);
+      Builder.CreateBr(ContBB);
+
+      // Finally continue execution by phi-ing together the different
+      // computation paths.
+      CGF.EmitBlock(ContBB);
+      llvm::PHINode *RealPHI = Builder.CreatePHI(ResR->getType(), 3, "real_mul_phi");
+      RealPHI->addIncoming(ResR, OrigBB);
+      RealPHI->addIncoming(ResR, INaNBB);
+      RealPHI->addIncoming(LibCallR, LibCallBB);
+      llvm::PHINode *ImagPHI = Builder.CreatePHI(ResI->getType(), 3, "imag_mul_phi");
+      ImagPHI->addIncoming(ResI, OrigBB);
+      ImagPHI->addIncoming(ResI, INaNBB);
+      ImagPHI->addIncoming(LibCallI, LibCallBB);
+      return ComplexPairTy(RealPHI, ImagPHI);
     }
     assert((Op.LHS.second || Op.RHS.second) &&
            "At least one operand must be complex!");

Modified: cfe/trunk/test/CodeGen/complex-math.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/complex-math.c?rev=220167&r1=220166&r2=220167&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/complex-math.c (original)
+++ cfe/trunk/test/CodeGen/complex-math.c Sun Oct 19 14:13:49 2014
@@ -89,7 +89,17 @@ float _Complex mul_float_rc(float a, flo
 }
 float _Complex mul_float_cc(float _Complex a, float _Complex b) {
   // X86-LABEL: @mul_float_cc(
-  // X86-NOT: fmul
+  // X86: %[[AC:[^ ]+]] = fmul
+  // X86: %[[BD:[^ ]+]] = fmul
+  // X86: %[[AD:[^ ]+]] = fmul
+  // X86: %[[BC:[^ ]+]] = fmul
+  // X86: %[[RR:[^ ]+]] = fsub float %[[AC]], %[[BD]]
+  // X86: %[[RI:[^ ]+]] = fadd float
+  // X86-DAG: %[[AD]]
+  // X86-DAG: ,
+  // X86-DAG: %[[BC]]
+  // X86: fcmp uno float %[[RR]]
+  // X86: fcmp uno float %[[RI]]
   // X86: call {{.*}} @__mulsc3(
   // X86: ret
   return a * b;
@@ -211,7 +221,17 @@ double _Complex mul_double_rc(double a,
 }
 double _Complex mul_double_cc(double _Complex a, double _Complex b) {
   // X86-LABEL: @mul_double_cc(
-  // X86-NOT: fmul
+  // X86: %[[AC:[^ ]+]] = fmul
+  // X86: %[[BD:[^ ]+]] = fmul
+  // X86: %[[AD:[^ ]+]] = fmul
+  // X86: %[[BC:[^ ]+]] = fmul
+  // X86: %[[RR:[^ ]+]] = fsub double %[[AC]], %[[BD]]
+  // X86: %[[RI:[^ ]+]] = fadd double
+  // X86-DAG: %[[AD]]
+  // X86-DAG: ,
+  // X86-DAG: %[[BC]]
+  // X86: fcmp uno double %[[RR]]
+  // X86: fcmp uno double %[[RI]]
   // X86: call {{.*}} @__muldc3(
   // X86: ret
   return a * b;
@@ -333,11 +353,31 @@ long double _Complex mul_long_double_rc(
 }
 long double _Complex mul_long_double_cc(long double _Complex a, long double _Complex b) {
   // X86-LABEL: @mul_long_double_cc(
-  // X86-NOT: fmul
+  // X86: %[[AC:[^ ]+]] = fmul
+  // X86: %[[BD:[^ ]+]] = fmul
+  // X86: %[[AD:[^ ]+]] = fmul
+  // X86: %[[BC:[^ ]+]] = fmul
+  // X86: %[[RR:[^ ]+]] = fsub x86_fp80 %[[AC]], %[[BD]]
+  // X86: %[[RI:[^ ]+]] = fadd x86_fp80
+  // X86-DAG: %[[AD]]
+  // X86-DAG: ,
+  // X86-DAG: %[[BC]]
+  // X86: fcmp uno x86_fp80 %[[RR]]
+  // X86: fcmp uno x86_fp80 %[[RI]]
   // X86: call {{.*}} @__mulxc3(
   // X86: ret
   // PPC-LABEL: @mul_long_double_cc(
-  // PPC-NOT: fmul
+  // PPC: %[[AC:[^ ]+]] = fmul
+  // PPC: %[[BD:[^ ]+]] = fmul
+  // PPC: %[[AD:[^ ]+]] = fmul
+  // PPC: %[[BC:[^ ]+]] = fmul
+  // PPC: %[[RR:[^ ]+]] = fsub ppc_fp128 %[[AC]], %[[BD]]
+  // PPC: %[[RI:[^ ]+]] = fadd ppc_fp128
+  // PPC-DAG: %[[AD]]
+  // PPC-DAG: ,
+  // PPC-DAG: %[[BC]]
+  // PPC: fcmp uno ppc_fp128 %[[RR]]
+  // PPC: fcmp uno ppc_fp128 %[[RI]]
   // PPC: call {{.*}} @__multc3(
   // PPC: ret
   return a * b;