[PATCH] Enable SLP-vectorization of intrinsics.

Fri Jan 10 14:47:06 PST 2014

Hi hfinkel, nadav,

Recognize vectorizable intrinsics during SLP vectorization and transform
them accordingly. This is based on similar code in the loop vectorizer.
Subsequent commits will add vectorization of function calls into
vector intrinsics and of function calls into vector library calls.

http://llvm-reviews.chandlerc.com/D2535

Files:
  lib/Transforms/Vectorize/SLPVectorizer.cpp
  test/Transforms/SLPVectorizer/X86/intrinsic.ll

Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================

--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -915,6 +915,39 @@
       buildTree_rec(Operands, Depth + 1);
       return;
     }
+    case Instruction::Call: {
+      // Check if the calls are all to the same vectorizable intrinsic.
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(VL[0])) {
+        Intrinsic::ID ID = II->getIntrinsicID();
+
+        unsigned i,e;
+        for ( i = 1, e = VL.size(); i != e; ++i) {
+          IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[i]);
+          if (!II2 || II2->getIntrinsicID() != ID) {
+            newTreeEntry(VL, false);
+            DEBUG(dbgs() << "SLP: mismatched calls:" 
+                  << *II << "!=" << *VL[i] << "\n");
+            return;
+          }
+        }
+
+        newTreeEntry(VL, true);
+        for (unsigned i = 0, e = II->getNumArgOperands(); i < e; ++i) {
+          ValueList Operands;
+          // Prepare the operand vector.
+          for (unsigned j = 0; j < VL.size(); ++j) {
+            IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[j]);
+
+            Operands.push_back(II2->getArgOperand(i));
+          }
+          buildTree_rec(Operands, Depth+1);
+        }
+        return;
+      }
+      newTreeEntry(VL, false);
+      DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+      return;
+    }
     default:
       newTreeEntry(VL, false);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1040,6 +1073,36 @@
       int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
       return VecStCost - ScalarStCost;
     }
+    case Instruction::Call: {
+      CallInst *CI = cast<CallInst>(VL0);
+      IntrinsicInst *II = cast<IntrinsicInst>(CI);
+      Intrinsic::ID ID = II->getIntrinsicID();
+
+      int ScalarCallCost, VectorCallCost;
+      {
+        // Calculate the cost of the scalar calls.
+        SmallVector<Type*, 4> Tys;
+        for (unsigned op = 0, opc = II->getNumArgOperands(); op!= opc; ++op)
+          Tys.push_back(CI->getArgOperand(op)->getType());
+        ScalarCallCost = TTI->getIntrinsicInstrCost(ID, ScalarTy, Tys) * 
+            VecTy->getNumElements();
+      }
+
+      {
+        // Calculate the cost of the vectorized call.
+        SmallVector<Type*, 4> Tys;
+        for (unsigned op = 0, opc = II->getNumArgOperands(); op!= opc; ++op)
+          Tys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), 
+                                        VecTy->getNumElements()));
+        VectorCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, Tys);
+      }
+
+      DEBUG(dbgs() << "SLP: Call cost "<< VectorCallCost - ScalarCallCost 
+            << " (" << VectorCallCost  << "-" <<  ScalarCallCost << ")"
+            << " for " << *II << "\n");
+
+      return VectorCallCost - ScalarCallCost;
+    }
     default:
       llvm_unreachable("Unknown instruction");
   }
@@ -1518,6 +1581,34 @@
       E->VectorizedValue = S;
       return S;
     }
+    case Instruction::Call: {
+      CallInst *CI = cast<CallInst>(VL0);
+
+      setInsertPointAfterBundle(E->Scalars);
+      std::vector<Value *> OpVecs;
+      for (int op=0, opcount = CI->getNumArgOperands(); op != opcount; ++op) {
+        ValueList OpVL;
+        for (int i = 0, e = E->Scalars.size(); i < e; ++i)  {
+          CallInst *CEI = cast<CallInst>(E->Scalars[i]);
+          OpVL.push_back(CEI->getArgOperand(op));
+        }
+
+        Value *OpVec = vectorizeTree(OpVL);
+        DEBUG(dbgs() << "SLP: OpVec[" << op << "]: " << *OpVec << "\n");
+        OpVecs.push_back(OpVec);
+      }
+
+      Module *M = F->getParent();
+      IntrinsicInst *II = cast<IntrinsicInst>(CI);
+      Intrinsic::ID ID = II->getIntrinsicID();
+
+      Type *Tys[] = {VectorType::get(CI->getType(),E->Scalars.size())};
+
+      Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+      Value *V = Builder.CreateCall(CF, OpVecs);
+      E->VectorizedValue = V;
+      return V;
+    }
     default:
     llvm_unreachable("unknown inst");
   }
Index: test/Transforms/SLPVectorizer/X86/intrinsic.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -0,0 +1,75 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare double @llvm.fabs.f64(double) nounwind readnone
+
+;CHECK-LABEL: @vec_fabs_f64(
+;CHECK: load <2 x double>
+;CHECK: load <2 x double>
+;CHECK: call <2 x double> @llvm.fabs.v2f64
+;CHECK: store <2 x double>
+;CHECK: ret
+define void @vec_fabs_f64(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %call = tail call double @llvm.fabs.f64(double %mul) nounwind readnone
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %call5 = tail call double @llvm.fabs.f64(double %mul5) nounwind readnone
+  store double %call, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %call5, double* %arrayidx5, align 8
+  ret void
+}
+
+declare float @llvm.copysign.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @vec_copysign_f32(
+;CHECK: load <4 x float>
+;CHECK: load <4 x float>
+;CHECK: call <4 x float> @llvm.copysign.v4f32
+;CHECK: store <4 x float>
+;CHECK: ret
+define void @vec_copysign_f32(float* %a, float* %b, float* noalias %c) {
+entry:
+  %0 = load float* %a, align 4
+  %1 = load float* %b, align 4
+  %call0 = tail call float @llvm.copysign.f32(float %0, float %1) nounwind readnone
+  store float %call0, float* %c, align 4
+
+  %ix2 = getelementptr inbounds float* %a, i64 1
+  %2 = load float* %ix2, align 4
+  %ix3 = getelementptr inbounds float* %b, i64 1
+  %3 = load float* %ix3, align 4
+  %call1 = tail call float @llvm.copysign.f32(float %2, float %3) nounwind readnone
+  %c1 = getelementptr inbounds float* %c, i64 1
+  store float %call1, float* %c1, align 4
+
+  %ix4 = getelementptr inbounds float* %a, i64 2
+  %4 = load float* %ix4, align 4
+  %ix5 = getelementptr inbounds float* %b, i64 2
+  %5 = load float* %ix5, align 4
+  %call2 = tail call float @llvm.copysign.f32(float %4, float %5) nounwind readnone
+  %c2 = getelementptr inbounds float* %c, i64 2
+  store float %call2, float* %c2, align 4
+
+  %ix6 = getelementptr inbounds float* %a, i64 3
+  %6 = load float* %ix6, align 4
+  %ix7 = getelementptr inbounds float* %b, i64 3
+  %7 = load float* %ix7, align 4
+  %call3 = tail call float @llvm.copysign.f32(float %6, float %7) nounwind readnone
+  %c3 = getelementptr inbounds float* %c, i64 3
+  store float %call3, float* %c3, align 4
+
+  ret void
+}
+
+
+
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D2535.1.patch
Type: text/x-patch
Size: 7234 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140110/38eb4a58/attachment.bin>