[llvm] r261804 - Detect vector reduction operations just before instruction selection.

Kristof Beyls via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 25 07:17:16 PST 2016


Hi Cong,

It seems this commit introduced a regression found by our AArch64 and 
AArch32 Neon intrinsics testers, as it triggers the following assert:
lib/IR/Instructions.cpp:1791: static int
llvm::ShuffleVectorInst::getMaskValue(llvm::Constant *, unsigned int):
Assertion `i < Mask->getType()->getVectorNumElements() && "Index out of range"' failed.

I've attached the smallest reproducer I managed to create.
You should be able to reproduce this assertion failure with the following
command:

  clang -target aarch64 -c -O2 261804_regression_formatted.c

Could you have a look at this?

Thanks!

Kristof


On 25/02/2016 00:40, Cong Hou via llvm-commits wrote:
> Author: conghou
> Date: Wed Feb 24 17:40:36 2016
> New Revision: 261804
>
> URL: http://llvm.org/viewvc/llvm-project?rev=261804&view=rev
> Log:
> Detect vector reduction operations just before instruction selection.
>
> (This is the second attempt to commit this patch, after fixing pr26652 & pr26653).
>
> This patch detects vector reductions before instruction selection. Vector
> reductions are vectorized reduction operations, and for such operations we have
> freedom to reorganize the elements of the result as long as the reduction of them
> stays unchanged. This will enable some reduction pattern recognition during
> instruction combine such as SAD/dot-product on X86. A flag is added to
> SDNodeFlags to mark those vector reduction nodes to be checked during instruction
> combine.
>
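
For context on how the new flag is meant to be consumed: a later DAG combine
can test it before attempting a reduction-specific rewrite. A minimal sketch,
assuming the node carrying the flags is a BinaryWithFlagsSDNode as in the
current tree; detectSADPattern is a hypothetical helper, not part of this
patch:

  // Sketch: gate a reduction-specific combine on the new flag.
  static SDValue combineAdd(SDNode *N, SelectionDAG &DAG) {
    if (const auto *BN = dyn_cast<BinaryWithFlagsSDNode>(N))
      if (BN->Flags.hasVectorReduction())
        // Elements of this value may be reordered freely, so a
        // SAD/dot-product style rewrite would be legal here.
        return detectSADPattern(N, DAG); // hypothetical helper
    return SDValue();
  }
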
> To detect those vector reductions, we search def-use chains starting from the
> given instruction, and check if all uses fall into two categories:
>
> 1. Reduction with another vector.
> 2. Reduction on all elements.
>
> Category 2 is detected by recognizing the pattern that the loop vectorizer
> generates to reduce all elements of the vector outside the loop, which
> consists of several ShuffleVector instructions and one final ExtractElement.
>
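
Concretely, the "reduction on all elements" tail that the loop vectorizer
emits, and that this patch pattern-matches, has the following shape; it is
the same sequence as in the middle.block of the vector-redux.ll test below:

  ; Reduce <4 x i32> %sum to a scalar: each shuffle moves the high half
  ; onto the low half, and the add folds the two halves together.
  %rdx.shuf = shufflevector <4 x i32> %sum, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %sum, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
  %sum.scalar = extractelement <4 x i32> %bin.rdx8, i32 0
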
>
> Differential revision: http://reviews.llvm.org/D15250
>
>
>
> Added:
>      llvm/trunk/test/CodeGen/Generic/pr26652.ll
>      llvm/trunk/test/CodeGen/Generic/vector-redux.ll
> Modified:
>      llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
>      llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
>
> Modified: llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h?rev=261804&r1=261803&r2=261804&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h (original)
> +++ llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h Wed Feb 24 17:40:36 2016
> @@ -328,6 +328,7 @@ private:
>     bool NoInfs : 1;
>     bool NoSignedZeros : 1;
>     bool AllowReciprocal : 1;
> +  bool VectorReduction : 1;
>   
>   public:
>     /// Default constructor turns off all optimization flags.
> @@ -340,6 +341,7 @@ public:
>       NoInfs = false;
>       NoSignedZeros = false;
>       AllowReciprocal = false;
> +    VectorReduction = false;
>     }
>   
>     // These are mutators for each flag.
> @@ -351,6 +353,7 @@ public:
>     void setNoInfs(bool b) { NoInfs = b; }
>     void setNoSignedZeros(bool b) { NoSignedZeros = b; }
>     void setAllowReciprocal(bool b) { AllowReciprocal = b; }
> +  void setVectorReduction(bool b) { VectorReduction = b; }
>   
>     // These are accessors for each flag.
>     bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
> @@ -361,6 +364,7 @@ public:
>     bool hasNoInfs() const { return NoInfs; }
>     bool hasNoSignedZeros() const { return NoSignedZeros; }
>     bool hasAllowReciprocal() const { return AllowReciprocal; }
> +  bool hasVectorReduction() const { return VectorReduction; }
>   
>     /// Return a raw encoding of the flags.
>     /// This function should only be used to add data to the NodeID value.
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp?rev=261804&r1=261803&r2=261804&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp Wed Feb 24 17:40:36 2016
> @@ -2317,6 +2317,129 @@ void SelectionDAGBuilder::visitFSub(cons
>     visitBinary(I, ISD::FSUB);
>   }
>   
> +/// Checks if the given instruction performs a vector reduction, in which case
> +/// we have the freedom to alter the elements in the result as long as the
> +/// reduction of them stays unchanged.
> +static bool isVectorReductionOp(const User *I) {
> +  const Instruction *Inst = dyn_cast<Instruction>(I);
> +  if (!Inst || !Inst->getType()->isVectorTy())
> +    return false;
> +
> +  auto OpCode = Inst->getOpcode();
> +  switch (OpCode) {
> +  case Instruction::Add:
> +  case Instruction::Mul:
> +  case Instruction::And:
> +  case Instruction::Or:
> +  case Instruction::Xor:
> +    break;
> +  case Instruction::FAdd:
> +  case Instruction::FMul:
> +    if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
> +      if (FPOp->getFastMathFlags().unsafeAlgebra())
> +        break;
> +    // Fall through.
> +  default:
> +    return false;
> +  }
> +
> +  unsigned ElemNum = Inst->getType()->getVectorNumElements();
> +  unsigned ElemNumToReduce = ElemNum;
> +
> +  // Do DFS search on the def-use chain from the given instruction. We only
> +  // allow four kinds of operations during the search until we reach the
> +  // instruction that extracts the first element from the vector:
> +  //
> +  //   1. The reduction operation of the same opcode as the given instruction.
> +  //
> +  //   2. PHI node.
> +  //
> +  //   3. ShuffleVector instruction together with a reduction operation that
> +  //      does a partial reduction.
> +  //
> +  //   4. ExtractElement that extracts the first element from the vector, and we
> +  //      stop searching the def-use chain here.
> +  //
> +  // 3 & 4 above perform a reduction on all elements of the vector. We push defs
> +  // from 1-3 to the stack to continue the DFS. The given instruction is not
> +  // a reduction operation if we meet any other instructions other than those
> +  // listed above.
> +
> +  SmallVector<const User *, 16> UsersToVisit{Inst};
> +  SmallPtrSet<const User *, 16> Visited;
> +  bool ReduxExtracted = false;
> +
> +  while (!UsersToVisit.empty()) {
> +    auto User = UsersToVisit.back();
> +    UsersToVisit.pop_back();
> +    if (!Visited.insert(User).second)
> +      continue;
> +
> +    for (const auto &U : User->users()) {
> +      auto Inst = dyn_cast<Instruction>(U);
> +      if (!Inst)
> +        return false;
> +
> +      if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
> +        if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
> +          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().unsafeAlgebra())
> +            return false;
> +        UsersToVisit.push_back(U);
> +      } else if (const ShuffleVectorInst *ShufInst =
> +                     dyn_cast<ShuffleVectorInst>(U)) {
> +        // Detect the following pattern: A ShuffleVector instruction together
> +        // with a reduction that do partial reduction on the first and second
> +        // ElemNumToReduce / 2 elements, and store the result in
> +        // ElemNumToReduce / 2 elements in another vector.
> +
> +        unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
> +        ElemNumToReduce = ResultElements <= ElemNumToReduce ? ResultElements
> +                                                            : ElemNumToReduce;
> +        if (ElemNumToReduce == 1)
> +          return false;
> +        if (!isa<UndefValue>(U->getOperand(1)))
> +          return false;
> +        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
> +          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
> +            return false;
> +        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
> +          if (ShufInst->getMaskValue(i) != -1)
> +            return false;
> +
> +        // There is only one user of this ShuffleVector instruction, which
> +        // must
> +        // be a reduction operation.
> +        if (!U->hasOneUse())
> +          return false;
> +
> +        auto U2 = dyn_cast<Instruction>(*U->user_begin());
> +        if (!U2 || U2->getOpcode() != OpCode)
> +          return false;
> +
> +        // Check operands of the reduction operation.
> +        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
> +            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
> +          UsersToVisit.push_back(U2);
> +          ElemNumToReduce /= 2;
> +        } else
> +          return false;
> +      } else if (isa<ExtractElementInst>(U)) {
> +        // At this moment we should have reduced all elements in the vector.
> +        if (ElemNumToReduce != 1)
> +          return false;
> +
> +        const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
> +        if (!Val || Val->getZExtValue() != 0)
> +          return false;
> +
> +        ReduxExtracted = true;
> +      } else
> +        return false;
> +    }
> +  }
> +  return ReduxExtracted;
> +}
> +
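
One observation on the mask checks above, in case it is related to the
assertion we are seeing: the second validation loop runs i up to ElemNum,
the element count of the original vector, but getMaskValue() indexes the
shuffle's own mask, which for a narrowing shuffle only has ResultElements
entries. If I am reading the bounds right, a fragment like the following
(a reduced sketch, not the attached reproducer itself) passes the first
loop and then indexes past the mask in the second one:

  %v = or <4 x i32> %a, %b
  ; Narrowing shuffle: a 2-element mask over a 4-element input, so
  ; ElemNum == 4 while the mask has only 2 entries.
  %w = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 1, i32 undef>
  ; getMaskValue(0) == 1 satisfies the first loop (ElemNumToReduce/2 == 1),
  ; but the second loop then calls getMaskValue(2) on the 2-entry mask,
  ; which matches the "Index out of range" assertion quoted at the top.

This is just a guess from reading the diff, though.
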
>   void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
>     SDValue Op1 = getValue(I.getOperand(0));
>     SDValue Op2 = getValue(I.getOperand(1));
> @@ -2324,6 +2447,7 @@ void SelectionDAGBuilder::visitBinary(co
>     bool nuw = false;
>     bool nsw = false;
>     bool exact = false;
> +  bool vec_redux = false;
>     FastMathFlags FMF;
>   
>     if (const OverflowingBinaryOperator *OFBinOp =
> @@ -2337,10 +2461,16 @@ void SelectionDAGBuilder::visitBinary(co
>     if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
>       FMF = FPOp->getFastMathFlags();
>   
> +  if (isVectorReductionOp(&I)) {
> +    vec_redux = true;
> +    DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
> +  }
> +
>     SDNodeFlags Flags;
>     Flags.setExact(exact);
>     Flags.setNoSignedWrap(nsw);
>     Flags.setNoUnsignedWrap(nuw);
> +  Flags.setVectorReduction(vec_redux);
>     if (EnableFMFInDAG) {
>       Flags.setAllowReciprocal(FMF.allowReciprocal());
>       Flags.setNoInfs(FMF.noInfs());
>
> Added: llvm/trunk/test/CodeGen/Generic/pr26652.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Generic/pr26652.ll?rev=261804&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/Generic/pr26652.ll (added)
> +++ llvm/trunk/test/CodeGen/Generic/pr26652.ll Wed Feb 24 17:40:36 2016
> @@ -0,0 +1,8 @@
> +; RUN: llc < %s
> +
> +define <2 x i32> @test(<4 x i32> %a, <4 x i32> %b) {
> +entry:
> +  %0 = or <4 x i32> %a, %b
> +  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
> +  ret <2 x i32> %1
> +}
>
> Added: llvm/trunk/test/CodeGen/Generic/vector-redux.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Generic/vector-redux.ll?rev=261804&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/Generic/vector-redux.ll (added)
> +++ llvm/trunk/test/CodeGen/Generic/vector-redux.ll Wed Feb 24 17:40:36 2016
> @@ -0,0 +1,237 @@
> +; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
> +; REQUIRES: asserts
> +
> + at a = global [1024 x i32] zeroinitializer, align 16
> +
> +define i32 @reduce_add() {
> +; CHECK-LABEL: reduce_add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +; CHECK:       Detected a reduction operation: {{.*}} add
> +
> +min.iters.checked:
> +  br label %vector.body
> +
> +vector.body:
> +  %index = phi i64 [ 0, %min.iters.checked ], [ %index.next.4, %vector.body ]
> +  %vec.phi = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %28, %vector.body ]
> +  %vec.phi4 = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %29, %vector.body ]
> +  %0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index
> +  %1 = bitcast i32* %0 to <4 x i32>*
> +  %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
> +  %2 = getelementptr i32, i32* %0, i64 4
> +  %3 = bitcast i32* %2 to <4 x i32>*
> +  %wide.load5 = load <4 x i32>, <4 x i32>* %3, align 16
> +  %4 = add nsw <4 x i32> %wide.load, %vec.phi
> +  %5 = add nsw <4 x i32> %wide.load5, %vec.phi4
> +  %index.next = add nuw nsw i64 %index, 8
> +  %6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next
> +  %7 = bitcast i32* %6 to <4 x i32>*
> +  %wide.load.1 = load <4 x i32>, <4 x i32>* %7, align 16
> +  %8 = getelementptr i32, i32* %6, i64 4
> +  %9 = bitcast i32* %8 to <4 x i32>*
> +  %wide.load5.1 = load <4 x i32>, <4 x i32>* %9, align 16
> +  %10 = add nsw <4 x i32> %wide.load.1, %4
> +  %11 = add nsw <4 x i32> %wide.load5.1, %5
> +  %index.next.1 = add nsw i64 %index, 16
> +  %12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.1
> +  %13 = bitcast i32* %12 to <4 x i32>*
> +  %wide.load.2 = load <4 x i32>, <4 x i32>* %13, align 16
> +  %14 = getelementptr i32, i32* %12, i64 4
> +  %15 = bitcast i32* %14 to <4 x i32>*
> +  %wide.load5.2 = load <4 x i32>, <4 x i32>* %15, align 16
> +  %16 = add nsw <4 x i32> %wide.load.2, %10
> +  %17 = add nsw <4 x i32> %wide.load5.2, %11
> +  %index.next.2 = add nsw i64 %index, 24
> +  %18 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.2
> +  %19 = bitcast i32* %18 to <4 x i32>*
> +  %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 16
> +  %20 = getelementptr i32, i32* %18, i64 4
> +  %21 = bitcast i32* %20 to <4 x i32>*
> +  %wide.load5.3 = load <4 x i32>, <4 x i32>* %21, align 16
> +  %22 = add nsw <4 x i32> %wide.load.3, %16
> +  %23 = add nsw <4 x i32> %wide.load5.3, %17
> +  %index.next.3 = add nsw i64 %index, 32
> +  %24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.3
> +  %25 = bitcast i32* %24 to <4 x i32>*
> +  %wide.load.4 = load <4 x i32>, <4 x i32>* %25, align 16
> +  %26 = getelementptr i32, i32* %24, i64 4
> +  %27 = bitcast i32* %26 to <4 x i32>*
> +  %wide.load5.4 = load <4 x i32>, <4 x i32>* %27, align 16
> +  %28 = add nsw <4 x i32> %wide.load.4, %22
> +  %29 = add nsw <4 x i32> %wide.load5.4, %23
> +  %index.next.4 = add nsw i64 %index, 40
> +  %30 = icmp eq i64 %index.next.4, 1000
> +  br i1 %30, label %middle.block, label %vector.body
> +
> +middle.block:
> +  %.lcssa10 = phi <4 x i32> [ %29, %vector.body ]
> +  %.lcssa = phi <4 x i32> [ %28, %vector.body ]
> +  %bin.rdx = add <4 x i32> %.lcssa10, %.lcssa
> +  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +  %bin.rdx6 = add <4 x i32> %bin.rdx, %rdx.shuf
> +  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx6, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +  %bin.rdx8 = add <4 x i32> %bin.rdx6, %rdx.shuf7
> +  %31 = extractelement <4 x i32> %bin.rdx8, i32 0
> +  ret i32 %31
> +}
> +
> +define i32 @reduce_and() {
> +; CHECK-LABEL: reduce_and
> +; CHECK:       Detected a reduction operation: {{.*}} and
> +; CHECK:       Detected a reduction operation: {{.*}} and
> +; CHECK:       Detected a reduction operation: {{.*}} and
> +; CHECK:       Detected a reduction operation: {{.*}} and
> +; CHECK:       Detected a reduction operation: {{.*}} and
> +; CHECK:       Detected a reduction operation: {{.*}} and
> +; CHECK:       Detected a reduction operation: {{.*}} and
> +; CHECK:       Detected a reduction operation: {{.*}} and
> +; CHECK:       Detected a reduction operation: {{.*}} and
> +
> +entry:
> +  br label %vector.body
> +
> +vector.body:
> +  %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ -4096, %entry ]
> +  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %6, %vector.body ]
> +  %vec.phi9 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %7, %vector.body ]
> +  %uglygep33 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
> +  %uglygep3334 = bitcast i8* %uglygep33 to <4 x i32>*
> +  %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 256
> +  %wide.load = load <4 x i32>, <4 x i32>* %scevgep35, align 16
> +  %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 257
> +  %wide.load10 = load <4 x i32>, <4 x i32>* %scevgep36, align 16
> +  %0 = and <4 x i32> %wide.load, %vec.phi
> +  %1 = and <4 x i32> %wide.load10, %vec.phi9
> +  %uglygep30 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
> +  %uglygep3031 = bitcast i8* %uglygep30 to <4 x i32>*
> +  %scevgep32 = getelementptr <4 x i32>, <4 x i32>* %uglygep3031, i64 258
> +  %wide.load.1 = load <4 x i32>, <4 x i32>* %scevgep32, align 16
> +  %uglygep27 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
> +  %uglygep2728 = bitcast i8* %uglygep27 to <4 x i32>*
> +  %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %uglygep2728, i64 259
> +  %wide.load10.1 = load <4 x i32>, <4 x i32>* %scevgep29, align 16
> +  %2 = and <4 x i32> %wide.load.1, %0
> +  %3 = and <4 x i32> %wide.load10.1, %1
> +  %uglygep24 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
> +  %uglygep2425 = bitcast i8* %uglygep24 to <4 x i32>*
> +  %scevgep26 = getelementptr <4 x i32>, <4 x i32>* %uglygep2425, i64 260
> +  %wide.load.2 = load <4 x i32>, <4 x i32>* %scevgep26, align 16
> +  %uglygep21 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
> +  %uglygep2122 = bitcast i8* %uglygep21 to <4 x i32>*
> +  %scevgep23 = getelementptr <4 x i32>, <4 x i32>* %uglygep2122, i64 261
> +  %wide.load10.2 = load <4 x i32>, <4 x i32>* %scevgep23, align 16
> +  %4 = and <4 x i32> %wide.load.2, %2
> +  %5 = and <4 x i32> %wide.load10.2, %3
> +  %uglygep18 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
> +  %uglygep1819 = bitcast i8* %uglygep18 to <4 x i32>*
> +  %scevgep20 = getelementptr <4 x i32>, <4 x i32>* %uglygep1819, i64 262
> +  %wide.load.3 = load <4 x i32>, <4 x i32>* %scevgep20, align 16
> +  %uglygep = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
> +  %uglygep17 = bitcast i8* %uglygep to <4 x i32>*
> +  %scevgep = getelementptr <4 x i32>, <4 x i32>* %uglygep17, i64 263
> +  %wide.load10.3 = load <4 x i32>, <4 x i32>* %scevgep, align 16
> +  %6 = and <4 x i32> %wide.load.3, %4
> +  %7 = and <4 x i32> %wide.load10.3, %5
> +  %lsr.iv.next = add nsw i64 %lsr.iv, 128
> +  %8 = icmp eq i64 %lsr.iv.next, 0
> +  br i1 %8, label %middle.block, label %vector.body
> +
> +middle.block:
> +  %bin.rdx = and <4 x i32> %7, %6
> +  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +  %bin.rdx11 = and <4 x i32> %bin.rdx, %rdx.shuf
> +  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +  %bin.rdx13 = and <4 x i32> %bin.rdx11, %rdx.shuf12
> +  %9 = extractelement <4 x i32> %bin.rdx13, i32 0
> +  ret i32 %9
> +}
> +
> +define float @reduce_add_float(float* nocapture readonly %a) {
> +; CHECK-LABEL: reduce_add_float
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +; CHECK:       Detected a reduction operation: {{.*}} fadd fast
> +;
> +entry:
> +  br label %vector.body
> +
> +vector.body:
> +  %index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ]
> +  %vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ]
> +  %vec.phi9 = phi <4 x float> [ zeroinitializer, %entry ], [ %29, %vector.body ]
> +  %0 = getelementptr inbounds float, float* %a, i64 %index
> +  %1 = bitcast float* %0 to <4 x float>*
> +  %wide.load = load <4 x float>, <4 x float>* %1, align 4
> +  %2 = getelementptr float, float* %0, i64 4
> +  %3 = bitcast float* %2 to <4 x float>*
> +  %wide.load10 = load <4 x float>, <4 x float>* %3, align 4
> +  %4 = fadd fast <4 x float> %wide.load, %vec.phi
> +  %5 = fadd fast <4 x float> %wide.load10, %vec.phi9
> +  %index.next = add nuw nsw i64 %index, 8
> +  %6 = getelementptr inbounds float, float* %a, i64 %index.next
> +  %7 = bitcast float* %6 to <4 x float>*
> +  %wide.load.1 = load <4 x float>, <4 x float>* %7, align 4
> +  %8 = getelementptr float, float* %6, i64 4
> +  %9 = bitcast float* %8 to <4 x float>*
> +  %wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4
> +  %10 = fadd fast <4 x float> %wide.load.1, %4
> +  %11 = fadd fast <4 x float> %wide.load10.1, %5
> +  %index.next.1 = add nsw i64 %index, 16
> +  %12 = getelementptr inbounds float, float* %a, i64 %index.next.1
> +  %13 = bitcast float* %12 to <4 x float>*
> +  %wide.load.2 = load <4 x float>, <4 x float>* %13, align 4
> +  %14 = getelementptr float, float* %12, i64 4
> +  %15 = bitcast float* %14 to <4 x float>*
> +  %wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4
> +  %16 = fadd fast <4 x float> %wide.load.2, %10
> +  %17 = fadd fast <4 x float> %wide.load10.2, %11
> +  %index.next.2 = add nsw i64 %index, 24
> +  %18 = getelementptr inbounds float, float* %a, i64 %index.next.2
> +  %19 = bitcast float* %18 to <4 x float>*
> +  %wide.load.3 = load <4 x float>, <4 x float>* %19, align 4
> +  %20 = getelementptr float, float* %18, i64 4
> +  %21 = bitcast float* %20 to <4 x float>*
> +  %wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4
> +  %22 = fadd fast <4 x float> %wide.load.3, %16
> +  %23 = fadd fast <4 x float> %wide.load10.3, %17
> +  %index.next.3 = add nsw i64 %index, 32
> +  %24 = getelementptr inbounds float, float* %a, i64 %index.next.3
> +  %25 = bitcast float* %24 to <4 x float>*
> +  %wide.load.4 = load <4 x float>, <4 x float>* %25, align 4
> +  %26 = getelementptr float, float* %24, i64 4
> +  %27 = bitcast float* %26 to <4 x float>*
> +  %wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4
> +  %28 = fadd fast <4 x float> %wide.load.4, %22
> +  %29 = fadd fast <4 x float> %wide.load10.4, %23
> +  %index.next.4 = add nsw i64 %index, 40
> +  %30 = icmp eq i64 %index.next.4, 1000
> +  br i1 %30, label %middle.block, label %vector.body
> +
> +middle.block:
> +  %.lcssa15 = phi <4 x float> [ %29, %vector.body ]
> +  %.lcssa = phi <4 x float> [ %28, %vector.body ]
> +  %bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa
> +  %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +  %bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf
> +  %rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +  %bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12
> +  %31 = extractelement <4 x float> %bin.rdx13, i32 0
> +  ret float %31
> +}
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits

-------------- next part --------------
/* ./compiler_261804/bin/clang -c -O2 besttry.c  */
typedef signed char int8_t;
typedef int int32_t;
typedef long int int64_t;
typedef unsigned char uint8_t;
typedef unsigned short int uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long int uint64_t;
typedef uint8_t poly8_t;
typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
static inline __attribute__((__always_inline__, __nodebug__)) uint8x16_t
    vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
  uint8x16_t __ret;
  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                  11, 12, 13, 14, 15);
  return __ret;
}
static inline __attribute__((__always_inline__, __nodebug__)) uint32x4_t
    vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
  uint32x4_t __ret;
  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
  return __ret;
}
static inline __attribute__((__always_inline__, __nodebug__)) int32x2_t
    vmovn_s64(int64x2_t __p0) {
  int32x2_t __ret;
  __ret = (int32x2_t)__builtin_neon_vmovn_v((int8x16_t)__p0, 2);
  return __ret;
}
static inline __attribute__((__always_inline__, __nodebug__)) int64x2_t
    vmull_s32(int32x2_t __p0, int32x2_t __p1) {
  int64x2_t __ret;
  __ret = (int64x2_t)__builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
  return __ret;
}
static inline __attribute__((__always_inline__, __nodebug__)) int64x2_t
    vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
  int64x2_t __ret;
  __ret = (int64x2_t)__builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1,
                                              (int8x8_t)__p2, 35);
  return __ret;
}
static inline __attribute__((__always_inline__, __nodebug__)) int32x2_t
    vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
  int32x2_t __ret;
  __ret =
      (int32x2_t)__builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
  return __ret;
}
static inline __attribute__((__always_inline__, __nodebug__)) uint16x8_t
    vrev64q_u16(uint16x8_t __p0) {
  uint16x8_t __ret;
  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
  return __ret;
}
static inline __attribute__((__always_inline__, __nodebug__)) uint32x2_t
    vtst_u32(uint32x2_t __p0, uint32x2_t __p1) {
  uint32x2_t __ret;
  __ret = (uint32x2_t)__builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
  return __ret;
}
static inline __attribute__((__always_inline__, __nodebug__)) int64x2_t
    vpaddq_s64(int64x2_t __p0, int64x2_t __p1) {
  int64x2_t __ret;
  __ret =
      (int64x2_t)__builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
  return __ret;
}
static inline __attribute__((__always_inline__, __nodebug__)) int64x2_t
    vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
  int64x2_t __ret;
  __ret = __p0 + vmull_s32(__p1, __p2);
  return __ret;
}
int dotests_56() {
  {
    {
      int64x2_t val11;
      uint8x8_t val16;
      uint32x2_t val17;
      uint32x2_t val18;
      uint32x2_t val21;
      uint32x2_t val22;
      uint32x4_t val50;
      uint64x2_t val55;
      uint32x4_t val56;
      uint32x2_t val57;
      uint16x8_t val58;
      int32x2_t val60;
      int32x2_t val61;
      int64x2_t val62;
      int32x2_t val63;
      int32x2_t val64;
      int32x2_t val70;
      char *str1 = "Failure 56";
      uint64_t got;
      uint64_t exp;
      val17 = ((uint32x2_t)0xc4d1ffff80007fffUL);
      val22 = ((uint32x2_t)0x80008000ffffffffUL);
      val21 = vtst_u32(val18, (uint32x2_t)0L);
      val56 = __extension__({
        uint32x2_t __s0_208 = __extension__({
          uint64x2_t __s0 = __extension__({
            uint32x2_t __s0 = (uint32x2_t)0L;
            uint32x4_t __s1 = val50;
            uint64x2_t __ret;
            __ret = (uint64x2_t)(__uint128_t) 0LL;
            __ret;
          });
          uint32x2_t __ret;
          __ret =
              (uint32x2_t)__builtin_neon_vqshrn_n_v((int8x16_t)__s0, 19, 18);
          __ret;
        });
        uint64x2_t __s1_208 = val55;
        uint32x4_t __ret_208;
        __ret_208 = (uint32x4_t)(vcombine_u32(
            (uint32x2_t)(__s0_208), (uint32x2_t)(__extension__({
                                      uint64x2_t __s0 = __s1_208;
                                      uint32x2_t __ret;
                                      __ret =
                                          (uint32x2_t)__builtin_neon_vrshrn_n_v(
                                              (int8x16_t)__s0, 8, 18);
                                      __ret;
                                    }))));
        __ret_208;
      });
      val57 = __extension__({
        uint32x2_t __s0 = (val17 + (val21 - val22));
        uint32x4_t __s1 = ~(val56);
        uint32x2_t __ret;
        __ret = __s0 * __builtin_shufflevector(__s1, __s1, 1, 1);
        __ret;
      });
      val58 = vrev64q_u16((uint16x8_t)((uint64x2_t)(__extension__({
        uint32x2_t __s0 = val57;
        uint32x4_t __ret;
        __ret = __builtin_shufflevector(__s0, __s0, 0, 0, 0, 0);
        __ret;
      }))));
      val62 = vmlal_s32((int64x2_t)((poly8x16_t)(__extension__({
                          uint8x8_t __s0_192 = val16;
                          uint16x8_t __s1_192 = val58;
                          uint8x16_t __ret_192;
                          __ret_192 = (uint8x16_t)(vcombine_u8(
                              (uint8x8_t)(__s0_192),
                              (uint8x8_t)(__extension__({
                                uint16x8_t __s0 = __s1_192;
                                uint8x8_t __ret;
                                __ret = (uint8x8_t)__builtin_neon_vqshrn_n_v(
                                    (int8x16_t)__s0, 1, 16);
                                __ret;
                              }))));
                          __ret_192;
                        }))),
                        val60, val61);
      val70 = __extension__({
        int32x2_t __s0 =
            vmovn_s64(vqdmlal_s32(vpaddq_s64(val11, val62), val63, val64));
        int32x2_t __s1 = (int32x2_t)0L;
        int32x2_t __ret;
        __ret = vqdmulh_s32(__s0, __builtin_shufflevector(__s1, __s1, 0, 0));
        __ret;
      });
      got = __extension__({
        uint64x1_t __s0 = (uint64x1_t)(val70);
        uint64_t __ret;
        __ret = (uint64_t)__builtin_neon_vget_lane_i64((int8x8_t)__s0, 0);
        __ret;
      });
      exp = 0x995f285701bf9d77UL;
      return (exp != got);
    }
  }
}

