[llvm-commits] [llvm] r106396 - in /llvm/trunk: lib/Transforms/Scalar/LoopStrengthReduce.cpp test/CodeGen/X86/lsr-reuse.ll

Sat Jun 19 14:29:59 PDT 2010

Author: djg
Date: Sat Jun 19 16:29:59 2010
New Revision: 106396

URL: http://llvm.org/viewvc/llvm-project?rev=106396&view=rev
Log:
Include the use kind along with the expression in the key of the
use sharing map. The reconcileNewOffset logic already forces a
separate use if the kinds differ, so incorporating the kind in the
key means we can track more sharing opportunities.

More sharing means fewer total uses to track, which means smaller
problem sizes, which means the conservative throttles don't kick
in as often.

Modified:
    llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
    llvm/trunk/test/CodeGen/X86/lsr-reuse.ll

Modified: llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp?rev=106396&r1=106395&r2=106396&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp Sat Jun 19 16:29:59 2010
@@ -1207,6 +1207,30 @@
 
 namespace {
 
+/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
+/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
+struct UseMapDenseMapInfo {
+  static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
+    return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
+  }
+
+  static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
+    return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
+  }
+
+  static unsigned
+  getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
+    unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
+    Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
+    return Result;
+  }
+
+  static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
+                      const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
+    return LHS == RHS;
+  }
+};
+
 /// FormulaSorter - This class implements an ordering for formulae which sorts
 /// the by their standalone cost.
 class FormulaSorter {
@@ -1279,7 +1303,9 @@
   }
 
   // Support for sharing of LSRUses between LSRFixups.
-  typedef DenseMap<const SCEV *, size_t> UseMapTy;
+  typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
+                   size_t,
+                   UseMapDenseMapInfo> UseMapTy;
   UseMapTy UseMap;
 
   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
@@ -1837,7 +1863,7 @@
   }
 
   std::pair<UseMapTy::iterator, bool> P =
-    UseMap.insert(std::make_pair(Expr, 0));
+    UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
   if (!P.second) {
     // A use already existed with this base.
     size_t LUIdx = P.first->second;

Modified: llvm/trunk/test/CodeGen/X86/lsr-reuse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/lsr-reuse.ll?rev=106396&r1=106395&r2=106396&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/lsr-reuse.ll (original)
+++ llvm/trunk/test/CodeGen/X86/lsr-reuse.ll Sat Jun 19 16:29:59 2010
@@ -440,3 +440,312 @@
   %s.1.lcssa = phi i32 [ 0, %entry ], [ %s.0.lcssa, %bb3 ] ; <i32> [#uses=1]
   ret i32 %s.1.lcssa
 }
+
+; Two loops here are of particular interest; the one at %bb21, where
+; we don't want to leave extra induction variables around, or use an
+; lea to compute an exit condition inside the loop:
+
+; CHECK: test:
+
+; CHECK:      BB10_4:
+; CHECK-NEXT:   movaps  %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   addss   %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   mulss   (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK-NEXT:   movss   %xmm{{.*}}, (%r{{[^,]*}})
+; CHECK-NEXT:   addq    $4, %r{{.*}}
+; CHECK-NEXT:   decq    %r{{.*}}
+; CHECK-NEXT:   addq    $4, %r{{.*}}
+; CHECK-NEXT:   movaps  %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT: BB10_2:
+; CHECK-NEXT:   testq   %r{{.*}}, %r{{.*}}
+; CHECK-NEXT:   jle
+; CHECK-NEXT:   testb   $15, %r{{.*}}
+; CHECK-NEXT:   jne
+
+; And the one at %bb68, where we want to be sure to use superhero mode:
+
+; CHECK:      BB10_10:
+; CHECK-NEXT:   movaps  %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   mulps   48(%r{{[^,]*}}), %xmm{{.*}}
+; CHECK-NEXT:   movaps  %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   mulps   32(%r{{[^,]*}}), %xmm{{.*}}
+; CHECK-NEXT:   movaps  %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   mulps   16(%r{{[^,]*}}), %xmm{{.*}}
+; CHECK-NEXT:   movaps  %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   mulps   (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK-NEXT:   movaps  %xmm{{.*}}, (%r{{[^,]*}})
+; CHECK-NEXT:   movaps  %xmm{{.*}}, 16(%r{{[^,]*}})
+; CHECK-NEXT:   movaps  %xmm{{.*}}, 32(%r{{[^,]*}})
+; CHECK-NEXT:   movaps  %xmm{{.*}}, 48(%r{{[^,]*}})
+; CHECK-NEXT:   addps   %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   addps   %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   addps   %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   addps   %xmm{{.*}}, %xmm{{.*}}
+; CHECK-NEXT:   addq    $64, %r{{.*}}
+; CHECK-NEXT:   addq    $64, %r{{.*}}
+; CHECK-NEXT:   addq    $-16, %r{{.*}}
+; CHECK-NEXT: BB10_11:
+; CHECK-NEXT:   cmpq    $15, %r{{.*}}
+; CHECK-NEXT:   jg
+
+define void @test(float* %arg, i64 %arg1, float* nocapture %arg2, float* nocapture %arg3, float* %arg4, i64 %arg5, i64 %arg6) nounwind {
+bb:
+  %t = alloca float, align 4                      ; <float*> [#uses=3]
+  %t7 = alloca float, align 4                     ; <float*> [#uses=2]
+  %t8 = load float* %arg3                         ; <float> [#uses=8]
+  %t9 = ptrtoint float* %arg to i64               ; <i64> [#uses=1]
+  %t10 = ptrtoint float* %arg4 to i64             ; <i64> [#uses=1]
+  %t11 = xor i64 %t10, %t9                        ; <i64> [#uses=1]
+  %t12 = and i64 %t11, 15                         ; <i64> [#uses=1]
+  %t13 = icmp eq i64 %t12, 0                      ; <i1> [#uses=1]
+  %t14 = xor i64 %arg1, 1                         ; <i64> [#uses=1]
+  %t15 = xor i64 %arg5, 1                         ; <i64> [#uses=1]
+  %t16 = or i64 %t15, %t14                        ; <i64> [#uses=1]
+  %t17 = trunc i64 %t16 to i32                    ; <i32> [#uses=1]
+  %t18 = icmp eq i32 %t17, 0                      ; <i1> [#uses=1]
+  br i1 %t18, label %bb19, label %bb213
+
+bb19:                                             ; preds = %bb
+  %t20 = load float* %arg2                        ; <float> [#uses=1]
+  br label %bb21
+
+bb21:                                             ; preds = %bb32, %bb19
+  %t22 = phi i64 [ %t36, %bb32 ], [ 0, %bb19 ]    ; <i64> [#uses=21]
+  %t23 = phi float [ %t35, %bb32 ], [ %t20, %bb19 ] ; <float> [#uses=6]
+  %t24 = sub i64 %arg6, %t22                      ; <i64> [#uses=4]
+  %t25 = getelementptr float* %arg4, i64 %t22     ; <float*> [#uses=4]
+  %t26 = getelementptr float* %arg, i64 %t22      ; <float*> [#uses=3]
+  %t27 = icmp sgt i64 %t24, 0                     ; <i1> [#uses=1]
+  br i1 %t27, label %bb28, label %bb37
+
+bb28:                                             ; preds = %bb21
+  %t29 = ptrtoint float* %t25 to i64              ; <i64> [#uses=1]
+  %t30 = and i64 %t29, 15                         ; <i64> [#uses=1]
+  %t31 = icmp eq i64 %t30, 0                      ; <i1> [#uses=1]
+  br i1 %t31, label %bb37, label %bb32
+
+bb32:                                             ; preds = %bb28
+  %t33 = load float* %t26                         ; <float> [#uses=1]
+  %t34 = fmul float %t23, %t33                    ; <float> [#uses=1]
+  store float %t34, float* %t25
+  %t35 = fadd float %t23, %t8                     ; <float> [#uses=1]
+  %t36 = add i64 %t22, 1                          ; <i64> [#uses=1]
+  br label %bb21
+
+bb37:                                             ; preds = %bb28, %bb21
+  %t38 = fmul float %t8, 4.000000e+00             ; <float> [#uses=1]
+  store float %t38, float* %t
+  %t39 = fmul float %t8, 1.600000e+01             ; <float> [#uses=1]
+  store float %t39, float* %t7
+  %t40 = fmul float %t8, 0.000000e+00             ; <float> [#uses=1]
+  %t41 = fadd float %t23, %t40                    ; <float> [#uses=1]
+  %t42 = insertelement <4 x float> undef, float %t41, i32 0 ; <<4 x float>> [#uses=1]
+  %t43 = fadd float %t23, %t8                     ; <float> [#uses=1]
+  %t44 = insertelement <4 x float> %t42, float %t43, i32 1 ; <<4 x float>> [#uses=1]
+  %t45 = fmul float %t8, 2.000000e+00             ; <float> [#uses=1]
+  %t46 = fadd float %t23, %t45                    ; <float> [#uses=1]
+  %t47 = insertelement <4 x float> %t44, float %t46, i32 2 ; <<4 x float>> [#uses=1]
+  %t48 = fmul float %t8, 3.000000e+00             ; <float> [#uses=1]
+  %t49 = fadd float %t23, %t48                    ; <float> [#uses=1]
+  %t50 = insertelement <4 x float> %t47, float %t49, i32 3 ; <<4 x float>> [#uses=5]
+  %t51 = call <4 x float> asm "movss $1, $0\09\0Apshufd $$0, $0, $0", "=x,*m,~{dirflag},~{fpsr},~{flags}"(float* %t) nounwind ; <<4 x float>> [#uses=3]
+  %t52 = fadd <4 x float> %t50, %t51              ; <<4 x float>> [#uses=3]
+  %t53 = fadd <4 x float> %t52, %t51              ; <<4 x float>> [#uses=3]
+  %t54 = fadd <4 x float> %t53, %t51              ; <<4 x float>> [#uses=2]
+  %t55 = call <4 x float> asm "movss $1, $0\09\0Apshufd $$0, $0, $0", "=x,*m,~{dirflag},~{fpsr},~{flags}"(float* %t7) nounwind ; <<4 x float>> [#uses=8]
+  %t56 = icmp sgt i64 %t24, 15                    ; <i1> [#uses=2]
+  br i1 %t13, label %bb57, label %bb118
+
+bb57:                                             ; preds = %bb37
+  br i1 %t56, label %bb61, label %bb112
+
+bb58:                                             ; preds = %bb68
+  %t59 = getelementptr float* %arg, i64 %t78      ; <float*> [#uses=1]
+  %t60 = getelementptr float* %arg4, i64 %t78     ; <float*> [#uses=1]
+  br label %bb112
+
+bb61:                                             ; preds = %bb57
+  %t62 = add i64 %t22, 16                         ; <i64> [#uses=1]
+  %t63 = add i64 %t22, 4                          ; <i64> [#uses=1]
+  %t64 = add i64 %t22, 8                          ; <i64> [#uses=1]
+  %t65 = add i64 %t22, 12                         ; <i64> [#uses=1]
+  %t66 = add i64 %arg6, -16                       ; <i64> [#uses=1]
+  %t67 = sub i64 %t66, %t22                       ; <i64> [#uses=1]
+  br label %bb68
+
+bb68:                                             ; preds = %bb68, %bb61
+  %t69 = phi i64 [ 0, %bb61 ], [ %t111, %bb68 ]   ; <i64> [#uses=3]
+  %t70 = phi <4 x float> [ %t54, %bb61 ], [ %t107, %bb68 ] ; <<4 x float>> [#uses=2]
+  %t71 = phi <4 x float> [ %t50, %bb61 ], [ %t103, %bb68 ] ; <<4 x float>> [#uses=2]
+  %t72 = phi <4 x float> [ %t53, %bb61 ], [ %t108, %bb68 ] ; <<4 x float>> [#uses=2]
+  %t73 = phi <4 x float> [ %t52, %bb61 ], [ %t109, %bb68 ] ; <<4 x float>> [#uses=2]
+  %t74 = shl i64 %t69, 4                          ; <i64> [#uses=5]
+  %t75 = add i64 %t22, %t74                       ; <i64> [#uses=2]
+  %t76 = getelementptr float* %arg, i64 %t75      ; <float*> [#uses=1]
+  %t77 = bitcast float* %t76 to <4 x float>*      ; <<4 x float>*> [#uses=1]
+  %t78 = add i64 %t62, %t74                       ; <i64> [#uses=2]
+  %t79 = add i64 %t63, %t74                       ; <i64> [#uses=2]
+  %t80 = getelementptr float* %arg, i64 %t79      ; <float*> [#uses=1]
+  %t81 = bitcast float* %t80 to <4 x float>*      ; <<4 x float>*> [#uses=1]
+  %t82 = add i64 %t64, %t74                       ; <i64> [#uses=2]
+  %t83 = getelementptr float* %arg, i64 %t82      ; <float*> [#uses=1]
+  %t84 = bitcast float* %t83 to <4 x float>*      ; <<4 x float>*> [#uses=1]
+  %t85 = add i64 %t65, %t74                       ; <i64> [#uses=2]
+  %t86 = getelementptr float* %arg, i64 %t85      ; <float*> [#uses=1]
+  %t87 = bitcast float* %t86 to <4 x float>*      ; <<4 x float>*> [#uses=1]
+  %t88 = getelementptr float* %arg4, i64 %t75     ; <float*> [#uses=1]
+  %t89 = bitcast float* %t88 to <4 x float>*      ; <<4 x float>*> [#uses=1]
+  %t90 = getelementptr float* %arg4, i64 %t79     ; <float*> [#uses=1]
+  %t91 = bitcast float* %t90 to <4 x float>*      ; <<4 x float>*> [#uses=1]
+  %t92 = getelementptr float* %arg4, i64 %t82     ; <float*> [#uses=1]
+  %t93 = bitcast float* %t92 to <4 x float>*      ; <<4 x float>*> [#uses=1]
+  %t94 = getelementptr float* %arg4, i64 %t85     ; <float*> [#uses=1]
+  %t95 = bitcast float* %t94 to <4 x float>*      ; <<4 x float>*> [#uses=1]
+  %t96 = mul i64 %t69, -16                        ; <i64> [#uses=1]
+  %t97 = add i64 %t67, %t96                       ; <i64> [#uses=2]
+  %t98 = load <4 x float>* %t77                   ; <<4 x float>> [#uses=1]
+  %t99 = load <4 x float>* %t81                   ; <<4 x float>> [#uses=1]
+  %t100 = load <4 x float>* %t84                  ; <<4 x float>> [#uses=1]
+  %t101 = load <4 x float>* %t87                  ; <<4 x float>> [#uses=1]
+  %t102 = fmul <4 x float> %t98, %t71             ; <<4 x float>> [#uses=1]
+  %t103 = fadd <4 x float> %t71, %t55             ; <<4 x float>> [#uses=2]
+  %t104 = fmul <4 x float> %t99, %t73             ; <<4 x float>> [#uses=1]
+  %t105 = fmul <4 x float> %t100, %t72            ; <<4 x float>> [#uses=1]
+  %t106 = fmul <4 x float> %t101, %t70            ; <<4 x float>> [#uses=1]
+  store <4 x float> %t102, <4 x float>* %t89
+  store <4 x float> %t104, <4 x float>* %t91
+  store <4 x float> %t105, <4 x float>* %t93
+  store <4 x float> %t106, <4 x float>* %t95
+  %t107 = fadd <4 x float> %t70, %t55             ; <<4 x float>> [#uses=1]
+  %t108 = fadd <4 x float> %t72, %t55             ; <<4 x float>> [#uses=1]
+  %t109 = fadd <4 x float> %t73, %t55             ; <<4 x float>> [#uses=1]
+  %t110 = icmp sgt i64 %t97, 15                   ; <i1> [#uses=1]
+  %t111 = add i64 %t69, 1                         ; <i64> [#uses=1]
+  br i1 %t110, label %bb68, label %bb58
+
+bb112:                                            ; preds = %bb58, %bb57
+  %t113 = phi float* [ %t59, %bb58 ], [ %t26, %bb57 ] ; <float*> [#uses=1]
+  %t114 = phi float* [ %t60, %bb58 ], [ %t25, %bb57 ] ; <float*> [#uses=1]
+  %t115 = phi <4 x float> [ %t103, %bb58 ], [ %t50, %bb57 ] ; <<4 x float>> [#uses=1]
+  %t116 = phi i64 [ %t97, %bb58 ], [ %t24, %bb57 ] ; <i64> [#uses=1]
+  %t117 = call <4 x float> asm "movss $1, $0\09\0Apshufd $$0, $0, $0", "=x,*m,~{dirflag},~{fpsr},~{flags}"(float* %t) nounwind ; <<4 x float>> [#uses=0]
+  br label %bb194
+
+bb118:                                            ; preds = %bb37
+  br i1 %t56, label %bb122, label %bb194
+
+bb119:                                            ; preds = %bb137
+  %t120 = getelementptr float* %arg, i64 %t145    ; <float*> [#uses=1]
+  %t121 = getelementptr float* %arg4, i64 %t145   ; <float*> [#uses=1]
+  br label %bb194
+
+bb122:                                            ; preds = %bb118
+  %t123 = add i64 %t22, -1                        ; <i64> [#uses=1]
+  %t124 = getelementptr inbounds float* %arg, i64 %t123 ; <float*> [#uses=1]
+  %t125 = bitcast float* %t124 to <4 x float>*    ; <<4 x float>*> [#uses=1]
+  %t126 = load <4 x float>* %t125                 ; <<4 x float>> [#uses=1]
+  %t127 = add i64 %t22, 16                        ; <i64> [#uses=1]
+  %t128 = add i64 %t22, 3                         ; <i64> [#uses=1]
+  %t129 = add i64 %t22, 7                         ; <i64> [#uses=1]
+  %t130 = add i64 %t22, 11                        ; <i64> [#uses=1]
+  %t131 = add i64 %t22, 15                        ; <i64> [#uses=1]
+  %t132 = add i64 %t22, 4                         ; <i64> [#uses=1]
+  %t133 = add i64 %t22, 8                         ; <i64> [#uses=1]
+  %t134 = add i64 %t22, 12                        ; <i64> [#uses=1]
+  %t135 = add i64 %arg6, -16                      ; <i64> [#uses=1]
+  %t136 = sub i64 %t135, %t22                     ; <i64> [#uses=1]
+  br label %bb137
+
+bb137:                                            ; preds = %bb137, %bb122
+  %t138 = phi i64 [ 0, %bb122 ], [ %t193, %bb137 ] ; <i64> [#uses=3]
+  %t139 = phi <4 x float> [ %t54, %bb122 ], [ %t189, %bb137 ] ; <<4 x float>> [#uses=2]
+  %t140 = phi <4 x float> [ %t50, %bb122 ], [ %t185, %bb137 ] ; <<4 x float>> [#uses=2]
+  %t141 = phi <4 x float> [ %t53, %bb122 ], [ %t190, %bb137 ] ; <<4 x float>> [#uses=2]
+  %t142 = phi <4 x float> [ %t52, %bb122 ], [ %t191, %bb137 ] ; <<4 x float>> [#uses=2]
+  %t143 = phi <4 x float> [ %t126, %bb122 ], [ %t175, %bb137 ] ; <<4 x float>> [#uses=1]
+  %t144 = shl i64 %t138, 4                        ; <i64> [#uses=9]
+  %t145 = add i64 %t127, %t144                    ; <i64> [#uses=2]
+  %t146 = add i64 %t128, %t144                    ; <i64> [#uses=1]
+  %t147 = getelementptr float* %arg, i64 %t146    ; <float*> [#uses=1]
+  %t148 = bitcast float* %t147 to <4 x float>*    ; <<4 x float>*> [#uses=1]
+  %t149 = add i64 %t129, %t144                    ; <i64> [#uses=1]
+  %t150 = getelementptr float* %arg, i64 %t149    ; <float*> [#uses=1]
+  %t151 = bitcast float* %t150 to <4 x float>*    ; <<4 x float>*> [#uses=1]
+  %t152 = add i64 %t130, %t144                    ; <i64> [#uses=1]
+  %t153 = getelementptr float* %arg, i64 %t152    ; <float*> [#uses=1]
+  %t154 = bitcast float* %t153 to <4 x float>*    ; <<4 x float>*> [#uses=1]
+  %t155 = add i64 %t131, %t144                    ; <i64> [#uses=1]
+  %t156 = getelementptr float* %arg, i64 %t155    ; <float*> [#uses=1]
+  %t157 = bitcast float* %t156 to <4 x float>*    ; <<4 x float>*> [#uses=1]
+  %t158 = add i64 %t22, %t144                     ; <i64> [#uses=1]
+  %t159 = getelementptr float* %arg4, i64 %t158   ; <float*> [#uses=1]
+  %t160 = bitcast float* %t159 to <4 x float>*    ; <<4 x float>*> [#uses=1]
+  %t161 = add i64 %t132, %t144                    ; <i64> [#uses=1]
+  %t162 = getelementptr float* %arg4, i64 %t161   ; <float*> [#uses=1]
+  %t163 = bitcast float* %t162 to <4 x float>*    ; <<4 x float>*> [#uses=1]
+  %t164 = add i64 %t133, %t144                    ; <i64> [#uses=1]
+  %t165 = getelementptr float* %arg4, i64 %t164   ; <float*> [#uses=1]
+  %t166 = bitcast float* %t165 to <4 x float>*    ; <<4 x float>*> [#uses=1]
+  %t167 = add i64 %t134, %t144                    ; <i64> [#uses=1]
+  %t168 = getelementptr float* %arg4, i64 %t167   ; <float*> [#uses=1]
+  %t169 = bitcast float* %t168 to <4 x float>*    ; <<4 x float>*> [#uses=1]
+  %t170 = mul i64 %t138, -16                      ; <i64> [#uses=1]
+  %t171 = add i64 %t136, %t170                    ; <i64> [#uses=2]
+  %t172 = load <4 x float>* %t148                 ; <<4 x float>> [#uses=2]
+  %t173 = load <4 x float>* %t151                 ; <<4 x float>> [#uses=2]
+  %t174 = load <4 x float>* %t154                 ; <<4 x float>> [#uses=2]
+  %t175 = load <4 x float>* %t157                 ; <<4 x float>> [#uses=2]
+  %t176 = shufflevector <4 x float> %t143, <4 x float> %t172, <4 x i32> <i32 4, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+  %t177 = shufflevector <4 x float> %t176, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> ; <<4 x float>> [#uses=1]
+  %t178 = shufflevector <4 x float> %t172, <4 x float> %t173, <4 x i32> <i32 4, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+  %t179 = shufflevector <4 x float> %t178, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> ; <<4 x float>> [#uses=1]
+  %t180 = shufflevector <4 x float> %t173, <4 x float> %t174, <4 x i32> <i32 4, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+  %t181 = shufflevector <4 x float> %t180, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> ; <<4 x float>> [#uses=1]
+  %t182 = shufflevector <4 x float> %t174, <4 x float> %t175, <4 x i32> <i32 4, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+  %t183 = shufflevector <4 x float> %t182, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> ; <<4 x float>> [#uses=1]
+  %t184 = fmul <4 x float> %t177, %t140           ; <<4 x float>> [#uses=1]
+  %t185 = fadd <4 x float> %t140, %t55            ; <<4 x float>> [#uses=2]
+  %t186 = fmul <4 x float> %t179, %t142           ; <<4 x float>> [#uses=1]
+  %t187 = fmul <4 x float> %t181, %t141           ; <<4 x float>> [#uses=1]
+  %t188 = fmul <4 x float> %t183, %t139           ; <<4 x float>> [#uses=1]
+  store <4 x float> %t184, <4 x float>* %t160
+  store <4 x float> %t186, <4 x float>* %t163
+  store <4 x float> %t187, <4 x float>* %t166
+  store <4 x float> %t188, <4 x float>* %t169
+  %t189 = fadd <4 x float> %t139, %t55            ; <<4 x float>> [#uses=1]
+  %t190 = fadd <4 x float> %t141, %t55            ; <<4 x float>> [#uses=1]
+  %t191 = fadd <4 x float> %t142, %t55            ; <<4 x float>> [#uses=1]
+  %t192 = icmp sgt i64 %t171, 15                  ; <i1> [#uses=1]
+  %t193 = add i64 %t138, 1                        ; <i64> [#uses=1]
+  br i1 %t192, label %bb137, label %bb119
+
+bb194:                                            ; preds = %bb119, %bb118, %bb112
+  %t195 = phi i64 [ %t116, %bb112 ], [ %t171, %bb119 ], [ %t24, %bb118 ] ; <i64> [#uses=2]
+  %t196 = phi <4 x float> [ %t115, %bb112 ], [ %t185, %bb119 ], [ %t50, %bb118 ] ; <<4 x float>> [#uses=1]
+  %t197 = phi float* [ %t114, %bb112 ], [ %t121, %bb119 ], [ %t25, %bb118 ] ; <float*> [#uses=1]
+  %t198 = phi float* [ %t113, %bb112 ], [ %t120, %bb119 ], [ %t26, %bb118 ] ; <float*> [#uses=1]
+  %t199 = extractelement <4 x float> %t196, i32 0 ; <float> [#uses=2]
+  %t200 = icmp sgt i64 %t195, 0                   ; <i1> [#uses=1]
+  br i1 %t200, label %bb201, label %bb211
+
+bb201:                                            ; preds = %bb201, %bb194
+  %t202 = phi i64 [ %t209, %bb201 ], [ 0, %bb194 ] ; <i64> [#uses=3]
+  %t203 = phi float [ %t208, %bb201 ], [ %t199, %bb194 ] ; <float> [#uses=2]
+  %t204 = getelementptr float* %t198, i64 %t202   ; <float*> [#uses=1]
+  %t205 = getelementptr float* %t197, i64 %t202   ; <float*> [#uses=1]
+  %t206 = load float* %t204                       ; <float> [#uses=1]
+  %t207 = fmul float %t203, %t206                 ; <float> [#uses=1]
+  store float %t207, float* %t205
+  %t208 = fadd float %t203, %t8                   ; <float> [#uses=2]
+  %t209 = add i64 %t202, 1                        ; <i64> [#uses=2]
+  %t210 = icmp eq i64 %t209, %t195                ; <i1> [#uses=1]
+  br i1 %t210, label %bb211, label %bb201
+
+bb211:                                            ; preds = %bb201, %bb194
+  %t212 = phi float [ %t199, %bb194 ], [ %t208, %bb201 ] ; <float> [#uses=1]
+  store float %t212, float* %arg2
+  ret void
+
+bb213:                                            ; preds = %bb
+  ret void
+}