[llvm] r212324 - [x86] Generalize BuildVectorSDNode::getConstantSplatValue to work for

Mon Jul 7 11:37:49 PDT 2014

On 04/07/2014 10:11, Chandler Carruth wrote:
> Author: chandlerc
> Date: Fri Jul  4 03:11:49 2014
> New Revision: 212324
>
> URL: http://llvm.org/viewvc/llvm-project?rev=212324&view=rev
> Log:
> [x86] Generalize BuildVectorSDNode::getConstantSplatValue to work for
> any constant, constant FP, or undef splat and to tolerate any undef
> lanes in a splat, then replace all uses of isSplatVector in X86's
> lowering with it.
>
> This fixes issues where undef lanes in an otherwise splat vector would
> prevent the splat logic from firing. It is a touch more awkward to use
> this interface, but it is much more accurate. Suggestions for better
> interface structuring welcome.
>
> With this fix, the code generated with the widening legalization
> strategy for widen_cast-4.ll is *dramatically* improved as the special
> lowering strategies for a v16i8 SRA kick in even though the high lanes
> are undef.
>
> We also get a slightly different choice for broadcasting an aligned
> memory location, and use vpshufd instead of vbroadcastss. This looks
> like a minor win for pipelining and domain crossing, but a minor loss
> for the number of micro-ops. I suspect its a wash, but folks can easily
> tweak the lowering if they want.
>
> Modified:
>      llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
>      llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
>      llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
>      llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
>      llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>      llvm/trunk/test/CodeGen/X86/vector-gep.ll
>      llvm/trunk/test/CodeGen/X86/widen_cast-4.ll

The commit quoted above caused a test-suite failure on my nightly 
test-suite runners.

If you compile out.ll with clang, the output before and after this 
change will be different. The original .s files were very different 
between the two versions. To pinpoint the bug, I replaced all but one 
of the undef values with 'zeroinitializer', so that only a single undef 
value remains and, consequently, there is only a single difference in 
the generated assembly file.

I have not yet fully understood the code, but the piece of LLVM IR in 
question is the following:

   %24 = trunc i64 0 to i32
   %broadcast.splatinsert18.unr = insertelement <2 x i32> undef, i32 
%24, i32 0
   %broadcast.splat19.unr = shufflevector <2 x i32> 
%broadcast.splatinsert18.unr, <2 x i32> zeroinitializer, <2 x i32> 
zeroinitializer
   %induction20.unr = add <2 x i32> %broadcast.splat19.unr, <i32 0, i32 1>
   %induction21.unr = add <2 x i32> %broadcast.splat19.unr, <i32 2, i32 3>
   %25 = getelementptr i32* %1, i64 0
   %26 = bitcast i32* %25 to <2 x i32>*
   store <2 x i32> %induction20.unr, <2 x i32>* %26, align 4
   %.sum98.unr = or i64 0, 2
   %27 = getelementptr i32* %1, i64 %.sum98.unr
   %28 = bitcast i32* %27 to <2 x i32>*
   store <2 x i32> %induction21.unr, <2 x i32>* %28, align 4
   %index.next.unr = add i64 0, 4
   %29 = icmp eq i64 %index.next.unr, %n.vec

and this is the corresponding assembly file change:

  .LBB0_15:                               # %vector.body.unr
  	movabsq	$4, %rax
-	movl	$1, %ecx
+	movl	$2, %ecx

Unrelated?

  	movl	%ecx, %edx
  	movd	%rdx, %xmm0
-	pslldq	$8, %xmm0
-	movaps	.LCPI0_0(%rip), %xmm1
-	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
+	xorps	%xmm1, %xmm1

Do we lose some data here?

  	movq	296(%rsp), %rdx         # 8-byte Reload
-	movq	%xmm0, (%rdx)
-	pshufd	$8, %xmm1, %xmm0        # xmm0 = xmm1[0,2,0,0]
+	movq	%xmm1, (%rdx)
+	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]

Just different instruction selection?

  	movq	%xmm0, 8(%rdx)
  	addq	$0, %rax
  	movq	%rax, 168(%rsp)         # 8-byte Spill

Cheers,
Tobias

-------------- next part --------------
; ModuleID = '/home/grosser/Projects/polly/test-suite/SingleSource/Regression/C/sumarraymalloc.c'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@.str = private unnamed_addr constant [11 x i8] c"Sum1 = %d\0A\00", align 1
@.str1 = private unnamed_addr constant [11 x i8] c"Sum2 = %d\0A\00", align 1

; Function Attrs: nounwind uwtable
define i32 @main(i32 %argc, i8** nocapture readonly %argv) #0 {
entry:
  %cmp = icmp slt i32 %argc, 2
  br i1 %cmp, label %cond.end, label %cond.false

cond.false:                                       ; preds = %entry
  %arrayidx = getelementptr inbounds i8** %argv, i64 1
  %0 = load i8** %arrayidx, align 8, !tbaa !1
  %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #2
  %conv.i = trunc i64 %call.i to i32
  br label %cond.end

cond.end:                                         ; preds = %entry, %cond.false
  %cond.reg2mem.0 = phi i32 [ 100, %entry ], [ %conv.i, %cond.false ]
  %conv = sext i32 %cond.reg2mem.0 to i64
  %mul = shl nsw i64 %conv, 2
  %call1 = tail call noalias i8* @malloc(i64 %mul) #2
  %1 = bitcast i8* %call1 to i32*
  %2 = icmp eq i32 %cond.reg2mem.0, 0
  br i1 %2, label %vector.body57.preheader, label %polly.cond3.i

polly.cond3.i:                                    ; preds = %cond.end
  %3 = zext i32 %cond.reg2mem.0 to i64
  %4 = icmp slt i32 %cond.reg2mem.0, 1
  %polly.adjust_ub16.i.pre = add nsw i64 %3, -2
  br i1 %4, label %polly.loop_header10.preheader.i, label %polly.loop_header.i.preheader

polly.loop_header.i.preheader:                    ; preds = %polly.cond3.i
  %5 = zext i32 %cond.reg2mem.0 to i64
  %6 = add nsw i64 %5, -1
  %7 = icmp sgt i64 %6, 0
  %smax = select i1 %7, i64 %6, i64 0
  %backedge.overflow = icmp eq i64 %smax, -1
  br i1 %backedge.overflow, label %polly.loop_header.i.preheader100, label %overflow.checked

polly.loop_header.i.preheader100:                 ; preds = %middle.block, %polly.loop_header.i.preheader
  %polly.indvar.i.ph = phi i64 [ 0, %polly.loop_header.i.preheader ], [ %resume.val, %middle.block ]
  %8 = zext i32 %cond.reg2mem.0 to i64
  %9 = add i64 %8, -1
  %10 = icmp sgt i64 %polly.indvar.i.ph, %9
  %smax122 = select i1 %10, i64 %polly.indvar.i.ph, i64 %9
  %11 = add i64 %smax122, 1
  %12 = sub i64 %11, %polly.indvar.i.ph
  %xtraiter123 = and i64 %12, 3
  %lcmp.mod124 = icmp ne i64 %xtraiter123, 0
  %lcmp.overflow125 = icmp eq i64 %12, 0
  %lcmp.or126 = or i1 %lcmp.overflow125, %lcmp.mod124
  br i1 %lcmp.or126, label %unr.cmp140, label %polly.loop_header.i.preheader100.split

unr.cmp140:                                       ; preds = %polly.loop_header.i.preheader100
  %un.tmp141 = icmp eq i64 %xtraiter123, 1
  br i1 %un.tmp141, label %polly.loop_header.i.unr134, label %unr.cmp132

unr.cmp132:                                       ; preds = %unr.cmp140
  %un.tmp133 = icmp eq i64 %xtraiter123, 2
  br i1 %un.tmp133, label %polly.loop_header.i.unr127, label %polly.loop_header.i.unr

polly.loop_header.i.unr:                          ; preds = %unr.cmp132
  %p_i.02.i.unr = trunc i64 %polly.indvar.i.ph to i32
  %p_arrayidx.i.unr = getelementptr i32* %1, i64 %polly.indvar.i.ph
  store i32 %p_i.02.i.unr, i32* %p_arrayidx.i.unr, align 4
  %polly.indvar_next.i.unr = add nsw i64 %polly.indvar.i.ph, 1
  %polly.loop_cond.i.unr = icmp sgt i64 %polly.indvar.i.ph, %polly.adjust_ub16.i.pre
  br label %polly.loop_header.i.unr127

polly.loop_header.i.unr127:                       ; preds = %unr.cmp132, %polly.loop_header.i.unr
  %polly.indvar.i.unr = phi i64 [ %polly.indvar_next.i.unr, %polly.loop_header.i.unr ], [ %polly.indvar.i.ph, %unr.cmp132 ]
  %p_i.02.i.unr128 = trunc i64 %polly.indvar.i.unr to i32
  %p_arrayidx.i.unr129 = getelementptr i32* %1, i64 %polly.indvar.i.unr
  store i32 %p_i.02.i.unr128, i32* %p_arrayidx.i.unr129, align 4
  %polly.indvar_next.i.unr130 = add nsw i64 %polly.indvar.i.unr, 1
  %polly.loop_cond.i.unr131 = icmp sgt i64 %polly.indvar.i.unr, %polly.adjust_ub16.i.pre
  br label %polly.loop_header.i.unr134

polly.loop_header.i.unr134:                       ; preds = %unr.cmp140, %polly.loop_header.i.unr127
  %polly.indvar.i.unr135 = phi i64 [ %polly.indvar_next.i.unr130, %polly.loop_header.i.unr127 ], [ %polly.indvar.i.ph, %unr.cmp140 ]
  %p_i.02.i.unr136 = trunc i64 %polly.indvar.i.unr135 to i32
  %p_arrayidx.i.unr137 = getelementptr i32* %1, i64 %polly.indvar.i.unr135
  store i32 %p_i.02.i.unr136, i32* %p_arrayidx.i.unr137, align 4
  %polly.indvar_next.i.unr138 = add nsw i64 %polly.indvar.i.unr135, 1
  %polly.loop_cond.i.unr139 = icmp sgt i64 %polly.indvar.i.unr135, %polly.adjust_ub16.i.pre
  br label %polly.loop_header.i.preheader100.split

polly.loop_header.i.preheader100.split:           ; preds = %polly.loop_header.i.unr134, %polly.loop_header.i.preheader100
  %polly.indvar.i.unr142 = phi i64 [ %polly.indvar.i.ph, %polly.loop_header.i.preheader100 ], [ %polly.indvar_next.i.unr138, %polly.loop_header.i.unr134 ]
  %13 = icmp ult i64 %12, 4
  br i1 %13, label %polly.cond6.i.loopexit, label %polly.loop_header.i.preheader100.split.split

polly.loop_header.i.preheader100.split.split:     ; preds = %polly.loop_header.i.preheader100.split
  br label %polly.loop_header.i

overflow.checked:                                 ; preds = %polly.loop_header.i.preheader
  %14 = add nsw i64 %smax, 1
  %end.idx = add nsw i64 %smax, 1
  %n.vec = and i64 %14, -4
  %cmp.zero = icmp eq i64 %n.vec, 0
  br i1 %cmp.zero, label %middle.block, label %vector.body.preheader

vector.body.preheader:                            ; preds = %overflow.checked
  %15 = zext i32 %cond.reg2mem.0 to i64
  %16 = add i64 %15, -1
  %17 = icmp sgt i64 %16, 0
  %smax143 = select i1 %17, i64 %16, i64 0
  %18 = add i64 %smax143, 1
  %19 = lshr i64 %18, 2
  %20 = mul i64 %19, 4
  %21 = add i64 %20, -4
  %22 = lshr i64 %21, 2
  %23 = add i64 %22, 1
  %xtraiter144 = and i64 %23, 1
  %lcmp.mod145 = icmp ne i64 %xtraiter144, 0
  %lcmp.overflow146 = icmp eq i64 %23, 0
  %lcmp.or147 = or i1 %lcmp.overflow146, %lcmp.mod145
  br i1 %lcmp.or147, label %vector.body.unr, label %vector.body.preheader.split

vector.body.unr:                                  ; preds = %vector.body.preheader
  %24 = trunc i64 0 to i32
  %broadcast.splatinsert18.unr = insertelement <2 x i32> undef, i32 %24, i32 0
  %broadcast.splat19.unr = shufflevector <2 x i32> %broadcast.splatinsert18.unr, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
  %induction20.unr = add <2 x i32> %broadcast.splat19.unr, <i32 0, i32 1>
  %induction21.unr = add <2 x i32> %broadcast.splat19.unr, <i32 2, i32 3>
  %25 = getelementptr i32* %1, i64 0
  %26 = bitcast i32* %25 to <2 x i32>*
  store <2 x i32> %induction20.unr, <2 x i32>* %26, align 4
  %.sum98.unr = or i64 0, 2
  %27 = getelementptr i32* %1, i64 %.sum98.unr
  %28 = bitcast i32* %27 to <2 x i32>*
  store <2 x i32> %induction21.unr, <2 x i32>* %28, align 4
  %index.next.unr = add i64 0, 4
  %29 = icmp eq i64 %index.next.unr, %n.vec
  br label %vector.body.preheader.split

vector.body.preheader.split:                      ; preds = %vector.body.unr, %vector.body.preheader
  %index.unr = phi i64 [ 0, %vector.body.preheader ], [ %index.next.unr, %vector.body.unr ]
  %30 = icmp ult i64 %23, 2
  br i1 %30, label %middle.block.loopexit, label %vector.body.preheader.split.split

vector.body.preheader.split.split:                ; preds = %vector.body.preheader.split
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.body.preheader.split.split
  %index = phi i64 [ %index.unr, %vector.body.preheader.split.split ], [ %index.next.1, %vector.body ]
  %31 = trunc i64 %index to i32
  %broadcast.splatinsert18 = insertelement <2 x i32> zeroinitializer, i32 %31, i32 0
  %broadcast.splat19 = shufflevector <2 x i32> %broadcast.splatinsert18, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
  %induction20 = add <2 x i32> %broadcast.splat19, <i32 0, i32 1>
  %induction21 = add <2 x i32> %broadcast.splat19, <i32 2, i32 3>
  %32 = getelementptr i32* %1, i64 %index
  %33 = bitcast i32* %32 to <2 x i32>*
  store <2 x i32> %induction20, <2 x i32>* %33, align 4
  %.sum98 = or i64 %index, 2
  %34 = getelementptr i32* %1, i64 %.sum98
  %35 = bitcast i32* %34 to <2 x i32>*
  store <2 x i32> %induction21, <2 x i32>* %35, align 4
  %index.next = add i64 %index, 4
  %36 = trunc i64 %index.next to i32
  %broadcast.splatinsert18.1 = insertelement <2 x i32> zeroinitializer, i32 %36, i32 0
  %broadcast.splat19.1 = shufflevector <2 x i32> %broadcast.splatinsert18.1, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
  %induction20.1 = add <2 x i32> %broadcast.splat19.1, <i32 0, i32 1>
  %induction21.1 = add <2 x i32> %broadcast.splat19.1, <i32 2, i32 3>
  %37 = getelementptr i32* %1, i64 %index.next
  %38 = bitcast i32* %37 to <2 x i32>*
  store <2 x i32> %induction20.1, <2 x i32>* %38, align 4
  %.sum98.1 = or i64 %index.next, 2
  %39 = getelementptr i32* %1, i64 %.sum98.1
  %40 = bitcast i32* %39 to <2 x i32>*
  store <2 x i32> %induction21.1, <2 x i32>* %40, align 4
  %index.next.1 = add i64 %index.next, 4
  %41 = icmp eq i64 %index.next.1, %n.vec
  br i1 %41, label %middle.block.loopexit.unr-lcssa, label %vector.body, !llvm.loop !5

middle.block.loopexit.unr-lcssa:                  ; preds = %vector.body
  br label %middle.block.loopexit

middle.block.loopexit:                            ; preds = %vector.body.preheader.split, %middle.block.loopexit.unr-lcssa
  br label %middle.block

middle.block:                                     ; preds = %middle.block.loopexit, %overflow.checked
  %resume.val = phi i64 [ 0, %overflow.checked ], [ %n.vec, %middle.block.loopexit ]
  %cmp.n = icmp eq i64 %end.idx, %resume.val
  br i1 %cmp.n, label %polly.cond6.i, label %polly.loop_header.i.preheader100

polly.cond6.i.loopexit.unr-lcssa:                 ; preds = %polly.loop_header.i
  br label %polly.cond6.i.loopexit

polly.cond6.i.loopexit:                           ; preds = %polly.loop_header.i.preheader100.split, %polly.cond6.i.loopexit.unr-lcssa
  br label %polly.cond6.i

polly.cond6.i:                                    ; preds = %polly.cond6.i.loopexit, %middle.block
  %42 = icmp sgt i32 %cond.reg2mem.0, -1
  br i1 %42, label %vector.body57.preheader, label %polly.loop_header10.preheader.i

polly.loop_header10.preheader.i:                  ; preds = %polly.cond3.i, %polly.cond6.i
  %43 = zext i32 %cond.reg2mem.0 to i64
  %44 = add nsw i64 %43, -1
  %45 = icmp sgt i64 %44, 0
  %smax24 = select i1 %45, i64 %44, i64 0
  %backedge.overflow25 = icmp eq i64 %smax24, -1
  br i1 %backedge.overflow25, label %polly.loop_header10.i.preheader, label %overflow.checked37

polly.loop_header10.i.preheader:                  ; preds = %middle.block29, %polly.loop_header10.preheader.i
  %polly.indvar14.i.ph = phi i64 [ 0, %polly.loop_header10.preheader.i ], [ %resume.val38, %middle.block29 ]
  %46 = zext i32 %cond.reg2mem.0 to i64
  %47 = add i64 %46, -1
  %48 = icmp sgt i64 %polly.indvar14.i.ph, %47
  %smax102 = select i1 %48, i64 %polly.indvar14.i.ph, i64 %47
  %49 = add i64 %smax102, 1
  %50 = sub i64 %49, %polly.indvar14.i.ph
  %xtraiter = and i64 %50, 3
  %lcmp.mod = icmp ne i64 %xtraiter, 0
  %lcmp.overflow = icmp eq i64 %50, 0
  %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod
  br i1 %lcmp.or, label %unr.cmp114, label %polly.loop_header10.i.preheader.split

unr.cmp114:                                       ; preds = %polly.loop_header10.i.preheader
  %un.tmp115 = icmp eq i64 %xtraiter, 1
  br i1 %un.tmp115, label %polly.loop_header10.i.unr108, label %unr.cmp

unr.cmp:                                          ; preds = %unr.cmp114
  %un.tmp = icmp eq i64 %xtraiter, 2
  br i1 %un.tmp, label %polly.loop_header10.i.unr103, label %polly.loop_header10.i.unr

polly.loop_header10.i.unr:                        ; preds = %unr.cmp
  %p_i.0220.i.unr = trunc i64 %polly.indvar14.i.ph to i32
  %p_arrayidx21.i.unr = getelementptr i32* %1, i64 %polly.indvar14.i.ph
  store i32 %p_i.0220.i.unr, i32* %p_arrayidx21.i.unr, align 4
  %polly.indvar_next15.i.unr = add nsw i64 %polly.indvar14.i.ph, 1
  %polly.loop_cond17.i.unr = icmp sgt i64 %polly.indvar14.i.ph, %polly.adjust_ub16.i.pre
  br label %polly.loop_header10.i.unr103

polly.loop_header10.i.unr103:                     ; preds = %unr.cmp, %polly.loop_header10.i.unr
  %polly.indvar14.i.unr = phi i64 [ %polly.indvar_next15.i.unr, %polly.loop_header10.i.unr ], [ %polly.indvar14.i.ph, %unr.cmp ]
  %p_i.0220.i.unr104 = trunc i64 %polly.indvar14.i.unr to i32
  %p_arrayidx21.i.unr105 = getelementptr i32* %1, i64 %polly.indvar14.i.unr
  store i32 %p_i.0220.i.unr104, i32* %p_arrayidx21.i.unr105, align 4
  %polly.indvar_next15.i.unr106 = add nsw i64 %polly.indvar14.i.unr, 1
  %polly.loop_cond17.i.unr107 = icmp sgt i64 %polly.indvar14.i.unr, %polly.adjust_ub16.i.pre
  br label %polly.loop_header10.i.unr108

polly.loop_header10.i.unr108:                     ; preds = %unr.cmp114, %polly.loop_header10.i.unr103
  %polly.indvar14.i.unr109 = phi i64 [ %polly.indvar_next15.i.unr106, %polly.loop_header10.i.unr103 ], [ %polly.indvar14.i.ph, %unr.cmp114 ]
  %p_i.0220.i.unr110 = trunc i64 %polly.indvar14.i.unr109 to i32
  %p_arrayidx21.i.unr111 = getelementptr i32* %1, i64 %polly.indvar14.i.unr109
  store i32 %p_i.0220.i.unr110, i32* %p_arrayidx21.i.unr111, align 4
  %polly.indvar_next15.i.unr112 = add nsw i64 %polly.indvar14.i.unr109, 1
  %polly.loop_cond17.i.unr113 = icmp sgt i64 %polly.indvar14.i.unr109, %polly.adjust_ub16.i.pre
  br label %polly.loop_header10.i.preheader.split

polly.loop_header10.i.preheader.split:            ; preds = %polly.loop_header10.i.unr108, %polly.loop_header10.i.preheader
  %polly.indvar14.i.unr116 = phi i64 [ %polly.indvar14.i.ph, %polly.loop_header10.i.preheader ], [ %polly.indvar_next15.i.unr112, %polly.loop_header10.i.unr108 ]
  %51 = icmp ult i64 %50, 4
  br i1 %51, label %vector.body57.preheader.loopexit, label %polly.loop_header10.i.preheader.split.split

polly.loop_header10.i.preheader.split.split:      ; preds = %polly.loop_header10.i.preheader.split
  br label %polly.loop_header10.i

overflow.checked37:                               ; preds = %polly.loop_header10.preheader.i
  %52 = add nsw i64 %smax24, 1
  %end.idx32 = add nsw i64 %smax24, 1
  %n.vec34 = and i64 %52, -4
  %cmp.zero36 = icmp eq i64 %n.vec34, 0
  br i1 %cmp.zero36, label %middle.block29, label %vector.body28.preheader

vector.body28.preheader:                          ; preds = %overflow.checked37
  %53 = zext i32 %cond.reg2mem.0 to i64
  %54 = add i64 %53, -1
  %55 = icmp sgt i64 %54, 0
  %smax117 = select i1 %55, i64 %54, i64 0
  %56 = add i64 %smax117, 1
  %57 = lshr i64 %56, 2
  %58 = mul i64 %57, 4
  %59 = add i64 %58, -4
  %60 = lshr i64 %59, 2
  %61 = add i64 %60, 1
  %xtraiter118 = and i64 %61, 1
  %lcmp.mod119 = icmp ne i64 %xtraiter118, 0
  %lcmp.overflow120 = icmp eq i64 %61, 0
  %lcmp.or121 = or i1 %lcmp.overflow120, %lcmp.mod119
  br i1 %lcmp.or121, label %vector.body28.unr, label %vector.body28.preheader.split

vector.body28.unr:                                ; preds = %vector.body28.preheader
  %62 = trunc i64 0 to i32
  %broadcast.splatinsert48.unr = insertelement <2 x i32> zeroinitializer, i32 %62, i32 0
  %broadcast.splat49.unr = shufflevector <2 x i32> %broadcast.splatinsert48.unr, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
  %induction50.unr = add <2 x i32> %broadcast.splat49.unr, <i32 0, i32 1>
  %induction51.unr = add <2 x i32> %broadcast.splat49.unr, <i32 2, i32 3>
  %63 = getelementptr i32* %1, i64 0
  %64 = bitcast i32* %63 to <2 x i32>*
  store <2 x i32> %induction50.unr, <2 x i32>* %64, align 4
  %.sum99.unr = or i64 0, 2
  %65 = getelementptr i32* %1, i64 %.sum99.unr
  %66 = bitcast i32* %65 to <2 x i32>*
  store <2 x i32> %induction51.unr, <2 x i32>* %66, align 4
  %index.next43.unr = add i64 0, 4
  %67 = icmp eq i64 %index.next43.unr, %n.vec34
  br label %vector.body28.preheader.split

vector.body28.preheader.split:                    ; preds = %vector.body28.unr, %vector.body28.preheader
  %index31.unr = phi i64 [ 0, %vector.body28.preheader ], [ %index.next43.unr, %vector.body28.unr ]
  %68 = icmp ult i64 %61, 2
  br i1 %68, label %middle.block29.loopexit, label %vector.body28.preheader.split.split

vector.body28.preheader.split.split:              ; preds = %vector.body28.preheader.split
  br label %vector.body28

vector.body28:                                    ; preds = %vector.body28, %vector.body28.preheader.split.split
  %index31 = phi i64 [ %index31.unr, %vector.body28.preheader.split.split ], [ %index.next43.1, %vector.body28 ]
  %69 = trunc i64 %index31 to i32
  %broadcast.splatinsert48 = insertelement <2 x i32> zeroinitializer, i32 %69, i32 0
  %broadcast.splat49 = shufflevector <2 x i32> %broadcast.splatinsert48, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
  %induction50 = add <2 x i32> %broadcast.splat49, <i32 0, i32 1>
  %induction51 = add <2 x i32> %broadcast.splat49, <i32 2, i32 3>
  %70 = getelementptr i32* %1, i64 %index31
  %71 = bitcast i32* %70 to <2 x i32>*
  store <2 x i32> %induction50, <2 x i32>* %71, align 4
  %.sum99 = or i64 %index31, 2
  %72 = getelementptr i32* %1, i64 %.sum99
  %73 = bitcast i32* %72 to <2 x i32>*
  store <2 x i32> %induction51, <2 x i32>* %73, align 4
  %index.next43 = add i64 %index31, 4
  %74 = trunc i64 %index.next43 to i32
  %broadcast.splatinsert48.1 = insertelement <2 x i32> zeroinitializer, i32 %74, i32 0
  %broadcast.splat49.1 = shufflevector <2 x i32> %broadcast.splatinsert48.1, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
  %induction50.1 = add <2 x i32> %broadcast.splat49.1, <i32 0, i32 1>
  %induction51.1 = add <2 x i32> %broadcast.splat49.1, <i32 2, i32 3>
  %75 = getelementptr i32* %1, i64 %index.next43
  %76 = bitcast i32* %75 to <2 x i32>*
  store <2 x i32> %induction50.1, <2 x i32>* %76, align 4
  %.sum99.1 = or i64 %index.next43, 2
  %77 = getelementptr i32* %1, i64 %.sum99.1
  %78 = bitcast i32* %77 to <2 x i32>*
  store <2 x i32> %induction51.1, <2 x i32>* %78, align 4
  %index.next43.1 = add i64 %index.next43, 4
  %79 = icmp eq i64 %index.next43.1, %n.vec34
  br i1 %79, label %middle.block29.loopexit.unr-lcssa, label %vector.body28, !llvm.loop !8

middle.block29.loopexit.unr-lcssa:                ; preds = %vector.body28
  br label %middle.block29.loopexit

middle.block29.loopexit:                          ; preds = %vector.body28.preheader.split, %middle.block29.loopexit.unr-lcssa
  br label %middle.block29

middle.block29:                                   ; preds = %middle.block29.loopexit, %overflow.checked37
  %resume.val38 = phi i64 [ 0, %overflow.checked37 ], [ %n.vec34, %middle.block29.loopexit ]
  %cmp.n42 = icmp eq i64 %end.idx32, %resume.val38
  br i1 %cmp.n42, label %vector.body57.preheader, label %polly.loop_header10.i.preheader

polly.loop_header.i:                              ; preds = %polly.loop_header.i, %polly.loop_header.i.preheader100.split.split
  %polly.indvar.i = phi i64 [ %polly.indvar.i.unr142, %polly.loop_header.i.preheader100.split.split ], [ %polly.indvar_next.i.3, %polly.loop_header.i ]
  %p_i.02.i = trunc i64 %polly.indvar.i to i32
  %p_arrayidx.i = getelementptr i32* %1, i64 %polly.indvar.i
  store i32 %p_i.02.i, i32* %p_arrayidx.i, align 4
  %polly.indvar_next.i = add nsw i64 %polly.indvar.i, 1
  %p_i.02.i.1 = trunc i64 %polly.indvar_next.i to i32
  %p_arrayidx.i.1 = getelementptr i32* %1, i64 %polly.indvar_next.i
  store i32 %p_i.02.i.1, i32* %p_arrayidx.i.1, align 4
  %polly.indvar_next.i.1 = add nsw i64 %polly.indvar_next.i, 1
  %p_i.02.i.2 = trunc i64 %polly.indvar_next.i.1 to i32
  %p_arrayidx.i.2 = getelementptr i32* %1, i64 %polly.indvar_next.i.1
  store i32 %p_i.02.i.2, i32* %p_arrayidx.i.2, align 4
  %polly.indvar_next.i.2 = add nsw i64 %polly.indvar_next.i.1, 1
  %p_i.02.i.3 = trunc i64 %polly.indvar_next.i.2 to i32
  %p_arrayidx.i.3 = getelementptr i32* %1, i64 %polly.indvar_next.i.2
  store i32 %p_i.02.i.3, i32* %p_arrayidx.i.3, align 4
  %polly.indvar_next.i.3 = add nsw i64 %polly.indvar_next.i.2, 1
  %polly.loop_cond.i.3 = icmp sgt i64 %polly.indvar_next.i.2, %polly.adjust_ub16.i.pre
  br i1 %polly.loop_cond.i.3, label %polly.cond6.i.loopexit.unr-lcssa, label %polly.loop_header.i, !llvm.loop !9

polly.loop_header10.i:                            ; preds = %polly.loop_header10.i, %polly.loop_header10.i.preheader.split.split
  %polly.indvar14.i = phi i64 [ %polly.indvar14.i.unr116, %polly.loop_header10.i.preheader.split.split ], [ %polly.indvar_next15.i.3, %polly.loop_header10.i ]
  %p_i.0220.i = trunc i64 %polly.indvar14.i to i32
  %p_arrayidx21.i = getelementptr i32* %1, i64 %polly.indvar14.i
  store i32 %p_i.0220.i, i32* %p_arrayidx21.i, align 4
  %polly.indvar_next15.i = add nsw i64 %polly.indvar14.i, 1
  %p_i.0220.i.1 = trunc i64 %polly.indvar_next15.i to i32
  %p_arrayidx21.i.1 = getelementptr i32* %1, i64 %polly.indvar_next15.i
  store i32 %p_i.0220.i.1, i32* %p_arrayidx21.i.1, align 4
  %polly.indvar_next15.i.1 = add nsw i64 %polly.indvar_next15.i, 1
  %p_i.0220.i.2 = trunc i64 %polly.indvar_next15.i.1 to i32
  %p_arrayidx21.i.2 = getelementptr i32* %1, i64 %polly.indvar_next15.i.1
  store i32 %p_i.0220.i.2, i32* %p_arrayidx21.i.2, align 4
  %polly.indvar_next15.i.2 = add nsw i64 %polly.indvar_next15.i.1, 1
  %p_i.0220.i.3 = trunc i64 %polly.indvar_next15.i.2 to i32
  %p_arrayidx21.i.3 = getelementptr i32* %1, i64 %polly.indvar_next15.i.2
  store i32 %p_i.0220.i.3, i32* %p_arrayidx21.i.3, align 4
  %polly.indvar_next15.i.3 = add nsw i64 %polly.indvar_next15.i.2, 1
  %polly.loop_cond17.i.3 = icmp sgt i64 %polly.indvar_next15.i.2, %polly.adjust_ub16.i.pre
  br i1 %polly.loop_cond17.i.3, label %vector.body57.preheader.loopexit.unr-lcssa, label %polly.loop_header10.i, !llvm.loop !10

vector.body57.preheader.loopexit.unr-lcssa:       ; preds = %polly.loop_header10.i
  br label %vector.body57.preheader.loopexit

vector.body57.preheader.loopexit:                 ; preds = %polly.loop_header10.i.preheader.split, %vector.body57.preheader.loopexit.unr-lcssa
  br label %vector.body57.preheader

vector.body57.preheader:                          ; preds = %vector.body57.preheader.loopexit, %middle.block29, %cond.end, %polly.cond6.i
  br label %vector.body57

vector.body57:                                    ; preds = %vector.body57.preheader
  %80 = bitcast i32* %1 to <4 x i32>*
  %wide.load = load <4 x i32>* %80, align 4
  %81 = getelementptr i32* %1, i64 4
  %82 = bitcast i32* %81 to <4 x i32>*
  %wide.load.1 = load <4 x i32>* %82, align 4
  %83 = add nsw <4 x i32> %wide.load.1, %wide.load
  %84 = getelementptr i32* %1, i64 8
  %85 = bitcast i32* %84 to <4 x i32>*
  %wide.load.2 = load <4 x i32>* %85, align 4
  %86 = add nsw <4 x i32> %wide.load.2, %83
  %87 = getelementptr i32* %1, i64 12
  %88 = bitcast i32* %87 to <4 x i32>*
  %wide.load.3 = load <4 x i32>* %88, align 4
  %89 = add nsw <4 x i32> %wide.load.3, %86
  %90 = getelementptr i32* %1, i64 16
  %91 = bitcast i32* %90 to <4 x i32>*
  %wide.load.4 = load <4 x i32>* %91, align 4
  %92 = add nsw <4 x i32> %wide.load.4, %89
  %93 = getelementptr i32* %1, i64 20
  %94 = bitcast i32* %93 to <4 x i32>*
  %wide.load.5 = load <4 x i32>* %94, align 4
  %95 = add nsw <4 x i32> %wide.load.5, %92
  %96 = getelementptr i32* %1, i64 24
  %97 = bitcast i32* %96 to <4 x i32>*
  %wide.load.6 = load <4 x i32>* %97, align 4
  %98 = add nsw <4 x i32> %wide.load.6, %95
  %99 = getelementptr i32* %1, i64 28
  %100 = bitcast i32* %99 to <4 x i32>*
  %wide.load.7 = load <4 x i32>* %100, align 4
  %101 = add nsw <4 x i32> %wide.load.7, %98
  %102 = getelementptr i32* %1, i64 32
  %103 = bitcast i32* %102 to <4 x i32>*
  %wide.load.8 = load <4 x i32>* %103, align 4
  %104 = add nsw <4 x i32> %wide.load.8, %101
  %105 = getelementptr i32* %1, i64 36
  %106 = bitcast i32* %105 to <4 x i32>*
  %wide.load.9 = load <4 x i32>* %106, align 4
  %107 = add nsw <4 x i32> %wide.load.9, %104
  %108 = getelementptr i32* %1, i64 40
  %109 = bitcast i32* %108 to <4 x i32>*
  %wide.load.10 = load <4 x i32>* %109, align 4
  %110 = add nsw <4 x i32> %wide.load.10, %107
  %111 = getelementptr i32* %1, i64 44
  %112 = bitcast i32* %111 to <4 x i32>*
  %wide.load.11 = load <4 x i32>* %112, align 4
  %113 = add nsw <4 x i32> %wide.load.11, %110
  %114 = getelementptr i32* %1, i64 48
  %115 = bitcast i32* %114 to <4 x i32>*
  %wide.load.12 = load <4 x i32>* %115, align 4
  %116 = add nsw <4 x i32> %wide.load.12, %113
  %117 = getelementptr i32* %1, i64 52
  %118 = bitcast i32* %117 to <4 x i32>*
  %wide.load.13 = load <4 x i32>* %118, align 4
  %119 = add nsw <4 x i32> %wide.load.13, %116
  %120 = getelementptr i32* %1, i64 56
  %121 = bitcast i32* %120 to <4 x i32>*
  %wide.load.14 = load <4 x i32>* %121, align 4
  %122 = add nsw <4 x i32> %wide.load.14, %119
  %123 = getelementptr i32* %1, i64 60
  %124 = bitcast i32* %123 to <4 x i32>*
  %wide.load.15 = load <4 x i32>* %124, align 4
  %125 = add nsw <4 x i32> %wide.load.15, %122
  %126 = getelementptr i32* %1, i64 64
  %127 = bitcast i32* %126 to <4 x i32>*
  %wide.load.16 = load <4 x i32>* %127, align 4
  %128 = add nsw <4 x i32> %wide.load.16, %125
  %129 = getelementptr i32* %1, i64 68
  %130 = bitcast i32* %129 to <4 x i32>*
  %wide.load.17 = load <4 x i32>* %130, align 4
  %131 = add nsw <4 x i32> %wide.load.17, %128
  %132 = getelementptr i32* %1, i64 72
  %133 = bitcast i32* %132 to <4 x i32>*
  %wide.load.18 = load <4 x i32>* %133, align 4
  %134 = add nsw <4 x i32> %wide.load.18, %131
  %135 = getelementptr i32* %1, i64 76
  %136 = bitcast i32* %135 to <4 x i32>*
  %wide.load.19 = load <4 x i32>* %136, align 4
  %137 = add nsw <4 x i32> %wide.load.19, %134
  %138 = getelementptr i32* %1, i64 80
  %139 = bitcast i32* %138 to <4 x i32>*
  %wide.load.20 = load <4 x i32>* %139, align 4
  %140 = add nsw <4 x i32> %wide.load.20, %137
  %141 = getelementptr i32* %1, i64 84
  %142 = bitcast i32* %141 to <4 x i32>*
  %wide.load.21 = load <4 x i32>* %142, align 4
  %143 = add nsw <4 x i32> %wide.load.21, %140
  %144 = getelementptr i32* %1, i64 88
  %145 = bitcast i32* %144 to <4 x i32>*
  %wide.load.22 = load <4 x i32>* %145, align 4
  %146 = add nsw <4 x i32> %wide.load.22, %143
  %147 = getelementptr i32* %1, i64 92
  %148 = bitcast i32* %147 to <4 x i32>*
  %wide.load.23 = load <4 x i32>* %148, align 4
  %149 = add nsw <4 x i32> %wide.load.23, %146
  %150 = getelementptr i32* %1, i64 96
  %151 = bitcast i32* %150 to <4 x i32>*
  %wide.load.24 = load <4 x i32>* %151, align 4
  %152 = add nsw <4 x i32> %wide.load.24, %149
  %rdx.shuf = shufflevector <4 x i32> %152, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 0, i32 0>
  %bin.rdx = add <4 x i32> %152, %rdx.shuf
  %rdx.shuf71 = shufflevector <4 x i32> %bin.rdx, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %bin.rdx72 = add <4 x i32> %bin.rdx, %rdx.shuf71
  %153 = extractelement <4 x i32> %bin.rdx72, i32 0
  %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i64 0, i64 0), i32 %153) #2
  br label %vector.body76

vector.body76:                                    ; preds = %vector.body57
  %154 = bitcast i32* %1 to <4 x i32>*
  %wide.load91 = load <4 x i32>* %154, align 4
  %155 = getelementptr i32* %1, i64 4
  %156 = bitcast i32* %155 to <4 x i32>*
  %wide.load91.1 = load <4 x i32>* %156, align 4
  %157 = add nsw <4 x i32> %wide.load91.1, %wide.load91
  %158 = getelementptr i32* %1, i64 8
  %159 = bitcast i32* %158 to <4 x i32>*
  %wide.load91.2 = load <4 x i32>* %159, align 4
  %160 = add nsw <4 x i32> %wide.load91.2, %157
  %161 = getelementptr i32* %1, i64 12
  %162 = bitcast i32* %161 to <4 x i32>*
  %wide.load91.3 = load <4 x i32>* %162, align 4
  %163 = add nsw <4 x i32> %wide.load91.3, %160
  %164 = getelementptr i32* %1, i64 16
  %165 = bitcast i32* %164 to <4 x i32>*
  %wide.load91.4 = load <4 x i32>* %165, align 4
  %166 = add nsw <4 x i32> %wide.load91.4, %163
  %167 = getelementptr i32* %1, i64 20
  %168 = bitcast i32* %167 to <4 x i32>*
  %wide.load91.5 = load <4 x i32>* %168, align 4
  %169 = add nsw <4 x i32> %wide.load91.5, %166
  %170 = getelementptr i32* %1, i64 24
  %171 = bitcast i32* %170 to <4 x i32>*
  %wide.load91.6 = load <4 x i32>* %171, align 4
  %172 = add nsw <4 x i32> %wide.load91.6, %169
  %173 = getelementptr i32* %1, i64 28
  %174 = bitcast i32* %173 to <4 x i32>*
  %wide.load91.7 = load <4 x i32>* %174, align 4
  %175 = add nsw <4 x i32> %wide.load91.7, %172
  %176 = getelementptr i32* %1, i64 32
  %177 = bitcast i32* %176 to <4 x i32>*
  %wide.load91.8 = load <4 x i32>* %177, align 4
  %178 = add nsw <4 x i32> %wide.load91.8, %175
  %179 = getelementptr i32* %1, i64 36
  %180 = bitcast i32* %179 to <4 x i32>*
  %wide.load91.9 = load <4 x i32>* %180, align 4
  %181 = add nsw <4 x i32> %wide.load91.9, %178
  %182 = getelementptr i32* %1, i64 40
  %183 = bitcast i32* %182 to <4 x i32>*
  %wide.load91.10 = load <4 x i32>* %183, align 4
  %184 = add nsw <4 x i32> %wide.load91.10, %181
  %185 = getelementptr i32* %1, i64 44
  %186 = bitcast i32* %185 to <4 x i32>*
  %wide.load91.11 = load <4 x i32>* %186, align 4
  %187 = add nsw <4 x i32> %wide.load91.11, %184
  %188 = getelementptr i32* %1, i64 48
  %189 = bitcast i32* %188 to <4 x i32>*
  %wide.load91.12 = load <4 x i32>* %189, align 4
  %190 = add nsw <4 x i32> %wide.load91.12, %187
  %191 = getelementptr i32* %1, i64 52
  %192 = bitcast i32* %191 to <4 x i32>*
  %wide.load91.13 = load <4 x i32>* %192, align 4
  %193 = add nsw <4 x i32> %wide.load91.13, %190
  %194 = getelementptr i32* %1, i64 56
  %195 = bitcast i32* %194 to <4 x i32>*
  %wide.load91.14 = load <4 x i32>* %195, align 4
  %196 = add nsw <4 x i32> %wide.load91.14, %193
  %197 = getelementptr i32* %1, i64 60
  %198 = bitcast i32* %197 to <4 x i32>*
  %wide.load91.15 = load <4 x i32>* %198, align 4
  %199 = add nsw <4 x i32> %wide.load91.15, %196
  %200 = getelementptr i32* %1, i64 64
  %201 = bitcast i32* %200 to <4 x i32>*
  %wide.load91.16 = load <4 x i32>* %201, align 4
  %202 = add nsw <4 x i32> %wide.load91.16, %199
  %203 = getelementptr i32* %1, i64 68
  %204 = bitcast i32* %203 to <4 x i32>*
  %wide.load91.17 = load <4 x i32>* %204, align 4
  %205 = add nsw <4 x i32> %wide.load91.17, %202
  %206 = getelementptr i32* %1, i64 72
  %207 = bitcast i32* %206 to <4 x i32>*
  %wide.load91.18 = load <4 x i32>* %207, align 4
  %208 = add nsw <4 x i32> %wide.load91.18, %205
  %209 = getelementptr i32* %1, i64 76
  %210 = bitcast i32* %209 to <4 x i32>*
  %wide.load91.19 = load <4 x i32>* %210, align 4
  %211 = add nsw <4 x i32> %wide.load91.19, %208
  %212 = getelementptr i32* %1, i64 80
  %213 = bitcast i32* %212 to <4 x i32>*
  %wide.load91.20 = load <4 x i32>* %213, align 4
  %214 = add nsw <4 x i32> %wide.load91.20, %211
  %215 = getelementptr i32* %1, i64 84
  %216 = bitcast i32* %215 to <4 x i32>*
  %wide.load91.21 = load <4 x i32>* %216, align 4
  %217 = add nsw <4 x i32> %wide.load91.21, %214
  %218 = getelementptr i32* %1, i64 88
  %219 = bitcast i32* %218 to <4 x i32>*
  %wide.load91.22 = load <4 x i32>* %219, align 4
  %220 = add nsw <4 x i32> %wide.load91.22, %217
  %221 = getelementptr i32* %1, i64 92
  %222 = bitcast i32* %221 to <4 x i32>*
  %wide.load91.23 = load <4 x i32>* %222, align 4
  %223 = add nsw <4 x i32> %wide.load91.23, %220
  %224 = getelementptr i32* %1, i64 96
  %225 = bitcast i32* %224 to <4 x i32>*
  %wide.load91.24 = load <4 x i32>* %225, align 4
  %226 = add nsw <4 x i32> %wide.load91.24, %223
  %rdx.shuf93 = shufflevector <4 x i32> %226, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 0, i32 0>
  %bin.rdx94 = add <4 x i32> %226, %rdx.shuf93
  %rdx.shuf95 = shufflevector <4 x i32> %bin.rdx94, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %bin.rdx96 = add <4 x i32> %bin.rdx94, %rdx.shuf95
  %227 = extractelement <4 x i32> %bin.rdx96, i32 0
  %call5 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str1, i64 0, i64 0), i32 %227) #2
  tail call void @free(i8* %call1) #2
  ret i32 0
}

; External routines referenced by this module.  All four declarations
; carry attribute group #1 (defined below).
; Function Attrs: nounwind
declare noalias i8* @malloc(i64) #1

; Function Attrs: nounwind
declare i32 @printf(i8* nocapture readonly, ...) #1

; Function Attrs: nounwind
declare void @free(i8* nocapture) #1

; Function Attrs: nounwind
declare i64 @strtol(i8* readonly, i8** nocapture, i32) #1

; Attribute groups.  #1 is the group used by the declarations above and
; #2 (nounwind only) is the group used on the printf/free call sites.
; #0 differs from #1 only by the extra 'uwtable' attribute; it is
; presumably attached to a function definition earlier in the module
; (not verified here).
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }

; Producer identification string (node !0).
!llvm.ident = !{!0}

; !0        : producer string referenced by !llvm.ident.
; !1 - !4   : type-based alias analysis nodes ("any pointer" ->
;             "omnipotent char" -> "Simple C/C++ TBAA").
; !5, !8, !9, !10 : loop metadata nodes; each lists itself as its first
;             operand, followed by !6 and !7.
; !6 / !7   : llvm.loop.vectorize.width = 1 and
;             llvm.loop.vectorize.unroll = 1.
!0 = metadata !{metadata !"clang version 3.5.0 "}
!1 = metadata !{metadata !2, metadata !2, i64 0}
!2 = metadata !{metadata !"any pointer", metadata !3, i64 0}
!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
!4 = metadata !{metadata !"Simple C/C++ TBAA"}
!5 = metadata !{metadata !5, metadata !6, metadata !7}
!6 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
!7 = metadata !{metadata !"llvm.loop.vectorize.unroll", i32 1}
!8 = metadata !{metadata !8, metadata !6, metadata !7}
!9 = metadata !{metadata !9, metadata !6, metadata !7}
!10 = metadata !{metadata !10, metadata !6, metadata !7}
-------------- next part --------------
	.text
	.file	"out.ll"
# Constant pool: one 16-byte mergeable entry holding the two quadwords
# 2 and 3.  main loads it with movaps .LCPI0_0(%rip), so the 16-byte
# alignment below is required.
	.section	.rodata.cst16,"aM",@progbits,16
	.align	16
.LCPI0_0:
	.quad	2                       # 0x2
	.quad	3                       # 0x3
# Back to code: export main as a function symbol, entry aligned to 16
# bytes with 0x90 padding.
	.text
	.globl	main
	.align	16, 0x90
	.type	main,@function
# ----------------------------------------------------------------------
# main -- x86-64, AT&T syntax, compiler output for @main of out.ll.
#
# Reads %edi and %rsi on entry (presumably argc/argv -- the code only
# shows that the pointer at 8(%rsi) is passed to strtol).  Always
# returns 0 in %eax.
#
# Frame: subq $328 reserves the spill area.  Values are spilled to fixed
# slots and reloaded at the top of the blocks that use them.  Key slots:
#   slot 320(%rsp)      incoming %rsi
#   slot 316/312(%rsp)  element count n (100, or the strtol result)
#   slot 304/296(%rsp)  pointer returned by malloc (the i32 buffer)
#   slot 288(%rsp)      zero-extended n, minus 2 (scalar loop bound)
#
# Outline:
#   - choose n, call malloc(n * 4)
#   - store buf[i] = i; two fill regions exist, one entered when
#     n >= 1 (up to .LBB0_24 plus the loop at .LBB0_43) and one for the
#     remaining cases (.LBB0_25 .. .LBB0_46)
#   - .LBB0_48: sum the 100 i32 values at buf[0..99], printf(.L.str)
#   - BB#49: the same summation again, printf(.L.str1), free, return 0
# ----------------------------------------------------------------------
main:                                   # @main
	.cfi_startproc
# BB#0:                                 # %entry
	subq	$328, %rsp              # imm = 0x148
.Ltmp0:
	.cfi_def_cfa_offset 336
# Default n = 100.  If %edi < 2 (signed) keep the default and skip strtol.
	movl	$100, %eax
	cmpl	$2, %edi
	movq	%rsi, 320(%rsp)         # 8-byte Spill
	movl	%eax, 316(%rsp)         # 4-byte Spill
	jl	.LBB0_2
# Otherwise n = low 32 bits of strtol(<pointer at 8(saved %rsi)>, 0, 10).
# BB#1:                                 # %cond.false
	movq	320(%rsp), %rax         # 8-byte Reload
	movq	8(%rax), %rdi
	xorl	%ecx, %ecx
	movl	%ecx, %esi
	movl	$10, %edx
	callq	strtol
	movl	%eax, %ecx
	movl	%ecx, 316(%rsp)         # 4-byte Spill
# Allocate sign-extended n * 4 bytes.  If n == 0 no element is written
# and control goes straight to the summation (.LBB0_47).
.LBB0_2:                                # %cond.end
	movl	316(%rsp), %eax         # 4-byte Reload
	movslq	%eax, %rcx
	shlq	$2, %rcx
	movq	%rcx, %rdi
	movl	%eax, 312(%rsp)         # 4-byte Spill
	callq	malloc
	movq	%rax, %rcx
	movl	312(%rsp), %edx         # 4-byte Reload
	cmpl	$0, %edx
	movq	%rax, 304(%rsp)         # 8-byte Spill
	movq	%rcx, 296(%rsp)         # 8-byte Spill
	je	.LBB0_47
# Slot 288 = zext(n) - 2.  If n < 1 (signed) use the second fill region.
# BB#3:                                 # %polly.cond3.i
	movl	312(%rsp), %eax         # 4-byte Reload
	movl	%eax, %ecx
	movl	%ecx, %edx
	addq	$-2, %rdx
	cmpl	$1, %eax
	movq	%rdx, 288(%rsp)         # 8-byte Spill
	jl	.LBB0_25
# First fill region.  %rdi = (zext(n) - 1 > 0) ? zext(n) - 1 : 0, saved in
# slot 280.  Unless that value is -1 the vectorized path (.LBB0_13) is
# tried first; slot 272 holds the start index for the scalar path (0).
# BB#4:                                 # %polly.loop_header.i.preheader
	movabsq	$0, %rax
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	$0, %rsi
	movq	%rax, %rdi
	cmovgq	%rsi, %rdi
	cmpq	$-1, %rdi
	movq	%rdi, 280(%rsp)         # 8-byte Spill
	movq	%rax, 272(%rsp)         # 8-byte Spill
	jne	.LBB0_13
# Scalar path setup.  start = slot 272; trip count (slot 256) =
# max(zext(n) - 1, start) + 1 - start (signed max).  When the trip count
# is zero or not a multiple of 4, single stores are peeled first:
# remainder 1 -> .LBB0_10, remainder 2 -> .LBB0_9, otherwise BB#8.
.LBB0_5:                                # %polly.loop_header.i.preheader100
	movq	272(%rsp), %rax         # 8-byte Reload
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	%rsi, %rax
	cmovgq	%rax, %rsi
	addq	$1, %rsi
	subq	%rax, %rsi
	movq	%rsi, %rdi
	andq	$3, %rdi
	cmpq	$0, %rdi
	setne	%r8b
	cmpq	$0, %rsi
	sete	%r9b
	orb	%r8b, %r9b
	testb	$1, %r9b
	movq	%rax, %r10
	movq	%rax, 264(%rsp)         # 8-byte Spill
	movq	%rsi, 256(%rsp)         # 8-byte Spill
	movq	%rdi, 248(%rsp)         # 8-byte Spill
	movq	%r10, 240(%rsp)         # 8-byte Spill
	jne	.LBB0_6
	jmp	.LBB0_11
.LBB0_6:                                # %unr.cmp140
	movq	248(%rsp), %rax         # 8-byte Reload
	cmpq	$1, %rax
	movq	264(%rsp), %rcx         # 8-byte Reload
	movq	%rcx, 232(%rsp)         # 8-byte Spill
	je	.LBB0_10
# BB#7:                                 # %unr.cmp132
	movq	248(%rsp), %rax         # 8-byte Reload
	cmpq	$2, %rax
	movq	264(%rsp), %rcx         # 8-byte Reload
	movq	%rcx, 224(%rsp)         # 8-byte Spill
	je	.LBB0_9
# Each peeled block below stores the low 32 bits of index i at buf + 4*i
# and passes i + 1 on through a spill slot.
# BB#8:                                 # %polly.loop_header.i.unr
	movq	264(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 224(%rsp)         # 8-byte Spill
.LBB0_9:                                # %polly.loop_header.i.unr127
	movq	224(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 232(%rsp)         # 8-byte Spill
.LBB0_10:                               # %polly.loop_header.i.unr134
	movq	232(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 240(%rsp)         # 8-byte Spill
# If the trip count is below 4 (unsigned) the peeled stores were all the
# work; otherwise run the 4x unrolled scalar loop at .LBB0_43.
.LBB0_11:                               # %polly.loop_header.i.preheader100.split
	movq	240(%rsp), %rax         # 8-byte Reload
	movq	256(%rsp), %rcx         # 8-byte Reload
	cmpq	$4, %rcx
	movq	%rax, 216(%rsp)         # 8-byte Spill
	jb	.LBB0_23
# BB#12:                                # %polly.loop_header.i.preheader100.split.split
	movq	216(%rsp), %rax         # 8-byte Reload
	movq	%rax, 208(%rsp)         # 8-byte Spill
	jmp	.LBB0_43
# Vector path setup.  Slot 192 = slot 280 + 1 (total element count);
# slot 200 = that count rounded down to a multiple of 4 (count covered by
# vector code).  If the rounded count is 0, skip to the middle block.
.LBB0_13:                               # %overflow.checked
	movabsq	$0, %rax
	movq	280(%rsp), %rcx         # 8-byte Reload
	addq	$1, %rcx
	movq	280(%rsp), %rdx         # 8-byte Reload
	addq	$1, %rdx
	andq	$-4, %rcx
	cmpq	$0, %rcx
	movq	%rcx, 200(%rsp)         # 8-byte Spill
	movq	%rdx, 192(%rsp)         # 8-byte Spill
	movq	%rax, 184(%rsp)         # 8-byte Spill
	je	.LBB0_21
# Slot 176 = number of 4-element vector steps.  The vector loop is
# unrolled by two, so when that number is odd (or zero) one step is
# peeled in .LBB0_15 first.
# BB#14:                                # %vector.body.preheader
	movabsq	$0, %rax
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	$0, %rsi
	movq	%rax, %rdi
	cmovgq	%rsi, %rdi
	addq	$1, %rdi
	shrq	$2, %rdi
	shlq	$2, %rdi
	addq	$-4, %rdi
	shrq	$2, %rdi
	addq	$1, %rdi
	movq	%rdi, %rsi
	andq	$1, %rsi
	cmpq	$0, %rsi
	setne	%r8b
	cmpq	$0, %rdi
	sete	%r9b
	orb	%r8b, %r9b
	testb	$1, %r9b
	movq	%rdi, 176(%rsp)         # 8-byte Spill
	movq	%rax, 168(%rsp)         # 8-byte Spill
	jne	.LBB0_15
	jmp	.LBB0_16
# Peeled vector step for index 0.  %xmm0 is built as the quadword pair
# <0, 1> (1 moved in, then shifted up 8 bytes) and %xmm1 is loaded with
# <2, 3>.  Each pshufd $8 packs the low dword of both quadwords into the
# low 8 bytes, so the two movq stores write the i32 values 0,1 at buf+0
# and 2,3 at buf+8.  The next index (4) is passed on in slot 168.
.LBB0_15:                               # %vector.body.unr
	movabsq	$4, %rax
	movl	$1, %ecx
	movl	%ecx, %edx
	movd	%rdx, %xmm0
	pslldq	$8, %xmm0
	movaps	.LCPI0_0(%rip), %xmm1
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	296(%rsp), %rdx         # 8-byte Reload
	movq	%xmm0, (%rdx)
	pshufd	$8, %xmm1, %xmm0        # xmm0 = xmm1[0,2,0,0]
	movq	%xmm0, 8(%rdx)
	addq	$0, %rax
	movq	%rax, 168(%rsp)         # 8-byte Spill
# With fewer than 2 vector steps (unsigned) the unrolled loop is skipped.
.LBB0_16:                               # %vector.body.preheader.split
	movq	168(%rsp), %rax         # 8-byte Reload
	movq	176(%rsp), %rcx         # 8-byte Reload
	cmpq	$2, %rcx
	movq	%rax, 160(%rsp)         # 8-byte Spill
	jb	.LBB0_20
# BB#17:                                # %vector.body.preheader.split.split
	movq	160(%rsp), %rax         # 8-byte Reload
	movq	%rax, 152(%rsp)         # 8-byte Spill
	jmp	.LBB0_18
# Vector loop, 8 elements per iteration, index i in slot 152.
# Splat i into both quadwords, add <0,1> and <2,3>, pack each pair to
# two i32 values and store them at buf+4*i and buf+4*(i|2); then do the
# same for i+4.  i advances by 8 until it equals slot 200.
.LBB0_18:                               # %vector.body
                                        # =>This Inner Loop Header: Depth=1
	movq	152(%rsp), %rax         # 8-byte Reload
	movd	%rax, %xmm0
	movlhps	%xmm0, %xmm0            # xmm0 = xmm0[0,0]
	movl	$1, %ecx
	movl	%ecx, %edx
	movd	%rdx, %xmm1
	pslldq	$8, %xmm1
	movaps	%xmm0, %xmm2
	paddq	%xmm1, %xmm2
	movaps	.LCPI0_0(%rip), %xmm3
	paddq	%xmm3, %xmm0
	pshufd	$8, %xmm2, %xmm2        # xmm2 = xmm2[0,2,0,0]
	movq	296(%rsp), %rdx         # 8-byte Reload
	movq	%xmm2, (%rdx,%rax,4)
	movq	%rax, %rsi
	orq	$2, %rsi
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	%xmm0, (%rdx,%rsi,4)
	movq	%rax, %rsi
	addq	$4, %rsi
	movd	%rsi, %xmm0
	movlhps	%xmm0, %xmm0            # xmm0 = xmm0[0,0]
	movaps	%xmm0, %xmm2
	paddq	%xmm1, %xmm2
	paddq	%xmm3, %xmm0
	pshufd	$8, %xmm2, %xmm1        # xmm1 = xmm2[0,2,0,0]
	movq	%xmm1, 16(%rdx,%rax,4)
	movq	%rsi, %rax
	orq	$2, %rax
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	%xmm0, (%rdx,%rax,4)
	addq	$4, %rsi
	movq	200(%rsp), %rax         # 8-byte Reload
	cmpq	%rax, %rsi
	movq	%rsi, 152(%rsp)         # 8-byte Spill
	jne	.LBB0_18
# BB#19:                                # %middle.block.loopexit.unr-lcssa
	jmp	.LBB0_20
.LBB0_20:                               # %middle.block.loopexit
	movq	200(%rsp), %rax         # 8-byte Reload
	movq	%rax, 184(%rsp)         # 8-byte Spill
	jmp	.LBB0_21
# Middle block: slot 184 is the number of elements already written by
# vector code.  If it equals the total (slot 192) the fill is complete;
# otherwise the scalar path at .LBB0_5 resumes from that index (slot 272).
.LBB0_21:                               # %middle.block
	movq	184(%rsp), %rax         # 8-byte Reload
	movq	192(%rsp), %rcx         # 8-byte Reload
	cmpq	%rax, %rcx
	movq	%rax, 272(%rsp)         # 8-byte Spill
	je	.LBB0_24
	jmp	.LBB0_5
.LBB0_22:                               # %polly.cond6.i.loopexit.unr-lcssa
	jmp	.LBB0_23
.LBB0_23:                               # %polly.cond6.i.loopexit
	jmp	.LBB0_24
# After the first fill region: if n > -1 (signed) go to the summation,
# otherwise fall through into the second fill region.
.LBB0_24:                               # %polly.cond6.i
	movl	312(%rsp), %eax         # 4-byte Reload
	cmpl	$-1, %eax
	jg	.LBB0_47
# Second fill region (.LBB0_25 .. .LBB0_46).  It mirrors the first
# region block for block -- scalar setup, peeled stores, vector setup,
# peeled vector step, vector loop, middle block -- using spill slots
# 16..144(%rsp), and its exits lead to .LBB0_47.
.LBB0_25:                               # %polly.loop_header10.preheader.i
	movabsq	$0, %rax
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	$0, %rsi
	movq	%rax, %rdi
	cmovgq	%rsi, %rdi
	cmpq	$-1, %rdi
	movq	%rdi, 144(%rsp)         # 8-byte Spill
	movq	%rax, 136(%rsp)         # 8-byte Spill
	jne	.LBB0_34
.LBB0_26:                               # %polly.loop_header10.i.preheader
	movq	136(%rsp), %rax         # 8-byte Reload
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	%rsi, %rax
	cmovgq	%rax, %rsi
	addq	$1, %rsi
	subq	%rax, %rsi
	movq	%rsi, %rdi
	andq	$3, %rdi
	cmpq	$0, %rdi
	setne	%r8b
	cmpq	$0, %rsi
	sete	%r9b
	orb	%r8b, %r9b
	testb	$1, %r9b
	movq	%rax, %r10
	movq	%rax, 128(%rsp)         # 8-byte Spill
	movq	%rsi, 120(%rsp)         # 8-byte Spill
	movq	%rdi, 112(%rsp)         # 8-byte Spill
	movq	%r10, 104(%rsp)         # 8-byte Spill
	jne	.LBB0_27
	jmp	.LBB0_32
.LBB0_27:                               # %unr.cmp114
	movq	112(%rsp), %rax         # 8-byte Reload
	cmpq	$1, %rax
	movq	128(%rsp), %rcx         # 8-byte Reload
	movq	%rcx, 96(%rsp)          # 8-byte Spill
	je	.LBB0_31
# BB#28:                                # %unr.cmp
	movq	112(%rsp), %rax         # 8-byte Reload
	cmpq	$2, %rax
	movq	128(%rsp), %rcx         # 8-byte Reload
	movq	%rcx, 88(%rsp)          # 8-byte Spill
	je	.LBB0_30
# BB#29:                                # %polly.loop_header10.i.unr
	movq	128(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 88(%rsp)          # 8-byte Spill
.LBB0_30:                               # %polly.loop_header10.i.unr103
	movq	88(%rsp), %rax          # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 96(%rsp)          # 8-byte Spill
.LBB0_31:                               # %polly.loop_header10.i.unr108
	movq	96(%rsp), %rax          # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 104(%rsp)         # 8-byte Spill
.LBB0_32:                               # %polly.loop_header10.i.preheader.split
	movq	104(%rsp), %rax         # 8-byte Reload
	movq	120(%rsp), %rcx         # 8-byte Reload
	cmpq	$4, %rcx
	movq	%rax, 80(%rsp)          # 8-byte Spill
	jb	.LBB0_46
# BB#33:                                # %polly.loop_header10.i.preheader.split.split
	movq	80(%rsp), %rax          # 8-byte Reload
	movq	%rax, 72(%rsp)          # 8-byte Spill
	jmp	.LBB0_44
.LBB0_34:                               # %overflow.checked37
	movabsq	$0, %rax
	movq	144(%rsp), %rcx         # 8-byte Reload
	addq	$1, %rcx
	movq	144(%rsp), %rdx         # 8-byte Reload
	addq	$1, %rdx
	andq	$-4, %rcx
	cmpq	$0, %rcx
	movq	%rcx, 64(%rsp)          # 8-byte Spill
	movq	%rdx, 56(%rsp)          # 8-byte Spill
	movq	%rax, 48(%rsp)          # 8-byte Spill
	je	.LBB0_42
# BB#35:                                # %vector.body28.preheader
	movabsq	$0, %rax
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	$0, %rsi
	movq	%rax, %rdi
	cmovgq	%rsi, %rdi
	addq	$1, %rdi
	shrq	$2, %rdi
	shlq	$2, %rdi
	addq	$-4, %rdi
	shrq	$2, %rdi
	addq	$1, %rdi
	movq	%rdi, %rsi
	andq	$1, %rsi
	cmpq	$0, %rsi
	setne	%r8b
	cmpq	$0, %rdi
	sete	%r9b
	orb	%r8b, %r9b
	testb	$1, %r9b
	movq	%rdi, 40(%rsp)          # 8-byte Spill
	movq	%rax, 32(%rsp)          # 8-byte Spill
	jne	.LBB0_36
	jmp	.LBB0_37
# Peeled vector step of the second region: same instruction sequence as
# .LBB0_15, writing the i32 values 0,1 at buf+0 and 2,3 at buf+8.
.LBB0_36:                               # %vector.body28.unr
	movabsq	$4, %rax
	movl	$1, %ecx
	movl	%ecx, %edx
	movd	%rdx, %xmm0
	pslldq	$8, %xmm0
	movaps	.LCPI0_0(%rip), %xmm1
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	296(%rsp), %rdx         # 8-byte Reload
	movq	%xmm0, (%rdx)
	pshufd	$8, %xmm1, %xmm0        # xmm0 = xmm1[0,2,0,0]
	movq	%xmm0, 8(%rdx)
	addq	$0, %rax
	movq	%rax, 32(%rsp)          # 8-byte Spill
.LBB0_37:                               # %vector.body28.preheader.split
	movq	32(%rsp), %rax          # 8-byte Reload
	movq	40(%rsp), %rcx          # 8-byte Reload
	cmpq	$2, %rcx
	movq	%rax, 24(%rsp)          # 8-byte Spill
	jb	.LBB0_41
# BB#38:                                # %vector.body28.preheader.split.split
	movq	24(%rsp), %rax          # 8-byte Reload
	movq	%rax, 16(%rsp)          # 8-byte Spill
	jmp	.LBB0_39
.LBB0_39:                               # %vector.body28
                                        # =>This Inner Loop Header: Depth=1
	movq	16(%rsp), %rax          # 8-byte Reload
	movd	%rax, %xmm0
	movlhps	%xmm0, %xmm0            # xmm0 = xmm0[0,0]
	movl	$1, %ecx
	movl	%ecx, %edx
	movd	%rdx, %xmm1
	pslldq	$8, %xmm1
	movaps	%xmm0, %xmm2
	paddq	%xmm1, %xmm2
	movaps	.LCPI0_0(%rip), %xmm3
	paddq	%xmm3, %xmm0
	pshufd	$8, %xmm2, %xmm2        # xmm2 = xmm2[0,2,0,0]
	movq	296(%rsp), %rdx         # 8-byte Reload
	movq	%xmm2, (%rdx,%rax,4)
	movq	%rax, %rsi
	orq	$2, %rsi
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	%xmm0, (%rdx,%rsi,4)
	movq	%rax, %rsi
	addq	$4, %rsi
	movd	%rsi, %xmm0
	movlhps	%xmm0, %xmm0            # xmm0 = xmm0[0,0]
	movaps	%xmm0, %xmm2
	paddq	%xmm1, %xmm2
	paddq	%xmm3, %xmm0
	pshufd	$8, %xmm2, %xmm1        # xmm1 = xmm2[0,2,0,0]
	movq	%xmm1, 16(%rdx,%rax,4)
	movq	%rsi, %rax
	orq	$2, %rax
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	%xmm0, (%rdx,%rax,4)
	addq	$4, %rsi
	movq	64(%rsp), %rax          # 8-byte Reload
	cmpq	%rax, %rsi
	movq	%rsi, 16(%rsp)          # 8-byte Spill
	jne	.LBB0_39
# BB#40:                                # %middle.block29.loopexit.unr-lcssa
	jmp	.LBB0_41
.LBB0_41:                               # %middle.block29.loopexit
	movq	64(%rsp), %rax          # 8-byte Reload
	movq	%rax, 48(%rsp)          # 8-byte Spill
	jmp	.LBB0_42
.LBB0_42:                               # %middle.block29
	movq	48(%rsp), %rax          # 8-byte Reload
	movq	56(%rsp), %rcx          # 8-byte Reload
	cmpq	%rax, %rcx
	movq	%rax, 136(%rsp)         # 8-byte Spill
	je	.LBB0_47
	jmp	.LBB0_26
# Scalar loop of the first region, unrolled 4x: stores i, i+1, i+2, i+3
# (low 32 bits) at buf+4*i .. buf+4*i+12.  It repeats with i+4 while
# i+3 <= slot 288 (signed), i.e. while i+3 <= zext(n) - 2.
.LBB0_43:                               # %polly.loop_header.i
                                        # =>This Inner Loop Header: Depth=1
	movq	208(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	movq	%rax, %rsi
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 4(%rdx,%rax,4)
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 8(%rdx,%rax,4)
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 12(%rdx,%rax,4)
	movq	%rsi, %rax
	addq	$1, %rax
	movq	288(%rsp), %rdi         # 8-byte Reload
	cmpq	%rdi, %rsi
	movq	%rax, 208(%rsp)         # 8-byte Spill
	jg	.LBB0_22
	jmp	.LBB0_43
# Scalar loop of the second region: same body and same bound (slot 288)
# as .LBB0_43, with the index kept in slot 72.
.LBB0_44:                               # %polly.loop_header10.i
                                        # =>This Inner Loop Header: Depth=1
	movq	72(%rsp), %rax          # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	movq	%rax, %rsi
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 4(%rdx,%rax,4)
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 8(%rdx,%rax,4)
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 12(%rdx,%rax,4)
	movq	%rsi, %rax
	addq	$1, %rax
	movq	288(%rsp), %rdi         # 8-byte Reload
	cmpq	%rdi, %rsi
	movq	%rax, 72(%rsp)          # 8-byte Spill
	jle	.LBB0_44
# BB#45:                                # %vector.body57.preheader.loopexit.unr-lcssa
	jmp	.LBB0_46
.LBB0_46:                               # %vector.body57.preheader.loopexit
	jmp	.LBB0_47
.LBB0_47:                               # %vector.body57.preheader
	jmp	.LBB0_48
# First summation.  Loads 25 unaligned 16-byte vectors at buf+0 ..
# buf+384 (100 i32 values, independent of n) and accumulates them with
# paddd.  The two pshufd/paddd pairs fold the four lanes into lane 0,
# which is passed to printf in %esi with the format address .L.str in
# %edi.  %al is set to 0 before the variadic call.
.LBB0_48:                               # %vector.body57
	movq	296(%rsp), %rax         # 8-byte Reload
	movups	(%rax), %xmm0
	movups	16(%rax), %xmm1
	movups	32(%rax), %xmm2
	movups	48(%rax), %xmm3
	paddd	%xmm0, %xmm1
	paddd	%xmm1, %xmm2
	paddd	%xmm2, %xmm3
	movups	64(%rax), %xmm0
	paddd	%xmm3, %xmm0
	movups	80(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	96(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	112(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	128(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	144(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	160(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	176(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	192(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	208(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	224(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	240(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	256(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	272(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	288(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	304(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	320(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	336(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	352(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	368(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	384(%rax), %xmm0
	paddd	%xmm1, %xmm0
	pshufd	$14, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,0]
	paddd	%xmm1, %xmm0
	pshufd	$1, %xmm0, %xmm1        # xmm1 = xmm0[1,0,0,0]
	paddd	%xmm1, %xmm0
	movd	%xmm0, %esi
	movl	$.L.str, %ecx
	movl	%ecx, %edi
	xorl	%ecx, %ecx
	movb	%cl, %dl
	movb	%dl, %al
	callq	printf
	movl	%eax, 12(%rsp)          # 4-byte Spill
# Second summation: the same load/add/reduce sequence over the same 100
# values, printed with the format at .L.str1.  Afterwards the malloc
# pointer saved in slot 304 is passed to free and main returns 0.
# BB#49:                                # %vector.body76
	movq	296(%rsp), %rax         # 8-byte Reload
	movups	(%rax), %xmm0
	movups	16(%rax), %xmm1
	movups	32(%rax), %xmm2
	movups	48(%rax), %xmm3
	paddd	%xmm0, %xmm1
	paddd	%xmm1, %xmm2
	paddd	%xmm2, %xmm3
	movups	64(%rax), %xmm0
	paddd	%xmm3, %xmm0
	movups	80(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	96(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	112(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	128(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	144(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	160(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	176(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	192(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	208(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	224(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	240(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	256(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	272(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	288(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	304(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	320(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	336(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	352(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	368(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	384(%rax), %xmm0
	paddd	%xmm1, %xmm0
	pshufd	$14, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,0]
	paddd	%xmm1, %xmm0
	pshufd	$1, %xmm0, %xmm1        # xmm1 = xmm0[1,0,0,0]
	paddd	%xmm1, %xmm0
	movd	%xmm0, %esi
	movl	$.L.str1, %ecx
	movl	%ecx, %edi
	xorl	%ecx, %ecx
	movb	%cl, %dl
	movb	%dl, %al
	callq	printf
	movq	304(%rsp), %rdi         # 8-byte Reload
	movl	%eax, 8(%rsp)           # 4-byte Spill
	callq	free
	movl	$0, %eax
	addq	$328, %rsp              # imm = 0x148
	retq
.Ltmp1:
	.size	main, .Ltmp1-main
	.cfi_endproc

	.type	.L.str,@object          # @.str
# Both printf format strings live in the mergeable, NUL-terminated
# string section (entry size 1).  Each .size of 11 counts the ten
# visible characters plus the terminating NUL emitted by .asciz.
	.section	.rodata.str1.1,"aMS",@progbits,1
.L.str:
	.asciz	"Sum1 = %d\n"
	.size	.L.str, 11

	.type	.L.str1,@object         # @.str1
.L.str1:
	.asciz	"Sum2 = %d\n"
	.size	.L.str1, 11

	.ident	"clang version 3.5.0 "
# Empty .note.GNU-stack section: marks the object as not needing an
# executable stack.
	.section	".note.GNU-stack","",@progbits
-------------- next part --------------
	.text
	.file	"out.ll"
# Constant pool: one 16-byte mergeable entry holding the two quadwords
# 2 and 3.  main loads it with movaps .LCPI0_0(%rip), so the 16-byte
# alignment below is required.
	.section	.rodata.cst16,"aM",@progbits,16
	.align	16
.LCPI0_0:
	.quad	2                       # 0x2
	.quad	3                       # 0x3
# Back to code: export main as a function symbol, entry aligned to 16
# bytes with 0x90 padding.
	.text
	.globl	main
	.align	16, 0x90
	.type	main,@function
main:                                   # @main
	.cfi_startproc
# BB#0:                                 # %entry
	subq	$328, %rsp              # imm = 0x148
.Ltmp0:
	.cfi_def_cfa_offset 336
	movl	$100, %eax
	cmpl	$2, %edi
	movq	%rsi, 320(%rsp)         # 8-byte Spill
	movl	%eax, 316(%rsp)         # 4-byte Spill
	jl	.LBB0_2
# BB#1:                                 # %cond.false
	movq	320(%rsp), %rax         # 8-byte Reload
	movq	8(%rax), %rdi
	xorl	%ecx, %ecx
	movl	%ecx, %esi
	movl	$10, %edx
	callq	strtol
	movl	%eax, %ecx
	movl	%ecx, 316(%rsp)         # 4-byte Spill
.LBB0_2:                                # %cond.end
	movl	316(%rsp), %eax         # 4-byte Reload
	movslq	%eax, %rcx
	shlq	$2, %rcx
	movq	%rcx, %rdi
	movl	%eax, 312(%rsp)         # 4-byte Spill
	callq	malloc
	movq	%rax, %rcx
	movl	312(%rsp), %edx         # 4-byte Reload
	cmpl	$0, %edx
	movq	%rax, 304(%rsp)         # 8-byte Spill
	movq	%rcx, 296(%rsp)         # 8-byte Spill
	je	.LBB0_47
# BB#3:                                 # %polly.cond3.i
	movl	312(%rsp), %eax         # 4-byte Reload
	movl	%eax, %ecx
	movl	%ecx, %edx
	addq	$-2, %rdx
	cmpl	$1, %eax
	movq	%rdx, 288(%rsp)         # 8-byte Spill
	jl	.LBB0_25
# BB#4:                                 # %polly.loop_header.i.preheader
	movabsq	$0, %rax
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	$0, %rsi
	movq	%rax, %rdi
	cmovgq	%rsi, %rdi
	cmpq	$-1, %rdi
	movq	%rdi, 280(%rsp)         # 8-byte Spill
	movq	%rax, 272(%rsp)         # 8-byte Spill
	jne	.LBB0_13
.LBB0_5:                                # %polly.loop_header.i.preheader100
	movq	272(%rsp), %rax         # 8-byte Reload
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	%rsi, %rax
	cmovgq	%rax, %rsi
	addq	$1, %rsi
	subq	%rax, %rsi
	movq	%rsi, %rdi
	andq	$3, %rdi
	cmpq	$0, %rdi
	setne	%r8b
	cmpq	$0, %rsi
	sete	%r9b
	orb	%r8b, %r9b
	testb	$1, %r9b
	movq	%rax, %r10
	movq	%rax, 264(%rsp)         # 8-byte Spill
	movq	%rsi, 256(%rsp)         # 8-byte Spill
	movq	%rdi, 248(%rsp)         # 8-byte Spill
	movq	%r10, 240(%rsp)         # 8-byte Spill
	jne	.LBB0_6
	jmp	.LBB0_11
.LBB0_6:                                # %unr.cmp140
	movq	248(%rsp), %rax         # 8-byte Reload
	cmpq	$1, %rax
	movq	264(%rsp), %rcx         # 8-byte Reload
	movq	%rcx, 232(%rsp)         # 8-byte Spill
	je	.LBB0_10
# BB#7:                                 # %unr.cmp132
	movq	248(%rsp), %rax         # 8-byte Reload
	cmpq	$2, %rax
	movq	264(%rsp), %rcx         # 8-byte Reload
	movq	%rcx, 224(%rsp)         # 8-byte Spill
	je	.LBB0_9
# BB#8:                                 # %polly.loop_header.i.unr
	movq	264(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 224(%rsp)         # 8-byte Spill
.LBB0_9:                                # %polly.loop_header.i.unr127
	movq	224(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 232(%rsp)         # 8-byte Spill
.LBB0_10:                               # %polly.loop_header.i.unr134
	movq	232(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 240(%rsp)         # 8-byte Spill
.LBB0_11:                               # %polly.loop_header.i.preheader100.split
	movq	240(%rsp), %rax         # 8-byte Reload
	movq	256(%rsp), %rcx         # 8-byte Reload
	cmpq	$4, %rcx
	movq	%rax, 216(%rsp)         # 8-byte Spill
	jb	.LBB0_23
# BB#12:                                # %polly.loop_header.i.preheader100.split.split
	movq	216(%rsp), %rax         # 8-byte Reload
	movq	%rax, 208(%rsp)         # 8-byte Spill
	jmp	.LBB0_43
.LBB0_13:                               # %overflow.checked
	movabsq	$0, %rax
	movq	280(%rsp), %rcx         # 8-byte Reload
	addq	$1, %rcx
	movq	280(%rsp), %rdx         # 8-byte Reload
	addq	$1, %rdx
	andq	$-4, %rcx
	cmpq	$0, %rcx
	movq	%rcx, 200(%rsp)         # 8-byte Spill
	movq	%rdx, 192(%rsp)         # 8-byte Spill
	movq	%rax, 184(%rsp)         # 8-byte Spill
	je	.LBB0_21
# BB#14:                                # %vector.body.preheader
	movabsq	$0, %rax
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	$0, %rsi
	movq	%rax, %rdi
	cmovgq	%rsi, %rdi
	addq	$1, %rdi
	shrq	$2, %rdi
	shlq	$2, %rdi
	addq	$-4, %rdi
	shrq	$2, %rdi
	addq	$1, %rdi
	movq	%rdi, %rsi
	andq	$1, %rsi
	cmpq	$0, %rsi
	setne	%r8b
	cmpq	$0, %rdi
	sete	%r9b
	orb	%r8b, %r9b
	testb	$1, %r9b
	movq	%rdi, 176(%rsp)         # 8-byte Spill
	movq	%rax, 168(%rsp)         # 8-byte Spill
	jne	.LBB0_15
	jmp	.LBB0_16
.LBB0_15:                               # %vector.body.unr
	movabsq	$4, %rax
	movl	$2, %ecx
	movl	%ecx, %edx
	movd	%rdx, %xmm0
	xorps	%xmm1, %xmm1
	movq	296(%rsp), %rdx         # 8-byte Reload
	movq	%xmm1, (%rdx)
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	%xmm0, 8(%rdx)
	addq	$0, %rax
	movq	%rax, 168(%rsp)         # 8-byte Spill
.LBB0_16:                               # %vector.body.preheader.split
	movq	168(%rsp), %rax         # 8-byte Reload
	movq	176(%rsp), %rcx         # 8-byte Reload
	cmpq	$2, %rcx
	movq	%rax, 160(%rsp)         # 8-byte Spill
	jb	.LBB0_20
# BB#17:                                # %vector.body.preheader.split.split
	movq	160(%rsp), %rax         # 8-byte Reload
	movq	%rax, 152(%rsp)         # 8-byte Spill
	jmp	.LBB0_18
.LBB0_18:                               # %vector.body
                                        # =>This Inner Loop Header: Depth=1
	movq	152(%rsp), %rax         # 8-byte Reload
	movd	%rax, %xmm0
	movlhps	%xmm0, %xmm0            # xmm0 = xmm0[0,0]
	movl	$1, %ecx
	movl	%ecx, %edx
	movd	%rdx, %xmm1
	pslldq	$8, %xmm1
	movaps	%xmm0, %xmm2
	paddq	%xmm1, %xmm2
	movaps	.LCPI0_0(%rip), %xmm3
	paddq	%xmm3, %xmm0
	pshufd	$8, %xmm2, %xmm2        # xmm2 = xmm2[0,2,0,0]
	movq	296(%rsp), %rdx         # 8-byte Reload
	movq	%xmm2, (%rdx,%rax,4)
	movq	%rax, %rsi
	orq	$2, %rsi
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	%xmm0, (%rdx,%rsi,4)
	movq	%rax, %rsi
	addq	$4, %rsi
	movd	%rsi, %xmm0
	movlhps	%xmm0, %xmm0            # xmm0 = xmm0[0,0]
	movaps	%xmm0, %xmm2
	paddq	%xmm1, %xmm2
	paddq	%xmm3, %xmm0
	pshufd	$8, %xmm2, %xmm1        # xmm1 = xmm2[0,2,0,0]
	movq	%xmm1, 16(%rdx,%rax,4)
	movq	%rsi, %rax
	orq	$2, %rax
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	%xmm0, (%rdx,%rax,4)
	addq	$4, %rsi
	movq	200(%rsp), %rax         # 8-byte Reload
	cmpq	%rax, %rsi
	movq	%rsi, 152(%rsp)         # 8-byte Spill
	jne	.LBB0_18
# BB#19:                                # %middle.block.loopexit.unr-lcssa
	jmp	.LBB0_20
.LBB0_20:                               # %middle.block.loopexit
	movq	200(%rsp), %rax         # 8-byte Reload
	movq	%rax, 184(%rsp)         # 8-byte Spill
	jmp	.LBB0_21
.LBB0_21:                               # %middle.block
	movq	184(%rsp), %rax         # 8-byte Reload
	movq	192(%rsp), %rcx         # 8-byte Reload
	cmpq	%rax, %rcx
	movq	%rax, 272(%rsp)         # 8-byte Spill
	je	.LBB0_24
	jmp	.LBB0_5
.LBB0_22:                               # %polly.cond6.i.loopexit.unr-lcssa
	jmp	.LBB0_23
.LBB0_23:                               # %polly.cond6.i.loopexit
	jmp	.LBB0_24
.LBB0_24:                               # %polly.cond6.i
	movl	312(%rsp), %eax         # 4-byte Reload
	cmpl	$-1, %eax
	jg	.LBB0_47
.LBB0_25:                               # %polly.loop_header10.preheader.i
	movabsq	$0, %rax
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	$0, %rsi
	movq	%rax, %rdi
	cmovgq	%rsi, %rdi
	cmpq	$-1, %rdi
	movq	%rdi, 144(%rsp)         # 8-byte Spill
	movq	%rax, 136(%rsp)         # 8-byte Spill
	jne	.LBB0_34
.LBB0_26:                               # %polly.loop_header10.i.preheader
	movq	136(%rsp), %rax         # 8-byte Reload
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	%rsi, %rax
	cmovgq	%rax, %rsi
	addq	$1, %rsi
	subq	%rax, %rsi
	movq	%rsi, %rdi
	andq	$3, %rdi
	cmpq	$0, %rdi
	setne	%r8b
	cmpq	$0, %rsi
	sete	%r9b
	orb	%r8b, %r9b
	testb	$1, %r9b
	movq	%rax, %r10
	movq	%rax, 128(%rsp)         # 8-byte Spill
	movq	%rsi, 120(%rsp)         # 8-byte Spill
	movq	%rdi, 112(%rsp)         # 8-byte Spill
	movq	%r10, 104(%rsp)         # 8-byte Spill
	jne	.LBB0_27
	jmp	.LBB0_32
.LBB0_27:                               # %unr.cmp114
	movq	112(%rsp), %rax         # 8-byte Reload
	cmpq	$1, %rax
	movq	128(%rsp), %rcx         # 8-byte Reload
	movq	%rcx, 96(%rsp)          # 8-byte Spill
	je	.LBB0_31
# BB#28:                                # %unr.cmp
	movq	112(%rsp), %rax         # 8-byte Reload
	cmpq	$2, %rax
	movq	128(%rsp), %rcx         # 8-byte Reload
	movq	%rcx, 88(%rsp)          # 8-byte Spill
	je	.LBB0_30
# BB#29:                                # %polly.loop_header10.i.unr
	movq	128(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 88(%rsp)          # 8-byte Spill
.LBB0_30:                               # %polly.loop_header10.i.unr103
	movq	88(%rsp), %rax          # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 96(%rsp)          # 8-byte Spill
.LBB0_31:                               # %polly.loop_header10.i.unr108
	movq	96(%rsp), %rax          # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	addq	$1, %rax
	movq	%rax, 104(%rsp)         # 8-byte Spill
.LBB0_32:                               # %polly.loop_header10.i.preheader.split
	movq	104(%rsp), %rax         # 8-byte Reload
	movq	120(%rsp), %rcx         # 8-byte Reload
	cmpq	$4, %rcx
	movq	%rax, 80(%rsp)          # 8-byte Spill
	jb	.LBB0_46
# BB#33:                                # %polly.loop_header10.i.preheader.split.split
	movq	80(%rsp), %rax          # 8-byte Reload
	movq	%rax, 72(%rsp)          # 8-byte Spill
	jmp	.LBB0_44
.LBB0_34:                               # %overflow.checked37
	movabsq	$0, %rax
	movq	144(%rsp), %rcx         # 8-byte Reload
	addq	$1, %rcx
	movq	144(%rsp), %rdx         # 8-byte Reload
	addq	$1, %rdx
	andq	$-4, %rcx
	cmpq	$0, %rcx
	movq	%rcx, 64(%rsp)          # 8-byte Spill
	movq	%rdx, 56(%rsp)          # 8-byte Spill
	movq	%rax, 48(%rsp)          # 8-byte Spill
	je	.LBB0_42
# BB#35:                                # %vector.body28.preheader
	movabsq	$0, %rax
	movl	312(%rsp), %ecx         # 4-byte Reload
	movl	%ecx, %edx
	movl	%edx, %esi
	addq	$-1, %rsi
	cmpq	$0, %rsi
	movq	%rax, %rdi
	cmovgq	%rsi, %rdi
	addq	$1, %rdi
	shrq	$2, %rdi
	shlq	$2, %rdi
	addq	$-4, %rdi
	shrq	$2, %rdi
	addq	$1, %rdi
	movq	%rdi, %rsi
	andq	$1, %rsi
	cmpq	$0, %rsi
	setne	%r8b
	cmpq	$0, %rdi
	sete	%r9b
	orb	%r8b, %r9b
	testb	$1, %r9b
	movq	%rdi, 40(%rsp)          # 8-byte Spill
	movq	%rax, 32(%rsp)          # 8-byte Spill
	jne	.LBB0_36
	jmp	.LBB0_37
.LBB0_36:                               # %vector.body28.unr
	movabsq	$4, %rax
	movl	$1, %ecx
	movl	%ecx, %edx
	movd	%rdx, %xmm0
	pslldq	$8, %xmm0
	movaps	.LCPI0_0(%rip), %xmm1
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	296(%rsp), %rdx         # 8-byte Reload
	movq	%xmm0, (%rdx)
	pshufd	$8, %xmm1, %xmm0        # xmm0 = xmm1[0,2,0,0]
	movq	%xmm0, 8(%rdx)
	addq	$0, %rax
	movq	%rax, 32(%rsp)          # 8-byte Spill
.LBB0_37:                               # %vector.body28.preheader.split
	movq	32(%rsp), %rax          # 8-byte Reload
	movq	40(%rsp), %rcx          # 8-byte Reload
	cmpq	$2, %rcx
	movq	%rax, 24(%rsp)          # 8-byte Spill
	jb	.LBB0_41
# BB#38:                                # %vector.body28.preheader.split.split
	movq	24(%rsp), %rax          # 8-byte Reload
	movq	%rax, 16(%rsp)          # 8-byte Spill
	jmp	.LBB0_39
.LBB0_39:                               # %vector.body28
                                        # =>This Inner Loop Header: Depth=1
	movq	16(%rsp), %rax          # 8-byte Reload
	movd	%rax, %xmm0
	movlhps	%xmm0, %xmm0            # xmm0 = xmm0[0,0]
	movl	$1, %ecx
	movl	%ecx, %edx
	movd	%rdx, %xmm1
	pslldq	$8, %xmm1
	movaps	%xmm0, %xmm2
	paddq	%xmm1, %xmm2
	movaps	.LCPI0_0(%rip), %xmm3
	paddq	%xmm3, %xmm0
	pshufd	$8, %xmm2, %xmm2        # xmm2 = xmm2[0,2,0,0]
	movq	296(%rsp), %rdx         # 8-byte Reload
	movq	%xmm2, (%rdx,%rax,4)
	movq	%rax, %rsi
	orq	$2, %rsi
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	%xmm0, (%rdx,%rsi,4)
	movq	%rax, %rsi
	addq	$4, %rsi
	movd	%rsi, %xmm0
	movlhps	%xmm0, %xmm0            # xmm0 = xmm0[0,0]
	movaps	%xmm0, %xmm2
	paddq	%xmm1, %xmm2
	paddq	%xmm3, %xmm0
	pshufd	$8, %xmm2, %xmm1        # xmm1 = xmm2[0,2,0,0]
	movq	%xmm1, 16(%rdx,%rax,4)
	movq	%rsi, %rax
	orq	$2, %rax
	pshufd	$8, %xmm0, %xmm0        # xmm0 = xmm0[0,2,0,0]
	movq	%xmm0, (%rdx,%rax,4)
	addq	$4, %rsi
	movq	64(%rsp), %rax          # 8-byte Reload
	cmpq	%rax, %rsi
	movq	%rsi, 16(%rsp)          # 8-byte Spill
	jne	.LBB0_39
# BB#40:                                # %middle.block29.loopexit.unr-lcssa
	jmp	.LBB0_41
.LBB0_41:                               # %middle.block29.loopexit
	movq	64(%rsp), %rax          # 8-byte Reload
	movq	%rax, 48(%rsp)          # 8-byte Spill
	jmp	.LBB0_42
.LBB0_42:                               # %middle.block29
	movq	48(%rsp), %rax          # 8-byte Reload
	movq	56(%rsp), %rcx          # 8-byte Reload
	cmpq	%rax, %rcx
	movq	%rax, 136(%rsp)         # 8-byte Spill
	je	.LBB0_47
	jmp	.LBB0_26
.LBB0_43:                               # %polly.loop_header.i
                                        # =>This Inner Loop Header: Depth=1
	movq	208(%rsp), %rax         # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	movq	%rax, %rsi
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 4(%rdx,%rax,4)
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 8(%rdx,%rax,4)
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 12(%rdx,%rax,4)
	movq	%rsi, %rax
	addq	$1, %rax
	movq	288(%rsp), %rdi         # 8-byte Reload
	cmpq	%rdi, %rsi
	movq	%rax, 208(%rsp)         # 8-byte Spill
	jg	.LBB0_22
	jmp	.LBB0_43
.LBB0_44:                               # %polly.loop_header10.i
                                        # =>This Inner Loop Header: Depth=1
	movq	72(%rsp), %rax          # 8-byte Reload
	movl	%eax, %ecx
	movq	296(%rsp), %rdx         # 8-byte Reload
	movl	%ecx, (%rdx,%rax,4)
	movq	%rax, %rsi
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 4(%rdx,%rax,4)
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 8(%rdx,%rax,4)
	addq	$1, %rsi
	movl	%esi, %ecx
	movl	%ecx, 12(%rdx,%rax,4)
	movq	%rsi, %rax
	addq	$1, %rax
	movq	288(%rsp), %rdi         # 8-byte Reload
	cmpq	%rdi, %rsi
	movq	%rax, 72(%rsp)          # 8-byte Spill
	jle	.LBB0_44
# BB#45:                                # %vector.body57.preheader.loopexit.unr-lcssa
	jmp	.LBB0_46
.LBB0_46:                               # %vector.body57.preheader.loopexit
	jmp	.LBB0_47
.LBB0_47:                               # %vector.body57.preheader
	jmp	.LBB0_48
.LBB0_48:                               # %vector.body57
	movq	296(%rsp), %rax         # 8-byte Reload
	movups	(%rax), %xmm0
	movups	16(%rax), %xmm1
	movups	32(%rax), %xmm2
	movups	48(%rax), %xmm3
	paddd	%xmm0, %xmm1
	paddd	%xmm1, %xmm2
	paddd	%xmm2, %xmm3
	movups	64(%rax), %xmm0
	paddd	%xmm3, %xmm0
	movups	80(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	96(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	112(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	128(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	144(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	160(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	176(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	192(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	208(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	224(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	240(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	256(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	272(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	288(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	304(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	320(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	336(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	352(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	368(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	384(%rax), %xmm0
	paddd	%xmm1, %xmm0
	pshufd	$14, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,0]
	paddd	%xmm1, %xmm0
	pshufd	$1, %xmm0, %xmm1        # xmm1 = xmm0[1,0,0,0]
	paddd	%xmm1, %xmm0
	movd	%xmm0, %esi
	movl	$.L.str, %ecx
	movl	%ecx, %edi
	xorl	%ecx, %ecx
	movb	%cl, %dl
	movb	%dl, %al
	callq	printf
	movl	%eax, 12(%rsp)          # 4-byte Spill
# BB#49:                                # %vector.body76
	movq	296(%rsp), %rax         # 8-byte Reload
	movups	(%rax), %xmm0
	movups	16(%rax), %xmm1
	movups	32(%rax), %xmm2
	movups	48(%rax), %xmm3
	paddd	%xmm0, %xmm1
	paddd	%xmm1, %xmm2
	paddd	%xmm2, %xmm3
	movups	64(%rax), %xmm0
	paddd	%xmm3, %xmm0
	movups	80(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	96(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	112(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	128(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	144(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	160(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	176(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	192(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	208(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	224(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	240(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	256(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	272(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	288(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	304(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	320(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	336(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	352(%rax), %xmm0
	paddd	%xmm1, %xmm0
	movups	368(%rax), %xmm1
	paddd	%xmm0, %xmm1
	movups	384(%rax), %xmm0
	paddd	%xmm1, %xmm0
	pshufd	$14, %xmm0, %xmm1       # xmm1 = xmm0[2,3,0,0]
	paddd	%xmm1, %xmm0
	pshufd	$1, %xmm0, %xmm1        # xmm1 = xmm0[1,0,0,0]
	paddd	%xmm1, %xmm0
	movd	%xmm0, %esi
	movl	$.L.str1, %ecx
	movl	%ecx, %edi
	xorl	%ecx, %ecx
	movb	%cl, %dl
	movb	%dl, %al
	callq	printf
	movq	304(%rsp), %rdi         # 8-byte Reload
	movl	%eax, 8(%rsp)           # 4-byte Spill
	callq	free
	movl	$0, %eax
	addq	$328, %rsp              # imm = 0x148
	retq
.Ltmp1:
	.size	main, .Ltmp1-main
	.cfi_endproc

	.type	.L.str,@object          # @.str: format string main passes to its first printf call
	.section	.rodata.str1.1,"aMS",@progbits,1 # read-only, mergeable NUL-terminated strings, entry size 1
.L.str:
	.asciz	"Sum1 = %d\n"
	.size	.L.str, 11              # 10 characters plus the terminating NUL

	.type	.L.str1,@object         # @.str1: format string main passes to its second printf call
.L.str1:
	.asciz	"Sum2 = %d\n"
	.size	.L.str1, 11             # 10 characters plus the terminating NUL

	.ident	"clang version 3.5.0 "  # producer tag recorded in the object file
	.section	".note.GNU-stack","",@progbits # empty marker section; by GNU toolchain convention it signals that no executable stack is needed