[LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?

Hal Finkel hfinkel at anl.gov
Sun Dec 28 14:23:36 PST 2014


----- Original Message -----
> From: "Hal Finkel" <hfinkel at anl.gov>
> To: "Josh Klontz" <josh.klontz at gmail.com>
> Cc: "Dev" <llvmdev at cs.uiuc.edu>
> Sent: Sunday, December 28, 2014 4:21:51 PM
> Subject: Re: [LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?
> 
> ----- Original Message -----
> > From: "Josh Klontz" <josh.klontz at gmail.com>
> > To: "Dev" <llvmdev at cs.uiuc.edu>
> > Sent: Friday, December 26, 2014 8:27:43 AM
> > Subject: [LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?
> > 
> > Using LLVM ToT and Hal's helpful slide deck [1], I've been trying to
> > use `llvm.assume` to communicate pointer alignment guarantees to
> > vector load and store instructions. For example, in [2] %5 and %9
> > are guaranteed to be 32-byte aligned. However, if I run this IR
> > through `opt -O3 -datalayout -S`, the vectorized loads and stores
> > are still 1-byte aligned [3]. What's going wrong? Do I have to move
> > the `llvm.assume` into the loop body?
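> >
> > For quick reference, the alignment-assumption pattern, distilled from
> > the full IR in [2] below (and likewise for %9), is:
> >
> >   %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
> >   %6 = ptrtoint i8* %5 to i64
> >   %7 = and i64 %6, 31
> >   %8 = icmp eq i64 %7, 0
> >   tail call void @llvm.assume(i1 %8) ; i.e. %5 is 32-byte aligned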
> 
> Hi John,

[Err, Josh. Sorry about that].

 -Hal

> 
> The problem is that you're asserting an alignment fact about:
>   %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
> 
> and you want this to apply to pointers derived from this value within
> the loop:
>   %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index
> 
> I'm pretty sure we currently only look 'up' the use/def chain for
> alignment facts, and so nothing triggers because %17 is derived from
> %0, and there is no alignment fact asserted directly on %0.
> 
> Can you please file a bug report about this (at
> http://llvm.org/bugs/)? I think that we can likely fix this.
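>
> In the meantime, here is an untested sketch of a possible workaround
> (not a confirmed fix): derive the in-loop pointers directly from the
> values the assumes are attached to, so that walking up the use/def
> chain from the loop's load/store pointer operands reaches the
> alignment facts. Reusing the names from [2], with %src.base, %dst.base,
> %src.y and %dst.y as illustrative names:
>
>   ; attach the assumes to the i8* base pointers themselves
>   %src.base = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
>   ; ... ptrtoint / and 31 / icmp eq 0 / @llvm.assume on %src.base, as in [2] ...
>   %dst.base = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
>   ; ... ptrtoint / and 31 / icmp eq 0 / @llvm.assume on %dst.base, as in [2] ...
>
> x_body:                                 ; preds = %x_body, %entry
>   %y = phi i64 [ 0, %entry ], [ %y_increment, %x_body ]
>   ; index off the asserted base pointers, not off %0 and %3
>   %src.y = getelementptr i8* %src.base, i64 %y
>   %15 = load i8* %src.y, align 1, !llvm.mem.parallel_loop_access !1
>   %.lobit = lshr i8 %15, 7
>   %dst.y = getelementptr i8* %dst.base, i64 %y
>   store i8 %.lobit, i8* %dst.y, align 1, !llvm.mem.parallel_loop_access !1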
> 
>  -Hal
> 
> > 
> > 
> > v/r,
> > Josh
> > 
> > 
> > 
> > 
> > 
> > [1]
> > http://llvm.org/devmtg/2014-10/Slides/Finkel-IntrinsicsMetadataAttributes.pdf
> > 
> > 
> > [2]
> > ; ModuleID = 'align.ll'
> > 
> > 
> > %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] }
> > 
> > 
> > ; Function Attrs: noduplicate nounwind readonly
> > declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0
> > 
> > 
> > ; Function Attrs: nounwind
> > declare void @llvm.assume(i1) #1
> > 
> > 
> > ; Function Attrs: nounwind
> > define %u8XY* @benchmark(%u8XY*) #1 {
> > entry:
> > %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3
> > %columns = load i32* %1, align 4, !range !0
> > %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4
> > %rows = load i32* %2, align 4, !range !0
> > %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32 %rows, i32 1, i8* null)
> > %4 = zext i32 %rows to i64
> > %dst_y_step = zext i32 %columns to i64
> > %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
> > %6 = ptrtoint i8* %5 to i64
> > %7 = and i64 %6, 31
> > %8 = icmp eq i64 %7, 0
> > tail call void @llvm.assume(i1 %8)
> > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
> > %10 = ptrtoint i8* %9 to i64
> > %11 = and i64 %10, 31
> > %12 = icmp eq i64 %11, 0
> > tail call void @llvm.assume(i1 %12)
> > %13 = mul nuw nsw i64 %4, %dst_y_step
> > br label %x_body
> > 
> > 
> > x_body: ; preds = %x_body, %entry
> > %y = phi i64 [ 0, %entry ], [ %y_increment, %x_body ]
> > %14 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y
> > %15 = load i8* %14, align 1, !llvm.mem.parallel_loop_access !1
> > %.lobit = lshr i8 %15, 7
> > %16 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y
> > store i8 %.lobit, i8* %16, align 1, !llvm.mem.parallel_loop_access !1
> > %y_increment = add nuw nsw i64 %y, 1
> > %y_postcondition = icmp eq i64 %y_increment, %13
> > br i1 %y_postcondition, label %y_exit, label %x_body, !llvm.loop !2
> > 
> > 
> > y_exit: ; preds = %x_body
> > ret %u8XY* %3
> > }
> > 
> > 
> > attributes #0 = { noduplicate nounwind readonly }
> > attributes #1 = { nounwind }
> > 
> > 
> > !0 = !{i32 1, i32 -1}
> > !1 = !{!1}
> > !2 = !{!2}
> > 
> > 
> > [3]
> > ; ModuleID = 'align.ll'
> > 
> > 
> > %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] }
> > 
> > 
> > ; Function Attrs: noduplicate nounwind readonly
> > declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0
> > 
> > 
> > ; Function Attrs: nounwind
> > declare void @llvm.assume(i1) #1
> > 
> > 
> > ; Function Attrs: nounwind
> > define %u8XY* @benchmark(%u8XY*) #1 {
> > entry:
> > %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3
> > %columns = load i32* %1, align 4, !range !0
> > %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4
> > %rows = load i32* %2, align 4, !range !0
> > %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32 %rows, i32 1, i8* null)
> > %4 = zext i32 %rows to i64
> > %dst_y_step = zext i32 %columns to i64
> > %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
> > %6 = ptrtoint i8* %5 to i64
> > %7 = and i64 %6, 31
> > %8 = icmp eq i64 %7, 0
> > tail call void @llvm.assume(i1 %8)
> > %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
> > %10 = ptrtoint i8* %9 to i64
> > %11 = and i64 %10, 31
> > %12 = icmp eq i64 %11, 0
> > tail call void @llvm.assume(i1 %12)
> > %13 = mul nuw nsw i64 %4, %dst_y_step
> > %14 = zext i32 %rows to i64
> > %15 = zext i32 %columns to i64
> > %16 = mul nuw i64 %14, %15
> > %n.vec = and i64 %16, -4
> > %cmp.zero = icmp eq i64 %n.vec, 0
> > br i1 %cmp.zero, label %middle.block, label %vector.body.preheader
> > 
> > 
> > vector.body.preheader: ; preds = %entry
> > br label %vector.body
> > 
> > 
> > vector.body: ; preds = %vector.body.preheader, %vector.body
> > %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ]
> > %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index
> > %18 = bitcast i8* %17 to <4 x i8>*
> > %wide.load = load <4 x i8>* %18, align 1
> > %19 = lshr <4 x i8> %wide.load, <i8 7, i8 7, i8 7, i8 7>
> > %20 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %index
> > %21 = bitcast i8* %20 to <4 x i8>*
> > store <4 x i8> %19, <4 x i8>* %21, align 1
> > %index.next = add i64 %index, 4
> > %22 = icmp eq i64 %index.next, %n.vec
> > br i1 %22, label %middle.block.loopexit, label %vector.body, !llvm.loop !1
> > 
> > 
> > middle.block.loopexit: ; preds = %vector.body
> > br label %middle.block
> > 
> > 
> > middle.block: ; preds = %middle.block.loopexit, %entry
> > %resume.val = phi i64 [ 0, %entry ], [ %n.vec, %middle.block.loopexit ]
> > %cmp.n = icmp eq i64 %16, %resume.val
> > br i1 %cmp.n, label %y_exit, label %x_body.preheader
> > 
> > 
> > x_body.preheader: ; preds = %middle.block
> > br label %x_body
> > 
> > 
> > x_body: ; preds = %x_body.preheader, %x_body
> > %y = phi i64 [ %y_increment, %x_body ], [ %resume.val, %x_body.preheader ]
> > %23 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y
> > %24 = load i8* %23, align 1, !llvm.mem.parallel_loop_access !4
> > %.lobit = lshr i8 %24, 7
> > %25 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y
> > store i8 %.lobit, i8* %25, align 1, !llvm.mem.parallel_loop_access !4
> > %y_increment = add nuw nsw i64 %y, 1
> > %y_postcondition = icmp eq i64 %y_increment, %13
> > br i1 %y_postcondition, label %y_exit.loopexit, label %x_body, !llvm.loop !5
> > 
> > 
> > y_exit.loopexit: ; preds = %x_body
> > br label %y_exit
> > 
> > 
> > y_exit: ; preds = %y_exit.loopexit, %middle.block
> > ret %u8XY* %3
> > }
> > 
> > 
> > attributes #0 = { noduplicate nounwind readonly }
> > attributes #1 = { nounwind }
> > 
> > 
> > !0 = !{i32 1, i32 -1}
> > !1 = !{!1, !2, !3}
> > !2 = !{!"llvm.loop.vectorize.width", i32 1}
> > !3 = !{!"llvm.loop.interleave.count", i32 1}
> > !4 = !{!4}
> > !5 = !{!5, !2, !3}
> > 
> 
> --
> Hal Finkel
> Assistant Computational Scientist
> Leadership Computing Facility
> Argonne National Laboratory
> 

-- 
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory
