[LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?

Sun Dec 28 14:21:51 PST 2014

----- Original Message -----
> From: "Josh Klontz" <josh.klontz at gmail.com>
> To: "Dev" <llvmdev at cs.uiuc.edu>
> Sent: Friday, December 26, 2014 8:27:43 AM
> Subject: [LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?
> 
> Using LLVM ToT and Hal's helpful slide deck [1], I've been trying to
> use `llvm.assume` to communicate pointer alignment guarantees to
> vector load and store instructions. For example, in [2] %5 and %9
> are guaranteed to be 32-byte aligned. However, if I run this IR
> through `opt -O3 -datalayout -S`, the vectorized loads and stores
> are still 1-byte aligned [3]. What's going wrong? Do I have to move
> the `llvm.assume` into the loop body?

Hi Josh,

The problem is that you're asserting an alignment fact about:
  %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0

and you want this to apply to pointers derived from this value within the loop:
  %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index

I'm pretty sure we currently only look 'up' the use/def chain for alignment facts, and so nothing triggers because %17 is derived from %0, and there is no alignment fact asserted directly on %0.

Can you please file a bug report about this (at http://llvm.org/bugs/)? I think that we can likely fix this.

 -Hal

> 
> 
> v/r,
> Josh
> 
> 
> 
> 
> 
> [1]
> http://llvm.org/devmtg/2014-10/Slides/Finkel-IntrinsicsMetadataAttributes.pdf
> 
> 
> [2]
> ; ModuleID = 'align.ll'
> 
> 
> %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] }
> 
> 
> ; Function Attrs: noduplicate nounwind readonly
> declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32
> zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0
> 
> 
> ; Function Attrs: nounwind
> declare void @llvm.assume(i1) #1
> 
> 
> ; Function Attrs: nounwind
> define %u8XY* @benchmark(%u8XY*) #1 {
> entry:
> %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3
> %columns = load i32* %1, align 4, !range !0
> %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4
> %rows = load i32* %2, align 4, !range !0
> %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32
> %rows, i32 1, i8* null)
> %4 = zext i32 %rows to i64
> %dst_y_step = zext i32 %columns to i64
> %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
> %6 = ptrtoint i8* %5 to i64
> %7 = and i64 %6, 31
> %8 = icmp eq i64 %7, 0
> tail call void @llvm.assume(i1 %8)
> %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
> %10 = ptrtoint i8* %9 to i64
> %11 = and i64 %10, 31
> %12 = icmp eq i64 %11, 0
> tail call void @llvm.assume(i1 %12)
> %13 = mul nuw nsw i64 %4, %dst_y_step
> br label %x_body
> 
> 
> x_body: ; preds = %x_body, %entry
> %y = phi i64 [ 0, %entry ], [ %y_increment, %x_body ]
> %14 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y
> %15 = load i8* %14, align 1, !llvm.mem.parallel_loop_access !1
> %.lobit = lshr i8 %15, 7
> %16 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y
> store i8 %.lobit, i8* %16, align 1, !llvm.mem.parallel_loop_access !1
> %y_increment = add nuw nsw i64 %y, 1
> %y_postcondition = icmp eq i64 %y_increment, %13
> br i1 %y_postcondition, label %y_exit, label %x_body, !llvm.loop !2
> 
> 
> y_exit: ; preds = %x_body
> ret %u8XY* %3
> }
> 
> 
> attributes #0 = { noduplicate nounwind readonly }
> attributes #1 = { nounwind }
> 
> 
> !0 = !{i32 1, i32 -1}
> !1 = !{!1}
> !2 = !{!2}
> 
> 
> [3]
> ; ModuleID = 'align.ll'
> 
> 
> %u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] }
> 
> 
> ; Function Attrs: noduplicate nounwind readonly
> declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32
> zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0
> 
> 
> ; Function Attrs: nounwind
> declare void @llvm.assume(i1) #1
> 
> 
> ; Function Attrs: nounwind
> define %u8XY* @benchmark(%u8XY*) #1 {
> entry:
> %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3
> %columns = load i32* %1, align 4, !range !0
> %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4
> %rows = load i32* %2, align 4, !range !0
> %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32
> %rows, i32 1, i8* null)
> %4 = zext i32 %rows to i64
> %dst_y_step = zext i32 %columns to i64
> %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
> %6 = ptrtoint i8* %5 to i64
> %7 = and i64 %6, 31
> %8 = icmp eq i64 %7, 0
> tail call void @llvm.assume(i1 %8)
> %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
> %10 = ptrtoint i8* %9 to i64
> %11 = and i64 %10, 31
> %12 = icmp eq i64 %11, 0
> tail call void @llvm.assume(i1 %12)
> %13 = mul nuw nsw i64 %4, %dst_y_step
> %14 = zext i32 %rows to i64
> %15 = zext i32 %columns to i64
> %16 = mul nuw i64 %14, %15
> %n.vec = and i64 %16, -4
> %cmp.zero = icmp eq i64 %n.vec, 0
> br i1 %cmp.zero, label %middle.block, label %vector.body.preheader
> 
> 
> vector.body.preheader: ; preds = %entry
> br label %vector.body
> 
> 
> vector.body: ; preds = %vector.body.preheader, %vector.body
> %index = phi i64 [ %index.next, %vector.body ], [ 0,
> %vector.body.preheader ]
> %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index
> %18 = bitcast i8* %17 to <4 x i8>*
> %wide.load = load <4 x i8>* %18, align 1
> %19 = lshr <4 x i8> %wide.load, <i8 7, i8 7, i8 7, i8 7>
> %20 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %index
> %21 = bitcast i8* %20 to <4 x i8>*
> store <4 x i8> %19, <4 x i8>* %21, align 1
> %index.next = add i64 %index, 4
> %22 = icmp eq i64 %index.next, %n.vec
> br i1 %22, label %middle.block.loopexit, label %vector.body,
> !llvm.loop !1
> 
> 
> middle.block.loopexit: ; preds = %vector.body
> br label %middle.block
> 
> 
> middle.block: ; preds = %middle.block.loopexit, %entry
> %resume.val = phi i64 [ 0, %entry ], [ %n.vec, %middle.block.loopexit
> ]
> %cmp.n = icmp eq i64 %16, %resume.val
> br i1 %cmp.n, label %y_exit, label %x_body.preheader
> 
> 
> x_body.preheader: ; preds = %middle.block
> br label %x_body
> 
> 
> x_body: ; preds = %x_body.preheader, %x_body
> %y = phi i64 [ %y_increment, %x_body ], [ %resume.val,
> %x_body.preheader ]
> %23 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y
> %24 = load i8* %23, align 1, !llvm.mem.parallel_loop_access !4
> %.lobit = lshr i8 %24, 7
> %25 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y
> store i8 %.lobit, i8* %25, align 1, !llvm.mem.parallel_loop_access !4
> %y_increment = add nuw nsw i64 %y, 1
> %y_postcondition = icmp eq i64 %y_increment, %13
> br i1 %y_postcondition, label %y_exit.loopexit, label %x_body,
> !llvm.loop !5
> 
> 
> y_exit.loopexit: ; preds = %x_body
> br label %y_exit
> 
> 
> y_exit: ; preds = %y_exit.loopexit, %middle.block
> ret %u8XY* %3
> }
> 
> 
> attributes #0 = { noduplicate nounwind readonly }
> attributes #1 = { nounwind }
> 
> 
> !0 = !{i32 1, i32 -1}
> !1 = !{!1, !2, !3}
> !2 = !{!"llvm.loop.vectorize.width", i32 1}
> !3 = !{!"llvm.loop.interleave.count", i32 1}
> !4 = !{!4}
> !5 = !{!5, !2, !3}
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
> 

-- 
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory