[LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?
Josh Klontz
josh.klontz at gmail.com
Fri Dec 26 06:27:43 PST 2014
Using LLVM ToT and Hal's helpful slide deck [1], I've been trying to use
`llvm.assume` to communicate pointer alignment guarantees to vector load
and store instructions. For example, in [2] %5 and %9 are guaranteed to be
32-byte aligned. However, if I run this IR through `opt -O3 -datalayout
-S`, the vectorized loads and stores are still 1-byte aligned [3]. What's
going wrong? Do I have to move the `llvm.assume` into the loop body?
v/r,
Josh
[1]
http://llvm.org/devmtg/2014-10/Slides/Finkel-IntrinsicsMetadataAttributes.pdf
[2]
; ModuleID = 'align.ll'
; Input listing: one byte-wise loop over the trailing [0 x i8] payload of two
; %u8XY buffers, preceded by llvm.assume calls stating that both payload base
; addresses have their low five bits clear (i.e. are 32-byte aligned).
; Header of six i32 fields followed by a variable-length byte payload (field 6).
%u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] }
; Function Attrs: noduplicate nounwind readonly
; External routine that yields the destination buffer; its definition is not
; part of this listing (presumably an allocator -- confirm against the runtime).
declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 zeroext,
i32 zeroext, i32 zeroext, i8* noalias nocapture) #0
; Function Attrs: nounwind
declare void @llvm.assume(i1) #1
; Function Attrs: nounwind
; Reads fields 3 and 4 of the argument (%columns, %rows), obtains a result
; buffer %3 from @likely_new, then for every index in [0, rows * columns)
; stores (src_payload[i] lshr 7) into dst_payload[i]. Returns %3.
define %u8XY* @benchmark(%u8XY*) #1 {
entry:
; Loop bounds come from header fields 3 and 4; !range !0 restricts both loads
; to the half-open range [1, -1), so neither can be zero.
%1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3
%columns = load i32* %1, align 4, !range !0
%2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4
%rows = load i32* %2, align 4, !range !0
%3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32
%rows, i32 1, i8* null)
%4 = zext i32 %rows to i64
%dst_y_step = zext i32 %columns to i64
; Alignment assumption for the destination payload: (address & 31) == 0.
%5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
%6 = ptrtoint i8* %5 to i64
%7 = and i64 %6, 31
%8 = icmp eq i64 %7, 0
tail call void @llvm.assume(i1 %8)
; Same alignment assumption for the source payload (the function argument).
%9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
%10 = ptrtoint i8* %9 to i64
%11 = and i64 %10, 31
%12 = icmp eq i64 %11, 0
tail call void @llvm.assume(i1 %12)
; Trip count: rows * columns, computed in 64 bits.
%13 = mul nuw nsw i64 %4, %dst_y_step
br label %x_body
x_body: ; preds = %x_body, %entry
; Bottom-tested loop over %y = 0 .. %13 - 1. The payload GEPs inside the loop
; are recomputed from %0 / %3 (without `inbounds`) rather than derived from the
; assumed pointers %5 / %9, and both accesses carry `align 1`.
%y = phi i64 [ 0, %entry ], [ %y_increment, %x_body ]
%14 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y
%15 = load i8* %14, align 1, !llvm.mem.parallel_loop_access !1
; Logical shift right by 7 leaves only the top bit of the source byte.
%.lobit = lshr i8 %15, 7
%16 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y
store i8 %.lobit, i8* %16, align 1, !llvm.mem.parallel_loop_access !1
%y_increment = add nuw nsw i64 %y, 1
%y_postcondition = icmp eq i64 %y_increment, %13
br i1 %y_postcondition, label %y_exit, label %x_body, !llvm.loop !2
y_exit: ; preds = %x_body
ret %u8XY* %3
}
attributes #0 = { noduplicate nounwind readonly }
attributes #1 = { nounwind }
; !0: range operand used by the two header loads above.
!0 = !{i32 1, i32 -1}
; !1: self-referential node used as the parallel_loop_access tag.
; !2: self-referential node used as the loop identifier on the back-edge.
; NOTE(review): the memory accesses are tagged with !1 while the loop itself is
; identified by !2; the LangRef describes parallel_loop_access as naming the
; loop's own identifier -- confirm whether this mismatch is intended.
!1 = !{!1}
!2 = !{!2}
[3]
; ModuleID = 'align.ll'
; Output listing: the module after optimization. The original loop has been
; split into a 4-wide vector loop (vector.body) plus the scalar loop (x_body)
; for leftover iterations. The <4 x i8> load and store in vector.body are still
; `align 1`, which is the behaviour the message asks about.
; Header of six i32 fields followed by a variable-length byte payload (field 6).
%u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] }
; Function Attrs: noduplicate nounwind readonly
; External routine that yields the destination buffer; its definition is not
; part of this listing.
declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i32 zeroext, i8* noalias nocapture) #0
; Function Attrs: nounwind
declare void @llvm.assume(i1) #1
; Function Attrs: nounwind
; Same contract as the input: for every index in [0, rows * columns) store
; (src_payload[i] lshr 7) into dst_payload[i] and return the buffer %3.
define %u8XY* @benchmark(%u8XY*) #1 {
entry:
; Loop bounds from header fields 3 and 4; !range !0 is the range [1, -1).
%1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3
%columns = load i32* %1, align 4, !range !0
%2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4
%rows = load i32* %2, align 4, !range !0
%3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32 %rows, i32 1, i8* null)
%4 = zext i32 %rows to i64
%dst_y_step = zext i32 %columns to i64
; Alignment assumption for the destination payload: (address & 31) == 0.
%5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
%6 = ptrtoint i8* %5 to i64
%7 = and i64 %6, 31
%8 = icmp eq i64 %7, 0
tail call void @llvm.assume(i1 %8)
; Same alignment assumption for the source payload (the function argument).
%9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
%10 = ptrtoint i8* %9 to i64
%11 = and i64 %10, 31
%12 = icmp eq i64 %11, 0
tail call void @llvm.assume(i1 %12)
; %13 is the scalar loop's trip count; %14..%16 recompute the same product
; (from the same operands, without nsw) for the vector loop.
%13 = mul nuw nsw i64 %4, %dst_y_step
%14 = zext i32 %rows to i64
%15 = zext i32 %columns to i64
%16 = mul nuw i64 %14, %15
; Round the trip count down to a multiple of 4; skip the vector loop if zero.
%n.vec = and i64 %16, -4
%cmp.zero = icmp eq i64 %n.vec, 0
br i1 %cmp.zero, label %middle.block, label %vector.body.preheader
vector.body.preheader: ; preds = %entry
br label %vector.body
vector.body: ; preds = %vector.body.preheader, %vector.body
; Four bytes per iteration. The addresses are again rebuilt from %0 / %3 and
; the wide load/store keep `align 1` despite the assumptions in %entry.
%index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ]
%17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index
%18 = bitcast i8* %17 to <4 x i8>*
%wide.load = load <4 x i8>* %18, align 1
%19 = lshr <4 x i8> %wide.load, <i8 7, i8 7, i8 7, i8 7>
%20 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %index
%21 = bitcast i8* %20 to <4 x i8>*
store <4 x i8> %19, <4 x i8>* %21, align 1
%index.next = add i64 %index, 4
%22 = icmp eq i64 %index.next, %n.vec
br i1 %22, label %middle.block.loopexit, label %vector.body, !llvm.loop !1
middle.block.loopexit: ; preds = %vector.body
br label %middle.block
middle.block: ; preds = %middle.block.loopexit, %entry
; %resume.val is the first index not yet processed; if it already equals the
; full trip count there is nothing left for the scalar loop.
%resume.val = phi i64 [ 0, %entry ], [ %n.vec, %middle.block.loopexit ]
%cmp.n = icmp eq i64 %16, %resume.val
br i1 %cmp.n, label %y_exit, label %x_body.preheader
x_body.preheader: ; preds = %middle.block
br label %x_body
x_body: ; preds = %x_body.preheader, %x_body
; Scalar loop: the original byte-at-a-time body, resuming at %resume.val.
%y = phi i64 [ %y_increment, %x_body ], [ %resume.val, %x_body.preheader ]
%23 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y
%24 = load i8* %23, align 1, !llvm.mem.parallel_loop_access !4
%.lobit = lshr i8 %24, 7
%25 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y
store i8 %.lobit, i8* %25, align 1, !llvm.mem.parallel_loop_access !4
%y_increment = add nuw nsw i64 %y, 1
%y_postcondition = icmp eq i64 %y_increment, %13
br i1 %y_postcondition, label %y_exit.loopexit, label %x_body, !llvm.loop !5
y_exit.loopexit: ; preds = %x_body
br label %y_exit
y_exit: ; preds = %y_exit.loopexit, %middle.block
ret %u8XY* %3
}
attributes #0 = { noduplicate nounwind readonly }
attributes #1 = { nounwind }
; !0: range operand used by the two header loads.
!0 = !{i32 1, i32 -1}
; !1 / !5: loop identifiers of vector.body and x_body. Both carry !2 and !3,
; i.e. vectorize width 1 and interleave count 1 (presumably attached by the
; vectorizer so the loops are not vectorized again -- confirm).
!1 = !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 1}
!3 = !{!"llvm.loop.interleave.count", i32 1}
; !4: parallel_loop_access tag on the scalar loop's load and store.
!4 = !{!4}
!5 = !{!5, !2, !3}
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20141226/645710fa/attachment.html>
More information about the llvm-dev mailing list