[LLVMdev] Correct usage of `llvm.assume` for loop vectorization alignment?

Fri Dec 26 06:27:43 PST 2014

Using LLVM ToT and Hal's helpful slide deck [1], I've been trying to use
`llvm.assume` to communicate pointer alignment guarantees to vector load
and store instructions. For example, in [2] %5 and %9 are guaranteed to be
32-byte aligned. However, if I run this IR through `opt -O3 -datalayout
-S`, the vectorized loads and stores are still 1-byte aligned [3]. What's
going wrong? Do I have to move the `llvm.assume` into the loop body?

v/r,
Josh


[1]
http://llvm.org/devmtg/2014-10/Slides/Finkel-IntrinsicsMetadataAttributes.pdf

[2]
; ModuleID = 'align.ll'

%u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] }

; Function Attrs: noduplicate nounwind readonly
declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 zeroext,
i32 zeroext, i32 zeroext, i8* noalias nocapture) #0

; Function Attrs: nounwind
declare void @llvm.assume(i1) #1

; Function Attrs: nounwind
; @benchmark (input IR): loads the i32 fields at struct indices 3 and 4 of the
; argument (%columns, %rows), calls @likely_new to obtain %3, states through
; llvm.assume that the addresses of field 6 (the trailing [0 x i8]) of both %3
; and %0 are multiples of 32, then runs a byte-wise loop that stores
; (src[y] >> 7) into %3's field 6 until the incremented index equals
; %rows * %columns.
define %u8XY* @benchmark(%u8XY*) #1 {
entry:
  ; Read the two i32 fields (struct indices 3 and 4) from the argument.
  %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3
  %columns = load i32* %1, align 4, !range !0
  %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4
  %rows = load i32* %2, align 4, !range !0
  ; Obtain the destination struct; %columns and %rows are forwarded as the
  ; third and fourth arguments.
  %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32
%rows, i32 1, i8* null)
  %4 = zext i32 %rows to i64
  %dst_y_step = zext i32 %columns to i64
  ; Alignment assumption for the destination: %5 is the address of element 0
  ; of field 6 in %3; (address & 31) == 0 means it is a multiple of 32.
  %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
  %6 = ptrtoint i8* %5 to i64
  %7 = and i64 %6, 31
  %8 = icmp eq i64 %7, 0
  tail call void @llvm.assume(i1 %8)
  ; Same assumption for the source: element 0 of field 6 in the argument %0.
  %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
  %10 = ptrtoint i8* %9 to i64
  %11 = and i64 %10, 31
  %12 = icmp eq i64 %11, 0
  tail call void @llvm.assume(i1 %12)
  ; Loop bound: %rows * %columns, both zero-extended to i64.
  %13 = mul nuw nsw i64 %4, %dst_y_step
  br label %x_body

x_body:                                           ; preds = %x_body, %entry
  ; %y is the byte index, starting at 0 and incremented by 1 per iteration.
  %y = phi i64 [ 0, %entry ], [ %y_increment, %x_body ]
  ; The loop's accesses go through their own GEPs (%14, %16) indexed by %y;
  ; these are distinct values from the assumed pointers %9 and %5 above, and
  ; both the load and the store are written with align 1.
  %14 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y
  %15 = load i8* %14, align 1, !llvm.mem.parallel_loop_access !1
  ; Logical shift right by 7 leaves only the top bit of the source byte.
  %.lobit = lshr i8 %15, 7
  %16 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y
  store i8 %.lobit, i8* %16, align 1, !llvm.mem.parallel_loop_access !1
  %y_increment = add nuw nsw i64 %y, 1
  ; Bottom-tested loop: exit once the incremented index reaches %13.
  %y_postcondition = icmp eq i64 %y_increment, %13
  br i1 %y_postcondition, label %y_exit, label %x_body, !llvm.loop !2

y_exit:                                           ; preds = %x_body
  ret %u8XY* %3
}

attributes #0 = { noduplicate nounwind readonly }
attributes #1 = { nounwind }

!0 = !{i32 1, i32 -1}
!1 = !{!1}
!2 = !{!2}

[3]
; ModuleID = 'align.ll'

%u8XY = type { i32, i32, i32, i32, i32, i32, [0 x i8] }

; Function Attrs: noduplicate nounwind readonly
declare noalias %u8XY* @likely_new(i32 zeroext, i32 zeroext, i32 zeroext,
i32 zeroext, i32 zeroext, i8* noalias nocapture) #0

; Function Attrs: nounwind
declare void @llvm.assume(i1) #1

; Function Attrs: nounwind
; @benchmark (output of opt, quoted verbatim): the entry block still contains
; both llvm.assume calls, followed by a 4-byte-wide vector loop
; (vector.body) and the original byte-wise loop (x_body) kept as the
; remainder loop. The <4 x i8> load and store in vector.body are still marked
; align 1, which is the behaviour the post asks about.
define %u8XY* @benchmark(%u8XY*) #1 {
entry:
  %1 = getelementptr inbounds %u8XY* %0, i64 0, i32 3
  %columns = load i32* %1, align 4, !range !0
  %2 = getelementptr inbounds %u8XY* %0, i64 0, i32 4
  %rows = load i32* %2, align 4, !range !0
  %3 = tail call %u8XY* @likely_new(i32 24584, i32 1, i32 %columns, i32
%rows, i32 1, i8* null)
  %4 = zext i32 %rows to i64
  %dst_y_step = zext i32 %columns to i64
  ; Assumption: address of element 0 of field 6 in %3 is a multiple of 32.
  %5 = getelementptr inbounds %u8XY* %3, i64 0, i32 6, i64 0
  %6 = ptrtoint i8* %5 to i64
  %7 = and i64 %6, 31
  %8 = icmp eq i64 %7, 0
  tail call void @llvm.assume(i1 %8)
  ; Assumption: address of element 0 of field 6 in %0 is a multiple of 32.
  %9 = getelementptr inbounds %u8XY* %0, i64 0, i32 6, i64 0
  %10 = ptrtoint i8* %9 to i64
  %11 = and i64 %10, 31
  %12 = icmp eq i64 %11, 0
  tail call void @llvm.assume(i1 %12)
  ; %13 is the bound used by the scalar loop's exit test.
  %13 = mul nuw nsw i64 %4, %dst_y_step
  ; %16 recomputes the same product from fresh zexts of %rows and %columns;
  ; it is the bound used by the vector-loop bookkeeping below.
  %14 = zext i32 %rows to i64
  %15 = zext i32 %columns to i64
  %16 = mul nuw i64 %14, %15
  ; Masking with -4 clears the low two bits: %n.vec is %16 rounded down to a
  ; multiple of 4, i.e. the number of iterations handled by the vector loop.
  %n.vec = and i64 %16, -4
  ; If fewer than 4 iterations exist, skip the vector loop entirely.
  %cmp.zero = icmp eq i64 %n.vec, 0
  br i1 %cmp.zero, label %middle.block, label %vector.body.preheader

vector.body.preheader:                            ; preds = %entry
  br label %vector.body

vector.body:                                      ; preds =
%vector.body.preheader, %vector.body
  ; %index advances by 4 per iteration, from 0 up to %n.vec.
  %index = phi i64 [ %index.next, %vector.body ], [ 0,
%vector.body.preheader ]
  ; Wide load of 4 source bytes; note it is emitted with align 1.
  %17 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %index
  %18 = bitcast i8* %17 to <4 x i8>*
  %wide.load = load <4 x i8>* %18, align 1
  ; Per-lane logical shift right by 7, the vector form of the scalar lshr.
  %19 = lshr <4 x i8> %wide.load, <i8 7, i8 7, i8 7, i8 7>
  ; Wide store of 4 destination bytes; also emitted with align 1.
  %20 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %index
  %21 = bitcast i8* %20 to <4 x i8>*
  store <4 x i8> %19, <4 x i8>* %21, align 1
  %index.next = add i64 %index, 4
  %22 = icmp eq i64 %index.next, %n.vec
  br i1 %22, label %middle.block.loopexit, label %vector.body, !llvm.loop !1

middle.block.loopexit:                            ; preds = %vector.body
  br label %middle.block

middle.block:                                     ; preds =
%middle.block.loopexit, %entry
  ; %resume.val is the index at which the scalar loop must continue: 0 when
  ; the vector loop was skipped, %n.vec when it ran.
  %resume.val = phi i64 [ 0, %entry ], [ %n.vec, %middle.block.loopexit ]
  ; If the vector loop already covered every iteration, return directly;
  ; otherwise fall through to the scalar remainder loop.
  %cmp.n = icmp eq i64 %16, %resume.val
  br i1 %cmp.n, label %y_exit, label %x_body.preheader

x_body.preheader:                                 ; preds = %middle.block
  br label %x_body

x_body:                                           ; preds =
%x_body.preheader, %x_body
  ; Scalar remainder loop: same body as the input loop, starting at
  ; %resume.val instead of 0.
  %y = phi i64 [ %y_increment, %x_body ], [ %resume.val, %x_body.preheader ]
  %23 = getelementptr %u8XY* %0, i64 0, i32 6, i64 %y
  %24 = load i8* %23, align 1, !llvm.mem.parallel_loop_access !4
  %.lobit = lshr i8 %24, 7
  %25 = getelementptr %u8XY* %3, i64 0, i32 6, i64 %y
  store i8 %.lobit, i8* %25, align 1, !llvm.mem.parallel_loop_access !4
  %y_increment = add nuw nsw i64 %y, 1
  %y_postcondition = icmp eq i64 %y_increment, %13
  br i1 %y_postcondition, label %y_exit.loopexit, label %x_body, !llvm.loop
!5

y_exit.loopexit:                                  ; preds = %x_body
  br label %y_exit

y_exit:                                           ; preds =
%y_exit.loopexit, %middle.block
  ret %u8XY* %3
}

attributes #0 = { noduplicate nounwind readonly }
attributes #1 = { nounwind }

!0 = !{i32 1, i32 -1}
!1 = !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 1}
!3 = !{!"llvm.loop.interleave.count", i32 1}
!4 = !{!4}
!5 = !{!5, !2, !3}
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20141226/645710fa/attachment.html>