[llvm] r186088 - LoopVectorize: Vectorize all accesses in address space zero with unit stride

Arnold Schwaighofer aschwaighofer at apple.com
Thu Jul 11 09:01:39 PDT 2013


On Jul 11, 2013, at 10:28 AM, David Blaikie <dblaikie at gmail.com> wrote:

> On Thu, Jul 11, 2013 at 8:21 AM, Arnold Schwaighofer
> <aschwaighofer at apple.com> wrote:
>> Author: arnolds
>> Date: Thu Jul 11 10:21:55 2013
>> New Revision: 186088
>> 
>> URL: http://llvm.org/viewvc/llvm-project?rev=186088&view=rev
>> Log:
>> LoopVectorize: Vectorize all accesses in address space zero with unit stride
> 
> Naively (since I know very little about LLVM optimization details,
> mostly working up in Clang): do you need to limit this to unit stride?
> Any object that would include address zero would be invalid, no? (I'm
> not sure whether vectorization can have holes (eg: elements of size 1
> but stride 2),

In principle the vectorizer can vectorize this with a gather/scatter. In many cases the cost model will tell it not to.

In this context the “stride” already has the element size factored in.

> if so you might need to avoid those - but if it has
> stride 2 and size 2

This is a unit stride. The unit stride I am referring to already takes the element size into account.

> and crosses zero even if zero isn't one of the
> element addresses (instead it's the address of the second byte of an
> element) should be eligible for the same optimization as size 1 stride
> 1, no?

Yes, you are right. The code already does that.

for i in ..: a[i] is a unit-stride access,
while
for i in ..: a[2*i] is a non-unit-stride access ("stride 2"),

irrespective of the type of a.



> 
>> 
>> We can vectorize them because in the case where we wrap in the address space the
>> unvectorized code would have had to access a pointer value of zero which is
>> undefined behavior in address space zero according to the LLVM IR semantics.
>> (Thank you Duncan, for pointing this out to me).
>> 
>> Fixes PR16592.
>> 
>> Added:
>>    llvm/trunk/test/Transforms/LoopVectorize/safegep.ll
>> Modified:
>>    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>> 
>> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=186088&r1=186087&r2=186088&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
>> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Thu Jul 11 10:21:55 2013
>> @@ -3223,11 +3223,12 @@ static bool isInBoundsGep(Value *Ptr) {
>> /// \brief Check whether the access through \p Ptr has a constant stride.
>> static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
>>                         const Loop *Lp) {
>> -  const Type *PtrTy = Ptr->getType();
>> -  assert(PtrTy->isPointerTy() && "Unexpected non ptr");
>> +  const Type *Ty = Ptr->getType();
>> +  assert(Ty->isPointerTy() && "Unexpected non ptr");
>> 
>>   // Make sure that the pointer does not point to aggregate types.
>> -  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) {
>> +  const PointerType *PtrTy = cast<PointerType>(Ty);
>> +  if (PtrTy->getElementType()->isAggregateType()) {
>>     DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr
>>           << "\n");
>>     return 0;
>> @@ -3248,11 +3249,16 @@ static int isStridedPtr(ScalarEvolution
>>   }
>> 
>>   // The address calculation must not wrap. Otherwise, a dependence could be
>> -  // inverted. An inbounds getelementptr that is a AddRec with a unit stride
>> +  // inverted.
>> +  // An inbounds getelementptr that is a AddRec with a unit stride
>>   // cannot wrap per definition. The unit stride requirement is checked later.
>> +  // An getelementptr without an inbounds attribute and unit stride would have
>> +  // to access the pointer value "0" which is undefined behavior in address
>> +  // space 0, therefore we can also vectorize this case.
>>   bool IsInBoundsGEP = isInBoundsGep(Ptr);
>>   bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
>> -  if (!IsNoWrapAddRec && !IsInBoundsGEP) {
>> +  bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
>> +  if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
>>     DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
>>           << *Ptr << " SCEV: " << *PtrScev << "\n");
>>     return 0;
>> @@ -3269,7 +3275,7 @@ static int isStridedPtr(ScalarEvolution
>>     return 0;
>>   }
>> 
>> -  int64_t Size = DL->getTypeAllocSize(PtrTy->getPointerElementType());
>> +  int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
>>   const APInt &APStepVal = C->getValue()->getValue();
>> 
>>   // Huge step value - give up.
>> @@ -3285,8 +3291,10 @@ static int isStridedPtr(ScalarEvolution
>>     return 0;
>> 
>>   // If the SCEV could wrap but we have an inbounds gep with a unit stride we
>> -  // know we can't "wrap around the address space".
>> -  if (!IsNoWrapAddRec && IsInBoundsGEP && Stride != 1 && Stride != -1)
>> +  // know we can't "wrap around the address space". In case of address space
>> +  // zero we know that this won't happen without triggering undefined behavior.
>> +  if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
>> +      Stride != 1 && Stride != -1)
>>     return 0;
>> 
>>   return Stride;
>> 
>> Added: llvm/trunk/test/Transforms/LoopVectorize/safegep.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/safegep.ll?rev=186088&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopVectorize/safegep.ll (added)
>> +++ llvm/trunk/test/Transforms/LoopVectorize/safegep.ll Thu Jul 11 10:21:55 2013
>> @@ -0,0 +1,61 @@
>> +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-unroll=1  < %s |  FileCheck %s
>> +target datalayout = "e-p:32:32:32-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
>> +
>> +
>> +; We can vectorize this code because if the address computation would wrap then
>> +; a load from 0 would take place which is undefined behaviour in address space 0
>> +; according to LLVM IR semantics.
>> +
>> +; PR16592
>> +
>> +; CHECK: safe
>> +; CHECK: <4 x float>
>> +
>> +define void @safe(float* %A, float* %B, float %K) {
>> +entry:
>> +  br label %"<bb 3>"
>> +
>> +"<bb 3>":
>> +  %i_15 = phi i32 [ 0, %entry ], [ %i_19, %"<bb 3>" ]
>> +  %pp3 = getelementptr float* %A, i32 %i_15
>> +  %D.1396_10 = load float* %pp3, align 4
>> +  %pp24 = getelementptr float* %B, i32 %i_15
>> +  %D.1398_15 = load float* %pp24, align 4
>> +  %D.1399_17 = fadd float %D.1398_15, %K
>> +  %D.1400_18 = fmul float %D.1396_10, %D.1399_17
>> +  store float %D.1400_18, float* %pp3, align 4
>> +  %i_19 = add nsw i32 %i_15, 1
>> +  %exitcond = icmp ne i32 %i_19, 64
>> +  br i1 %exitcond, label %"<bb 3>", label %return
>> +
>> +return:
>> +  ret void
>> +}
>> +
>> +; In a non-default address space we don't have this rule.
>> +
>> +; CHECK: notsafe
>> +; CHECK-NOT: <4 x float>
>> +
>> +define void @notsafe(float addrspace(5) * %A, float* %B, float %K) {
>> +entry:
>> +  br label %"<bb 3>"
>> +
>> +"<bb 3>":
>> +  %i_15 = phi i32 [ 0, %entry ], [ %i_19, %"<bb 3>" ]
>> +  %pp3 = getelementptr float addrspace(5) * %A, i32 %i_15
>> +  %D.1396_10 = load float addrspace(5) * %pp3, align 4
>> +  %pp24 = getelementptr float* %B, i32 %i_15
>> +  %D.1398_15 = load float* %pp24, align 4
>> +  %D.1399_17 = fadd float %D.1398_15, %K
>> +  %D.1400_18 = fmul float %D.1396_10, %D.1399_17
>> +  store float %D.1400_18, float addrspace(5) * %pp3, align 4
>> +  %i_19 = add nsw i32 %i_15, 1
>> +  %exitcond = icmp ne i32 %i_19, 64
>> +  br i1 %exitcond, label %"<bb 3>", label %return
>> +
>> +return:
>> +  ret void
>> +}
>> +
>> +
>> 
>> 
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
