[llvm] r180196 - LoopVectorize: Scalarize padded types

Wed Apr 24 09:16:01 PDT 2013

Author: arnolds
Date: Wed Apr 24 11:16:01 2013
New Revision: 180196

URL: http://llvm.org/viewvc/llvm-project?rev=180196&view=rev
Log:
LoopVectorize: Scalarize padded types

This patch disables memory-instruction vectorization for types that need padding
bytes, e.g., x86_fp80 has 10 bytes store size with 6 bytes padding in darwin on
x86_64. Because the load/store vectorization is performed by the bit casting to
a packed vector, which has incompatible memory layout due to the lack of padding
bytes, the present vectorizer produces inconsistent result for memory
instructions of those types.
This patch checks an equality of the AllocSize of a scalar type and allocated
size for each vector element, to ensure that there is no padding bytes and the
array can be read/written using vector operations.

Patch by Daisuke Takahashi!

Fixes PR15758.

Added:
    llvm/trunk/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=180196&r1=180195&r2=180196&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Wed Apr 24 11:16:01 2013
@@ -956,6 +956,12 @@ void InnerLoopVectorizer::vectorizeMemor
   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
 
+  unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
+  unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+
+  if (ScalarAllocatedSize != VectorElementSize)
+    return scalarizeInstruction(Instr);
+
   // If the pointer is loop invariant or if it is non consecutive,
   // scalarize the load.
   int Stride = Legal->isConsecutivePtr(Ptr);
@@ -3558,7 +3564,9 @@ LoopVectorizationCostModel::getInstructi
     // Scalarized loads/stores.
     int Stride = Legal->isConsecutivePtr(Ptr);
     bool Reverse = Stride < 0;
-    if (0 == Stride) {
+    unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
+    unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
+    if (0 == Stride || ScalarAllocatedSize != VectorElementSize) {
       unsigned Cost = 0;
       // The cost of extracting from the value vector and pointer vector.
       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

Added: llvm/trunk/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll?rev=180196&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll Wed Apr 24 11:16:01 2013
@@ -0,0 +1,29 @@
+; RUN: opt -O3 -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+ at x = common global [1024 x x86_fp80] zeroinitializer, align 16
+
+;CHECK: @example
+;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>*
+;CHECK: store
+;CHECK: ret void
+
+define void @example() nounwind ssp uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %conv = sitofp i32 1 to x86_fp80
+  %arrayidx = getelementptr inbounds [1024 x x86_fp80]* @x, i64 0, i64 %indvars.iv
+  store x86_fp80 %conv, x86_fp80* %arrayidx, align 16
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}