[PATCH] SROA: Handle a case of store size being smaller than allocation size

Thu Aug 21 16:03:18 PDT 2014

I think this patch might not be the right approach. The underlying issue is
that clang translates:

union U { long m0[3]; long double m1; };

to:

%union.U = type { x86_fp80, [16 x i8] }

The x86_fp80 type is 10 bytes of data aligned to 16 bytes. Is it
permissible to copy only the lower 10 bytes of an x86_fp80? I think the
answer should be yes. This is the behavior of the underlying x87 fldt and
fstpt instructions. If copying only the lower 10 bytes of a x86_fp80 is
permissible, then the C union was not correctly lowered to LLVM IR. I think
the correct resolution is to lower the union to:

%union.U = type { [4 x i64] }

On Thu, Aug 21, 2014 at 4:37 PM, Reid Kleckner <rnk at google.com> wrote:

> Hi chandlerc,
>
> In this case, we are creating an x86_fp80 slice for a union from C where
> the padding bytes may contain real data. An x86_fp80 alloca is 16 bytes,
> and that's just fine. We can't, however, use regular loads and stores to
> access the slice, because the store size is only 10 bytes / 80 bits.
> Instead, use memcpy and memset.
>
> Fixes PR18726.
>
> http://reviews.llvm.org/D5012
>
> Files:
>   lib/Transforms/Scalar/SROA.cpp
>   test/Transforms/SROA/slice-width.ll
>
> Index: lib/Transforms/Scalar/SROA.cpp
> ===================================================================
> --- lib/Transforms/Scalar/SROA.cpp
> +++ lib/Transforms/Scalar/SROA.cpp
> @@ -2422,6 +2422,7 @@
>      if (!VecTy && !IntTy &&
>          (BeginOffset > NewAllocaBeginOffset ||
>           EndOffset < NewAllocaEndOffset ||
> +         SliceSize != DL.getTypeStoreSize(AllocaTy) ||
>           !AllocaTy->isSingleValueType() ||
>           !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
>           DL.getTypeSizeInBits(ScalarTy)%8 != 0)) {
> @@ -2544,10 +2545,11 @@
>
>      // If this doesn't map cleanly onto the alloca type, and that type
> isn't
>      // a single value type, just emit a memcpy.
> -    bool EmitMemCpy
> -      = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset ||
> -                             EndOffset < NewAllocaEndOffset ||
> -
>  !NewAI.getAllocatedType()->isSingleValueType());
> +    bool EmitMemCpy =
> +        !VecTy && !IntTy &&
> +        (BeginOffset > NewAllocaBeginOffset || EndOffset <
> NewAllocaEndOffset ||
> +         SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
> +         !NewAI.getAllocatedType()->isSingleValueType());
>
>      // If we're just going to emit a memcpy, the alloca hasn't changed,
> and the
>      // size hasn't been shrunk based on analysis of the viable range,
> this is
> Index: test/Transforms/SROA/slice-width.ll
> ===================================================================
> --- test/Transforms/SROA/slice-width.ll
> +++ test/Transforms/SROA/slice-width.ll
> @@ -1,7 +1,8 @@
>  ; RUN: opt < %s -sroa -S | FileCheck %s
> -target datalayout =
> "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
> +target datalayout =
> "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-f80:128-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
>
>  declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture,
> i32, i32, i1) nounwind
> +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)
> nounwind
>
>  define void @no_split_on_non_byte_width(i32) {
>  ; This tests that allocas are not split into slices that are not byte
> width multiple
> @@ -23,3 +24,57 @@
>    %t1 = load i1* %p1
>    ret void
>  }
> +
> +; PR18726: Check that we use memcpy and memset to fill out padding when
> we have
> +; a slice with a simple single type whose store size is smaller than the
> slice
> +; size.
> +
> +%union.Foo = type { x86_fp80, i64, i64 }
> +
> + at foo_copy_source = external constant %union.Foo
> + at i64_sink = global i64 0
> +
> +define void @memcpy_fp80_padding(%union.Foo* sret %res) {
> +  %x = alloca %union.Foo
> +
> +  ; Copy from a global.
> +  %x_i8 = bitcast %union.Foo* %x to i8*
> +  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x_i8, i8* bitcast
> (%union.Foo* @foo_copy_source to i8*), i32 32, i32 16, i1 false)
> +
> +  ; Access a slice of the alloca to trigger SROA.
> +  %mid_p = getelementptr %union.Foo* %x, i32 0, i32 1
> +  %elt = load i64* %mid_p
> +  store i64 %elt, i64* @i64_sink
> +
> +  ; Copy out.
> +  %res_i8 = bitcast %union.Foo* %res to i8*
> +  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %res_i8, i8* %x_i8, i32 32,
> i32 16, i1 false)
> +  ret void
> +}
> +; CHECK-LABEL: define void @memcpy_fp80_padding
> +; CHECK: alloca x86_fp80
> +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32
> +; CHECK: load i64* getelementptr inbounds (%union.Foo* @foo_copy_source,
> i64 0, i32 1)
> +; CHECK: load i64* getelementptr inbounds (%union.Foo* @foo_copy_source,
> i64 0, i32 2)
> +
> +define void @memset_fp80_padding(%union.Foo* sret %res) {
> +  %x = alloca %union.Foo
> +
> +  ; Set to all ones.
> +  %x_i8 = bitcast %union.Foo* %x to i8*
> +  call void @llvm.memset.p0i8.i32(i8* %x_i8, i8 -1, i32 32, i32 16, i1
> false)
> +
> +  ; Access a slice of the alloca to trigger SROA.
> +  %mid_p = getelementptr %union.Foo* %x, i32 0, i32 1
> +  %elt = load i64* %mid_p
> +  store i64 %elt, i64* @i64_sink
> +
> +  ; Copy out.
> +  %res_i8 = bitcast %union.Foo* %res to i8*
> +  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %res_i8, i8* %x_i8, i32 32,
> i32 16, i1 false)
> +  ret void
> +}
> +; CHECK-LABEL: define void @memset_fp80_padding
> +; CHECK: alloca x86_fp80
> +; CHECK: call void @llvm.memset.p0i8.i32(i8* %{{.*}}, i8 -1, i32 16, i32
> 16, i1 false)
> +; CHECK: store i64 -1, i64* @i64_sink
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140821/82dc83cc/attachment.html>