[PATCH] D59129: [SROA] WIP: Lowering alloca is not always beneficial

Fri Mar 8 04:02:35 PST 2019

SjoerdMeijer created this revision.
SjoerdMeijer added reviewers: chandlerc, efriedma, RKSimon, samparker, dmgreen.

This is work in progress to show how different pieces should fit together
(with e.g. D59014 <https://reviews.llvm.org/D59014>).  I would like to teach SROA that lowering allocas is not
always beneficial.

The root cause of the inefficiency I am looking at is that some slices
of an alloca are covered by a memcpy that loads data from a character pointer.
Thus, the source operand of the memcpy gets an alignment of 1, like this:

  call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 1 %tmp.sroa.0.0..sroa_idx7, i8* align 1 %src, i32 32, i1 false)

This is fine, nothing wrong with that, but after a few SROA invocations and
alloca rewrites, the loads/stores that these memcpy's are expanded to also
end up with an alignment of 1. Depending on how much data we're
loading/storing, and whether unaligned data access is supported, this can lead to an
explosion in the number of byte load/store instructions. To illustrate the
problem, here is a not uncommon initialisation code example:

  typedef struct {
    char data[32];
  } data_t;
  typedef struct {
    long long m1;
    long long m2;
    long long m3;
  } data_2_t;
  data_2_t *getData2();
  void init(char *src) {
    data_t tmp;
    memcpy(&tmp, src, sizeof(tmp));
    data_2_t *dst = getData2();
    memcpy(&dst->m1, &tmp.data[8], 8);
    memcpy(&dst->m2, &tmp.data[16], 8);
  }

Here stack object 'tmp' gets filled in by the memcpy that reads from the
character pointer, and the subsequent memcpy's read from that stack object.
When optimising for minsize, I would like to generate this:

  init:
    push  {r7, lr}
    mov r7, sp
    sub sp, #32
    mov r1, r0
    mov r0, sp
    movs  r2, #32
    bl  __aeabi_memcpy
    bl  getData2
    add.w r12, sp, #8
    ldm.w r12, {r1, r2, r3, r12}
    stm.w r0, {r1, r2, r3, r12}
    add sp, #32
    pop {r7, pc}

The solution to achieve this relies on looking at the cost of the instructions
covered by the alloca: if one of them is an expensive instruction (for example a
memcpy of 36 bytes with 1-byte alignment), then don't rewrite the alloca.
At the moment however, we generate this monster:

  init:
    push  {r4, r5, r6, r7, lr}
    add r7, sp, #12
    push.w  {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11}
    ldrb  r1, [r0, #8]!
    str r1, [sp, #24]
    ldrb  r1, [r0, #8]!
    str r1, [sp, #16]
    ldrb  r1, [r0, #4]!
    ldrb  r2, [r0, #-8]!
    str r2, [sp, #12]
    ldrb  r2, [r0, #11]
    ldrb  r3, [r0, #10]
    ldrb  r6, [r0, #-1]
    ldrb.w  r8, [r0, #1]
    ldrb.w  r9, [r0, #3]
    ldrb.w  r10, [r0, #2]
    ldrb.w  r11, [r0, #5]
    ldrb  r4, [r0, #7]
    ldrb  r5, [r0, #6]
    orr.w r2, r3, r2, lsl #8
    ldrb  r3, [r0, #9]
    orr.w r1, r1, r3, lsl #8
    orr.w r1, r1, r2, lsl #16
    str r1, [sp, #20]
    ldrb  r1, [r0, #-3]
    str r1, [sp, #8]
    ldrb  r1, [r0, #-2]
    str r1, [sp, #4]
    bl  getData2
    ldr r2, [sp, #16]
    ldr r3, [sp, #12]
    orr.w r1, r5, r4, lsl #8
    ldr r5, [sp, #8]
    orr.w r2, r2, r11, lsl #8
    orr.w r3, r3, r8, lsl #8
    orr.w r1, r2, r1, lsl #16
    orr.w r2, r10, r9, lsl #8
    orr.w r2, r3, r2, lsl #16
    ldr r3, [sp, #24]
    orr.w r3, r3, r5, lsl #8
    ldr r5, [sp, #4]
    orr.w r6, r5, r6, lsl #8
    orr.w r3, r3, r6, lsl #16
    strd  r3, r2, [r0]
    str r1, [r0, #8]
    ldr r1, [sp, #20]
    str r1, [r0, #12]
    add sp, #28
    pop.w {r8, r9, r10, r11}
    pop {r4, r5, r6, r7, pc}

https://reviews.llvm.org/D59129

Files:
  include/llvm/Transforms/Scalar/SROA.h
  lib/Transforms/Scalar/SROA.cpp

-------------- next part --------------
A non-text attachment was scrubbed...
Name: D59129.189840.patch
Type: text/x-patch
Size: 3835 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190308/57dea206/attachment.bin>