[llvm-bugs] [Bug 32384] New: [AArch64] missing lowering of memcpy

via llvm-bugs llvm-bugs at lists.llvm.org
Wed Mar 22 17:25:51 PDT 2017


            Bug ID: 32384
           Summary: [AArch64] missing lowering of memcpy
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: AArch64
          Assignee: unassignedbugs at nondot.org
          Reporter: davide at freebsd.org
                CC: efriedma at codeaurora.org, james.molloy at arm.com,
                    llvm-bugs at lists.llvm.org, matze at braunis.de,
                    t.p.northover at gmail.com

Consider the following code, reduced from SingleSource/Misc/salsa20.c in the
llvm test-suite, built with -O3 -mgeneral-regs-only -fomit-frame-pointer.

#include <stdint.h>

void tinky(uint32_t out[16],uint32_t in[16]) {
  uint32_t x[16];
  for (int i = 0; i < 16; ++i) x[i] = in[i];
  for (int i = 0; i < 16; ++i) out[i] = x[i] + in[i];

LLVM generates a call to memcpy to copy the input array onto the stack:

0000000000000000 <tinky>:
   0:   d10183ff        sub     sp, sp, #0x60
   4:   a9044ff4        stp     x20, x19, [sp,#64]
   8:   aa0003f3        mov     x19, x0
   c:   910003e0        mov     x0, sp
  10:   321a03e2        orr     w2, wzr, #0x40
  14:   a9057bfd        stp     x29, x30, [sp,#80]
  18:   910143fd        add     x29, sp, #0x50
  1c:   aa0103f4        mov     x20, x1
  20:   94000000        bl      0 <memcpy>
  24:   b94003e8        ldr     w8, [sp]
  28:   b9400289        ldr     w9, [x20]
  2c:   0b080128        add     w8, w9, w8

whereas GCC simply emits inline `ldp`/`stp` pairs:

   0:   d10103ff        sub     sp, sp, #0x40
   4:   91004024        add     x4, x1, #0x10
   8:   eb04001f        cmp     x0, x4
   c:   91004003        add     x3, x0, #0x10
  10:   a9402c2a        ldp     x10, x11, [x1]
  14:   aa000022        orr     x2, x1, x0
  18:   a9412428        ldp     x8, x9, [x1,#16]
  1c:   a9002fea        stp     x10, x11, [sp]
  20:   a9421c26        ldp     x6, x7, [x1,#32]
  24:   a90127e8        stp     x8, x9, [sp,#16]
  28:   a9431424        ldp     x4, x5, [x1,#48]
  2c:   a9021fe6        stp     x6, x7, [sp,#32]
  30:   a90317e4        stp     x4, x5, [sp,#48]
  34:   92400c42        and     x2, x2, #0xf
  38:   fa433022        ccmp    x1, x3, #0x2, cc

As the IR below shows, both the alignment and the size are constant, and the
size is relatively small (64 bytes), so I guess this call could be lowered
inline for this architecture (by providing `EmitTargetCodeForMemcpy()`).
Thoughts?

define void @tinky(i32* nocapture, i32* nocapture readonly) local_unnamed_addr
#0 {
  %3 = bitcast i32* %1 to i8*
  %4 = alloca [16 x i32], align 4
  %5 = bitcast [16 x i32]* %4 to i8*
  call void @llvm.lifetime.start(i64 64, i8* nonnull %5) #2
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %5, i8* %3, i64 64, i32 4,
i1 false)
  %6 = getelementptr inbounds [16 x i32], [16 x i32]* %4, i64 0, i64 0
  %7 = load i32, i32* %6, align 4, !tbaa !1
  %8 = load i32, i32* %1, align 4, !tbaa !1
  %9 = add i32 %8, %7

