[llvm] r240787 - [DAGCombine] fold (X >>?, exact C1) << C2 --> X << (C2-C1)

Charlie Turner charlie.turner at arm.com
Tue Jun 30 08:39:38 PDT 2015


Hi Ben,

Sorry, but I've found a regression in AArch64 code generation that I've
narrowed down to this patch. The smallest reproducer I've found so far is
the following IR:

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"

%struct.X = type { float, float }

define void @test(%struct.X* nocapture %out, i64 %out_start, i64 %step) {
entry:
  %arr = alloca [144 x %struct.X], align 4
  %0 = bitcast [144 x %struct.X]* %arr to i8*
  br label %for.body

for.body:
  %w_ix.028 = phi i64 [ 0, %entry ], [ %add10, %for.body ]
  %i.027 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %add = add i64 %i.027, %out_start
  %sext = shl i64 %w_ix.028, 32
  %idxprom = ashr exact i64 %sext, 32
  %AA = getelementptr inbounds [144 x %struct.X], [144 x %struct.X]* %arr,
i64 0, i64 %idxprom, i32 0
  %AB = load float, float* %AA, align 4
  %AC = getelementptr inbounds [144 x %struct.X], [144 x %struct.X]* %arr,
i64 0, i64 %idxprom, i32 1
  %AD = load float, float* %AC, align 4
  %add3 = fadd fast float %AD, %AB
  %add7 = add i64 %add, 8
  %a9 = getelementptr inbounds %struct.X, %struct.X* %out, i64 %add7, i32 0
  store float %add3, float* %a9, align 4
  %add10 = add i64 %idxprom, %step
  %inc = add nuw nsw i64 %i.027, 1
  br label %for.body
}

I see the following AArch64 code generated before your patch,

test:                                   // @test
// BB#0:                                // %entry
        stp     x28, x27, [sp, #-16]!
        sub     sp, sp, #1152           // =1152
        mov      x8, xzr
        add     x9, x0, x1, lsl #3
        add     x9, x9, #64             // =64
        mov      x10, sp
.LBB0_1:                                // %for.body
                                        // =>This Inner Loop Header: Depth=1
*** add     x11, x10, w8, sxtw #3 ****
        ldp      s0, s1, [x11]
        add     x8, x2, w8, sxtw
        fadd    s0, s1, s0
        str     s0, [x9], #8
        b       .LBB0_1

Whereas with your patch,

test:                                   // @test
// BB#0:                                // %entry
        stp     x28, x27, [sp, #-16]!
        sub     sp, sp, #1152           // =1152
        mov      x8, xzr
        add     x9, x0, x1, lsl #3
        add     x9, x9, #64             // =64
        mov      x10, sp
.LBB0_1:                                // %for.body
                                        // =>This Inner Loop Header: Depth=1
*** lsl     x11, x8, #32
        add     x11, x10, x11, asr #29 ***
        ldp      s0, s1, [x11]
        fadd    s0, s1, s0
        str     s0, [x9], #8
        add     x8, x2, w8, sxtw
        b       .LBB0_1

I've marked the main difference with asterisks above. Essentially, this
change is turning an

add     x11, x10, w8, sxtw #3

Into an

lsl     x11, x8, #32
add     x11, x10, x11, asr #29

If I change the IR to,

for.body:
  %w_ix.028 = phi i64 [ 0, %entry ], [ %add10, %for.body ]
  %i.027 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %add = add i64 %i.027, %out_start
  %sext = shl i64 %w_ix.028, 32
  %idxprom = ashr exact i64 %sext, 32
  %AA = getelementptr inbounds [144 x %struct.X], [144 x %struct.X]* %arr,
i64 0, i64 %idxprom, i32 0
  %AB = load float, float* %AA, align 4
  %add7 = add i64 %add, 8
  %a9 = getelementptr inbounds %struct.X, %struct.X* %out, i64 %add7, i32 0
  store float %AB, float* %a9, align 4
  %add10 = add i64 %idxprom, %step
  %inc = add nuw nsw i64 %i.027, 1
  br label %for.body
}

That is, with the second GEP removed, I see this before your patch:

        sxtw    x10, w10
        lsl     x11, x10, #3

and this after,

        lsl     x11, x8, #32
        asr     x11, x11, #29

This could be a deficiency in the AArch64 backend that your change exposes.
I'm not sure, so I've also CC'd Tim.

Would appreciate any comments you might have about this.

Thanks for your time,
Charlie.







More information about the llvm-commits mailing list