[PATCH] D114174: [ARM][CodeGen] Add support for complex deinterleaving

Fri Sep 30 03:26:15 PDT 2022

dmgreen added a comment.

Thanks for adding all the extra tests. Here is another one that is useful in itself, but more useful when we break it. It computes `a*b +90 a*c` (the sum of two complex products, with one of them rotated by 90 degrees), and seems to work well for this example.

  ; Each <4 x float> argument is split into its even lanes (0, 2) and odd
  ; lanes (1, 3), named *r and *i respectively. Two complex-style products
  ; (%a with %b, %a with %c) are formed, combined, and re-interleaved.
  define arm_aapcs_vfpcc <4 x float> @test(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
  entry:
    ; Deinterleave each operand: even lanes -> %*r, odd lanes -> %*i.
    %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
    %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
    %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
    %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
    %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
    %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>

    ; x = a * b:  xr = br*ar - bi*ai,  xi = bi*ar + br*ai
    %i6 = fmul fast <2 x float> %br, %ar
    %i7 = fmul fast <2 x float> %bi, %ai
    %xr = fsub fast <2 x float> %i6, %i7
    %i9 = fmul fast <2 x float> %bi, %ar
    %i10 = fmul fast <2 x float> %br, %ai
    %xi = fadd fast <2 x float> %i9, %i10

    ; y = a * c:  yr = cr*ar - ci*ai,  yi = ci*ar + cr*ai
    %j6 = fmul fast <2 x float> %cr, %ar
    %j7 = fmul fast <2 x float> %ci, %ai
    %yr = fsub fast <2 x float> %j6, %j7
    %j9 = fmul fast <2 x float> %ci, %ar
    %j10 = fmul fast <2 x float> %cr, %ai
    %yi = fadd fast <2 x float> %j9, %j10

    ; z = y + i*x:  zr = yr - xi,  zi = yi + xr
    %zr = fsub fast <2 x float> %yr, %xi
    %zi = fadd fast <2 x float> %yi, %xr
    ; Mask <0, 2, 1, 3> re-interleaves the halves as zr0, zi0, zr1, zi1.
    %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
    ret <4 x float> %interleaved.vec
  }

But this is a modification that alters the c*a computation to reuse some of the b*a multiplies. I don't believe it should be transformed the way it currently is.

  ; Same overall shape as a pair of complex-style products of %a with %b and
  ; %a with %c, except that the second product shares the %i6 and %i9
  ; multiplies with the first instead of computing its own.
  define arm_aapcs_vfpcc <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
  entry:
    ; Deinterleave each operand: even lanes -> %*r, odd lanes -> %*i.
    %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
    %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
    %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
    %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
    %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
    %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>

    ; x = a * b:  xr = br*ar - bi*ai,  xi = bi*ar + br*ai
    %i6 = fmul fast <2 x float> %br, %ar
    %i7 = fmul fast <2 x float> %bi, %ai
    %xr = fsub fast <2 x float> %i6, %i7
    %i9 = fmul fast <2 x float> %bi, %ar
    %i10 = fmul fast <2 x float> %br, %ai
    %xi = fadd fast <2 x float> %i9, %i10

    ; Not a full a * c: %yr uses %i6 (br*ar) where the commented-out %j6
    ; (cr*ar) would be, and %yi uses %i9 (bi*ar) where the commented-out %j9
    ; (ci*ar) would be. So yr = br*ar - ci*ai and yi = bi*ar + cr*ai.
    ;%j6 = fmul fast <2 x float> %cr, %ar
    %j7 = fmul fast <2 x float> %ci, %ai
    %yr = fsub fast <2 x float> %i6, %j7
    ;%j9 = fmul fast <2 x float> %ci, %ar
    %j10 = fmul fast <2 x float> %cr, %ai
    %yi = fadd fast <2 x float> %i9, %j10

    ; zr = yr - xi,  zi = yi + xr
    %zr = fsub fast <2 x float> %yr, %xi
    %zi = fadd fast <2 x float> %yi, %xr
    ; Mask <0, 2, 1, 3> re-interleaves the halves as zr0, zi0, zr1, zi1.
    %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
    ret <4 x float> %interleaved.vec
  }

The Incomplete nodes worry me, as they look like a rich source of bugs. If identifyPartialMul and identifyNodeWithImplicitAdd need to work together more closely for the time being, that is probably fine. We can always change it in the future if needed.

Another issue is that we need to check for multiple uses, as in something like:

  ; The intermediate pair (%2, %6) has two kinds of users here: the second
  ; multiply by %a that produces the return value, and %otheruse, which is
  ; stored to %p.
  define arm_aapcs_vfpcc <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %p) {
  ; CHECK-LABEL: mul_triangle_multiuses:
  ; CHECK:       @ %bb.0: @ %entry
  ; CHECK-NEXT:    .vsave {d14}
  ; CHECK-NEXT:    vpush {d14}
  ; CHECK-NEXT:    .vsave {d10, d11, d12}
  ; CHECK-NEXT:    vpush {d10, d11, d12}
  ; CHECK-NEXT:    .vsave {d8}
  ; CHECK-NEXT:    vpush {d8}
  ; CHECK-NEXT:    vmov q2, q0
  ; CHECK-NEXT:    vmov.f32 s16, s4
  ; CHECK-NEXT:    vmov.f32 s17, s6
  ; CHECK-NEXT:    vmov.i32 q0, #0x0
  ; CHECK-NEXT:    vmov.f32 s20, s9
  ; CHECK-NEXT:    vmov.f32 s21, s11
  ; CHECK-NEXT:    vmov.f32 s28, s5
  ; CHECK-NEXT:    vmul.f32 q3, q5, q4
  ; CHECK-NEXT:    vmov.f32 s29, s7
  ; CHECK-NEXT:    vmul.f32 q5, q7, q5
  ; CHECK-NEXT:    vmov.f32 s24, s8
  ; CHECK-NEXT:    vmov.f32 s25, s10
  ; CHECK-NEXT:    vneg.f32 q5, q5
  ; CHECK-NEXT:    vfma.f32 q3, q7, q6
  ; CHECK-NEXT:    vfma.f32 q5, q4, q6
  ; CHECK-NEXT:    vmov.f32 s22, s12
  ; CHECK-NEXT:    vmov.f32 s23, s13
  ; CHECK-NEXT:    vmov q3, q0
  ; CHECK-NEXT:    vcmla.f32 q3, q1, q2, #0
  ; CHECK-NEXT:    vstrw.32 q5, [r0]
  ; CHECK-NEXT:    vcmla.f32 q3, q1, q2, #90
  ; CHECK-NEXT:    vcmla.f32 q0, q2, q3, #0
  ; CHECK-NEXT:    vcmla.f32 q0, q2, q3, #90
  ; CHECK-NEXT:    vpop {d8}
  ; CHECK-NEXT:    vpop {d10, d11, d12}
  ; CHECK-NEXT:    vpop {d14}
  ; CHECK-NEXT:    bx lr
  entry:
    ; Deinterleave: %strided.vec / %strided.vec35 are the even / odd lanes of
    ; %a; %strided.vec37 / %strided.vec38 are the even / odd lanes of %b.
    %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
    %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
    %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
    %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
    ; %2 = b.even*a.even - b.odd*a.odd
    %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
    %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
    %2 = fsub fast <2 x float> %0, %1
    %3 = fmul fast <2 x float> %2, %strided.vec35
    ; %6 = b.odd*a.even + a.odd*b.even
    %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
    %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
    %6 = fadd fast <2 x float> %4, %5
    ; Extra use of %2 and %6: the identity mask <0, 1, 2, 3> concatenates them
    ; (no interleaving) and the result is stored to %p.
    %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    store <4 x float> %otheruse, ptr %p
    ; %8 = %2*a.odd + %6*a.even
    %7 = fmul fast <2 x float> %6, %strided.vec
    %8 = fadd fast <2 x float> %3, %7
    ; %11 = %2*a.even - %6*a.odd
    %9 = fmul fast <2 x float> %2, %strided.vec
    %10 = fmul fast <2 x float> %6, %strided.vec35
    %11 = fsub fast <2 x float> %9, %10
    ; Mask <0, 2, 1, 3> interleaves %11 and %8 lane by lane for the return value.
    %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
    ret <4 x float> %interleaved.vec
  }

We also probably need to check for fast-math flags wherever we are generating fma. I also think it would be safer if nodes were uniquely identified by their {real, imag} root pairs, not just by nodes that might contain either real or imag somewhere in them.

================
Comment at: llvm/test/CodeGen/Thumb2/complex-deinterleaving-uniform-cases.ll:1
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s
----------------
All the MVE tests have file names that start with `mve-`; please rename this one to match.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D114174/new/

https://reviews.llvm.org/D114174