[PATCH] Add support to recognize non SIMD kind of parallelism in SLPVectorizer

Tue Jun 17 08:59:16 PDT 2014

================
Comment at: lib/Transforms/Vectorize/SLPVectorizer.cpp:1259
@@ -1186,1 +1258,3 @@
+      return VecCost - ScalarCost;
+    }
     default:
----------------
Ultimately, I think this could be replaced by a call to TargetTransformInfo::getAddSubCost, with BasicTargetTransformInfo overriding it with the implementation you have here.
Targets can then override the cost as they see fit.
 
For now, your code should be conservatively correct for all targets, assuming they return an accurate (conservative) cost for getShuffleCost(SK_Alternate). I don't see addsubs being generated yet anyway, so introducing the getAddSubCost abstraction now would be premature.

Can you take a look at the code we generate for arm, arm64 and x86 to make sure that a cost of one instruction is correct? It seems to me that we generate 2 instructions for <4 x float> on x86_64, arm, and arm64, which indicates that those targets should override getShuffleCost(SK_Alternate) and return a cost of 2 for getShuffleCost(SK_Alternate, <4 x float>).

  cat > testshufflevector.ll
  
  define void @test1(<4 x float> *%a, <4 x float> *%b, <4 x float> *%c) {
  entry:
    %in1 = load <4 x float>* %a
    %in2 = load <4 x float>* %b
    %add = fadd <4 x float> %in1, %in2
    %sub = fsub <4 x float> %in1, %in2
    %Shuff = shufflevector <4 x float> %add,
                           <4 x float> %sub,
                           <4 x i32> <i32 0, i32 5, i32 1, i32 6>
    store <4 x float> %Shuff, <4 x float>* %c
    ret void
  }
  
  define void @test2(<2 x double> *%a, <2 x double> *%b, <2 x double> *%c) {
  entry:
    %in1 = load <2 x double>* %a
    %in2 = load <2 x double>* %b
    %add = fadd <2 x double> %in1, %in2
    %sub = fsub <2 x double> %in1, %in2
    %Shuff = shufflevector <2 x double> %add,
                           <2 x double> %sub,
                           <2 x i32> <i32 0, i32 3>
    store <2 x double> %Shuff, <2 x double>* %c
    ret void
  }
  
  
  bin/llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone < testshufflevector.ll
  
          .section        __TEXT,__text,regular,pure_instructions
          .ios_version_min 7, 0
          .globl  _test1
          .align  2
  _test1:                                 ; @test1
          .cfi_startproc
  ; BB#0:                                 ; %entry
          ldr      q0, [x0]
          ldr      q1, [x1]
          fadd.4s v2, v0, v1
          fsub.4s v0, v0, v1
  // TWO INSTRUCTIONS
          ext.16b v0, v0, v2, #4
          zip1.4s v0, v2, v0
  ///
          str      q0, [x2]
          ret
          .cfi_endproc
  
  bin/llc -mtriple=armv7s-apple-ios7.0 < testshufflevector.ll
  bin/llc -mtriple=x86_64-apple-macos < testshufflevector.ll

With the adjustments to the cost model (X86TargetTransformInfo/ARMTargetTransformInfo/AArch64TargetTransformInfo::getShuffleCost) this LGTM.

Thanks

http://reviews.llvm.org/D4015