[PATCH] Add support to recognize non SIMD kind of parallelism in SLPVectorizer
Arnold Schwaighofer
aschwaighofer at apple.com
Tue Jun 17 08:59:16 PDT 2014
================
Comment at: lib/Transforms/Vectorize/SLPVectorizer.cpp:1259
@@ -1186,1 +1258,3 @@
+ return VecCost - ScalarCost;
+ }
default:
----------------
Ultimately, I think this could be replaced by a call to TargetTransformInfo::getAddSubCost, and BasicTargetTransformInfo should override this with the implementation you have here.
Targets can then override the cost as they see fit.
For now, your code should be conservatively correct for all targets, assuming they return an accurate (conservative) cost for getShuffleCost(SK_Alternate). I don't see addsubs generated yet anyway, so introducing the getAddSubCost abstraction now would be premature.
Can you take a look at the code we generate for arm, arm64 and x86 to make sure that a cost of one instruction is correct? It seems to me that we generate 2 instructions for <4 x float> on x86_64, arm, and arm64, indicating that those targets should override getShuffleCost(SK_Alternate) and return a cost of 2 for getShuffleCost(SK_Alternate, <4 x float>).
cat > testshufflevector.ll
define void @test1(<4 x float> *%a, <4 x float> *%b, <4 x float> *%c) {
entry:
%in1 = load <4 x float>* %a
%in2 = load <4 x float>* %b
%add = fadd <4 x float> %in1, %in2
%sub = fsub <4 x float> %in1, %in2
%Shuff = shufflevector <4 x float> %add,
<4 x float> %sub,
<4 x i32> <i32 0, i32 5, i32 1, i32 6>
store <4 x float> %Shuff, <4 x float>* %c
ret void
}
define void @test2(<2 x double> *%a, <2 x double> *%b, <2 x double> *%c) {
entry:
%in1 = load <2 x double>* %a
%in2 = load <2 x double>* %b
%add = fadd <2 x double> %in1, %in2
%sub = fsub <2 x double> %in1, %in2
%Shuff = shufflevector <2 x double> %add,
<2 x double> %sub,
<2 x i32> <i32 0, i32 3>
store <2 x double> %Shuff, <2 x double>* %c
ret void
}
bin/llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone < testshufflevector.ll
.section __TEXT,__text,regular,pure_instructions
.ios_version_min 7, 0
.globl _test1
.align 2
_test1: ; @test1
.cfi_startproc
; BB#0: ; %entry
ldr q0, [x0]
ldr q1, [x1]
fadd.4s v2, v0, v1
fsub.4s v0, v0, v1
// TWO INSTRUCTIONS
ext.16b v0, v0, v2, #4
zip1.4s v0, v2, v0
///
str q0, [x2]
ret
.cfi_endproc
bin/llc -mtriple=armv7s-apple-ios7.0 < testshufflevector.ll
bin/llc -mtriple=x86_64-apple-macos < testshufflevector.ll
With the adjustments to the cost model (X86TargetTransformInfo/ARMTargetTransformInfo/AArch64TargetTransformInfo::getShuffleCost) this LGTM.
Thanks
http://reviews.llvm.org/D4015
More information about the llvm-commits
mailing list