[PATCH] D13269: Improved X86-FMA3 mem-folding & coalescing
Vyacheslav Klochkov via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 29 13:50:57 PDT 2015
v_klochkov created this revision.
v_klochkov added a reviewer: qcolombet.
v_klochkov added a subscriber: llvm-commits.
This change-set was initially included in a bigger change-set, http://reviews.llvm.org/D11370,
but the X86-FMA3-specific changes were removed from D11370 to simplify that change-set.
The changes proposed here implement optimal form selection (132/213/231)
for X86 FMA3 instructions and improve the memory-operand folding and coalescing
optimizations performed for X86 FMA instructions.
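For reference, all three FMA3 forms compute a multiply-add of the same three values;
the trailing digits only say which operands feed the multiply and which one is added.
Operand 1 is always the destination, and only operand 3 can be a memory operand, so the
choice of form decides which value can be folded from memory and which register is
overwritten. A minimal sketch of the semantics in plain C++ (scalar doubles, illustration
only, not code from the patch):

  // The digits name the operands (1 = dst/src1, 2 = src2, 3 = src3, the only
  // memory-foldable one) used as multiplicand, multiplier and addend.
  static double fmadd132(double op1, double op2, double op3) {
    return op1 * op3 + op2;  // vfmadd132: dst = dst * src3 + src2
  }
  static double fmadd213(double op1, double op2, double op3) {
    return op2 * op1 + op3;  // vfmadd213: dst = src2 * dst + src3
  }
  static double fmadd231(double op1, double op2, double op3) {
    return op2 * op3 + op1;  // vfmadd231: dst = src2 * src3 + dst
  }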
Better memory folding and coalescing help to reduce register pressure.
The improvement from the changes can be illustrated with the following example:
for (int i = 0; i < N; i += 1) {
  val1 = _mm_and_pd(val1, val5);
  val2 = _mm_and_pd(val2, val6);
  val3 = _mm_and_pd(val3, val7);
  val4 = _mm_and_pd(val4, val8);
  val5 = _mm_xor_pd(val1, val5);
  val6 = _mm_xor_pd(val2, val6);
  val7 = _mm_xor_pd(val3, val7);
  val8 = _mm_xor_pd(val4, val8);
  v_accu1 = _mm_fmadd_pd(v_accu1, x1_arr[i], val1);
  v_accu2 = _mm_fmadd_pd(v_accu2, x2_arr[i], val2);
  v_accu3 = _mm_fmadd_pd(v_accu3, x3_arr[i], val3);
  v_accu4 = _mm_fmadd_pd(v_accu4, x4_arr[i], val4);
  v_accu5 = _mm_fmadd_pd(v_accu5, x5_arr[i], val5);
  v_accu6 = _mm_fmadd_pd(v_accu6, x6_arr[i], val6);
  v_accu7 = _mm_fmadd_pd(v_accu7, x7_arr[i], val7);
  v_accu8 = _mm_fmadd_pd(v_accu8, x8_arr[i], val8);
}
ASM code BEFORE the changes:
.LBB1_2: # %for.body.6
# Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
vmovapd %xmm0, -56(%rsp) # 16-byte Spill
vandpd %xmm7, %xmm3, %xmm7
vandpd %xmm5, %xmm12, %xmm5
vandpd %xmm6, %xmm9, %xmm6
vmovapd -40(%rsp), %xmm10 # 16-byte Reload
vandpd %xmm10, %xmm13, %xmm10
vmovapd %xmm10, -40(%rsp) # 16-byte Spill
vxorpd %xmm7, %xmm3, %xmm3
vxorpd %xmm5, %xmm12, %xmm12
vxorpd %xmm6, %xmm9, %xmm9
vxorpd %xmm10, %xmm13, %xmm13
vmovapd %xmm8, %xmm0
vmovapd x1_arr+8192(%rcx), %xmm8
vmovapd -24(%rsp), %xmm1 # 16-byte Reload
vfmadd213pd %xmm7, %xmm8, %xmm1
vmovapd %xmm1, -24(%rsp) # 16-byte Spill
vmovapd %xmm0, %xmm8
vmovapd x2_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm5, %xmm1, %xmm4
vmovapd x3_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm6, %xmm1, %xmm8
vmovapd x4_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm10, %xmm1, %xmm11
vmovapd -56(%rsp), %xmm0 # 16-byte Reload
vmovapd x5_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm3, %xmm1, %xmm15
vmovapd x6_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm12, %xmm1, %xmm0
vmovapd x7_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm9, %xmm1, %xmm2
vmovapd x8_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm13, %xmm1, %xmm14
addq $16, %rcx
jne .LBB1_2
ASM code WITH the new changes (about 30% faster):
.LBB1_2: # %for.body.6
# Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
vandpd %xmm7, %xmm3, %xmm7
vandpd %xmm5, %xmm2, %xmm5
vandpd %xmm6, %xmm0, %xmm6
vandpd %xmm1, %xmm4, %xmm1
vxorpd %xmm7, %xmm3, %xmm3
vxorpd %xmm5, %xmm2, %xmm2
vxorpd %xmm6, %xmm0, %xmm0
vfmadd132pd x1_arr+8192(%rcx), %xmm7, %xmm15
vfmadd132pd x2_arr+8192(%rcx), %xmm5, %xmm8
vfmadd132pd x3_arr+8192(%rcx), %xmm6, %xmm9
vfmadd132pd x4_arr+8192(%rcx), %xmm1, %xmm10
vfmadd132pd x5_arr+8192(%rcx), %xmm3, %xmm14
vfmadd132pd x6_arr+8192(%rcx), %xmm2, %xmm11
vfmadd132pd x7_arr+8192(%rcx), %xmm0, %xmm12
vxorpd %xmm1, %xmm4, %xmm4
vfmadd132pd x8_arr+8192(%rcx), %xmm4, %xmm13
addq $16, %rcx
jne .LBB1_2
This change-set also fixes an existing correctness problem caused
by commuting the 1st and 2nd operands of scalar FMAs generated for intrinsics.
For the FMA intrinsic call:
__m128d foo(__m128d a, __m128d b, __m128d c) {
  // must return XMM0={b[127:64], a[63:0]*b[63:0]+c[63:0]},
  // but the currently returned value is XMM0={a[127:64], a[63:0]*b[63:0]+c[63:0]}
  return _mm_fmadd_sd(b, a, c);
}
The Coalescer/TwoAddressInstructionPass swapped the 1st and 2nd operands
of the SCALAR FMA and thus invalidated the upper bits of the result returned
from foo().
The change-set fixes that by prohibiting swaps of the 1st and 2nd operands
of scalar FMAs.
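A minimal standalone check of the intended semantics (a hypothetical test, not part of
the patch; build with FMA enabled, e.g. -mfma):

  #include <immintrin.h>
  #include <cstdio>

  // The upper half of _mm_fmadd_sd(b, a, c) must come from its 1st argument (b).
  int main() {
    __m128d a = _mm_set_pd(111.0, 2.0);  // a = {hi=111.0, lo=2.0}
    __m128d b = _mm_set_pd(222.0, 3.0);  // b = {hi=222.0, lo=3.0}
    __m128d c = _mm_set_pd(333.0, 4.0);  // c = {hi=333.0, lo=4.0}
    __m128d r = _mm_fmadd_sd(b, a, c);   // lo = 3.0*2.0 + 4.0 = 10.0, hi = 222.0
    double lo = _mm_cvtsd_f64(r);
    double hi = _mm_cvtsd_f64(_mm_unpackhi_pd(r, r));
    std::printf("lo = %f (expected 10.0), hi = %f (expected 222.0)\n", lo, hi);
    return !(lo == 10.0 && hi == 222.0);
  }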
Swapping the 1st and 2nd operands of scalar FMAs may be possible and legal,
but only after a special analysis of the FMA's users; such an optimization/analysis
can be implemented separately.
Another way is to separate the FMA opcodes generated for regular FP operations
from the FMA opcodes generated for FMA intrinsics, as is done today for ADD operations
(e.g. ADDSSrr vs. ADDSSrr_Int, where *_Int opcodes are handled more conservatively).
Being more conservative about commuting the 1st and 2nd operands of scalar FMAs
seems the better choice right now, as stability/correctness has higher priority.
With regard to performance, these changes are very good for vector/packed FMAs
(all source operands become commutable)
and neutral for scalar FMAs, for which they (as illustrated in the sketch below):
a) prohibit/disable commuting the 1st and 2nd operands,
b) enable commuting the 2nd and 3rd operands.
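A hedged C++ sketch of that rule, assuming 1-based source-operand numbering where
operand 1 is also the destination (illustration only, not the patch's actual code):

  // Which pairs of FMA source operands may be swapped (together with the
  // matching 132/213/231 opcode change) without altering the computed value.
  static bool isFMACommuteAllowed(bool IsScalar, unsigned OpIdx1, unsigned OpIdx2) {
    if (!IsScalar)
      return true;  // packed: a*b+c gives no operand a special role, any swap is fine
    // Scalar: operand 1 also carries the pass-through upper bits of the result,
    // so it must stay put; only operands 2 and 3 may be swapped.
    return (OpIdx1 == 2 && OpIdx2 == 3) || (OpIdx1 == 3 && OpIdx2 == 2);
  }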
http://reviews.llvm.org/D13269
Files:
llvm/lib/Target/X86/X86InstrFMA.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86InstrInfo.h
llvm/test/CodeGen/X86/fma-commute-x86.ll
llvm/test/CodeGen/X86/fma_patterns.ll
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D13269.36032.patch
Type: text/x-patch
Size: 46087 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150929/8ae1c682/attachment.bin>