[PATCH] D13269: Improved X86-FMA3 mem-folding & coalescing
Vyacheslav Klochkov via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 29 13:50:57 PDT 2015
v_klochkov created this revision.
v_klochkov added a reviewer: qcolombet.
v_klochkov added a subscriber: llvm-commits.
This change-set was initially included in a bigger change-set, http://reviews.llvm.org/D11370,
but the X86-FMA3-specific changes were removed from D11370 to simplify that change-set.
The changes proposed here implement optimal form selection (132/213/231)
for X86 FMA3 instructions and improve the memory-operand folding and coalescing
optimizations performed for X86 FMA instructions.
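For reference, all three FMA3 forms compute a multiply-add of the same three values;
the trailing digits only say which operands feed the multiply and which one is added.
Operand 1 is always the destination, and only operand 3 can be a memory operand, so the
choice of form decides which value can be folded from memory and which register is
overwritten. A minimal sketch of the semantics in plain C++ (scalar doubles, illustration
only, not code from the patch):

  // The digits name the operands (1 = dst/src1, 2 = src2, 3 = src3, the only
  // memory-foldable one) used as multiplicand, multiplier and addend.
  static double fmadd132(double op1, double op2, double op3) {
    return op1 * op3 + op2;  // vfmadd132: dst = dst * src3 + src2
  }
  static double fmadd213(double op1, double op2, double op3) {
    return op2 * op1 + op3;  // vfmadd213: dst = src2 * dst + src3
  }
  static double fmadd231(double op1, double op2, double op3) {
    return op2 * op3 + op1;  // vfmadd231: dst = src2 * src3 + dst
  }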
Better memory folding and coalescing help to reduce register pressure.
The improvement from the changes can be illustrated with the following example:
for (int i = 0; i < N; i += 1) {
  val1 = _mm_and_pd(val1, val5);
  val2 = _mm_and_pd(val2, val6);
  val3 = _mm_and_pd(val3, val7);
  val4 = _mm_and_pd(val4, val8);
  val5 = _mm_xor_pd(val1, val5);
  val6 = _mm_xor_pd(val2, val6);
  val7 = _mm_xor_pd(val3, val7);
  val8 = _mm_xor_pd(val4, val8);
  v_accu1 = _mm_fmadd_pd(v_accu1, x1_arr[i], val1);
  v_accu2 = _mm_fmadd_pd(v_accu2, x2_arr[i], val2);
  v_accu3 = _mm_fmadd_pd(v_accu3, x3_arr[i], val3);
  v_accu4 = _mm_fmadd_pd(v_accu4, x4_arr[i], val4);
  v_accu5 = _mm_fmadd_pd(v_accu5, x5_arr[i], val5);
  v_accu6 = _mm_fmadd_pd(v_accu6, x6_arr[i], val6);
  v_accu7 = _mm_fmadd_pd(v_accu7, x7_arr[i], val7);
  v_accu8 = _mm_fmadd_pd(v_accu8, x8_arr[i], val8);
}
ASM code BEFORE the changes:
.LBB1_2: # %for.body.6
# Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
vmovapd %xmm0, -56(%rsp) # 16-byte Spill
vandpd %xmm7, %xmm3, %xmm7
vandpd %xmm5, %xmm12, %xmm5
vandpd %xmm6, %xmm9, %xmm6
vmovapd -40(%rsp), %xmm10 # 16-byte Reload
vandpd %xmm10, %xmm13, %xmm10
vmovapd %xmm10, -40(%rsp) # 16-byte Spill
vxorpd %xmm7, %xmm3, %xmm3
vxorpd %xmm5, %xmm12, %xmm12
vxorpd %xmm6, %xmm9, %xmm9
vxorpd %xmm10, %xmm13, %xmm13
vmovapd %xmm8, %xmm0
vmovapd x1_arr+8192(%rcx), %xmm8
vmovapd -24(%rsp), %xmm1 # 16-byte Reload
vfmadd213pd %xmm7, %xmm8, %xmm1
vmovapd %xmm1, -24(%rsp) # 16-byte Spill
vmovapd %xmm0, %xmm8
vmovapd x2_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm5, %xmm1, %xmm4
vmovapd x3_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm6, %xmm1, %xmm8
vmovapd x4_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm10, %xmm1, %xmm11
vmovapd -56(%rsp), %xmm0 # 16-byte Reload
vmovapd x5_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm3, %xmm1, %xmm15
vmovapd x6_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm12, %xmm1, %xmm0
vmovapd x7_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm9, %xmm1, %xmm2
vmovapd x8_arr+8192(%rcx), %xmm1
vfmadd213pd %xmm13, %xmm1, %xmm14
addq $16, %rcx
jne .LBB1_2
ASM code WITH the new changes (about 30% faster):
.LBB1_2: # %for.body.6
# Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
vandpd %xmm7, %xmm3, %xmm7
vandpd %xmm5, %xmm2, %xmm5
vandpd %xmm6, %xmm0, %xmm6
vandpd %xmm1, %xmm4, %xmm1
vxorpd %xmm7, %xmm3, %xmm3
vxorpd %xmm5, %xmm2, %xmm2
vxorpd %xmm6, %xmm0, %xmm0
vfmadd132pd x1_arr+8192(%rcx), %xmm7, %xmm15
vfmadd132pd x2_arr+8192(%rcx), %xmm5, %xmm8
vfmadd132pd x3_arr+8192(%rcx), %xmm6, %xmm9
vfmadd132pd x4_arr+8192(%rcx), %xmm1, %xmm10
vfmadd132pd x5_arr+8192(%rcx), %xmm3, %xmm14
vfmadd132pd x6_arr+8192(%rcx), %xmm2, %xmm11
vfmadd132pd x7_arr+8192(%rcx), %xmm0, %xmm12
vxorpd %xmm1, %xmm4, %xmm4
vfmadd132pd x8_arr+8192(%rcx), %xmm4, %xmm13
addq $16, %rcx
jne .LBB1_2
This change-set also fixes an existing correctness problem caused
by commuting the 1st and 2nd operands of scalar FMAs generated for intrinsics.
For the FMA intrinsic call:
__m128d foo(__m128d a, __m128d b, __m128d c) {
  // must return XMM0={b[127:64], a[63:0]*b[63:0]+c[63:0]},
  // but the currently returned value is XMM0={a[127:64], a[63:0]*b[63:0]+c[63:0]}
  return _mm_fmadd_sd(b, a, c);
}
The Coalescer/TwoAddressInstructionPass swapped the 1st and 2nd operands
of the SCALAR FMA and thus invalidated the upper bits of the result returned
from foo().
The change-set fixes that by prohibiting swaps of the 1st and 2nd operands
of scalar FMAs.
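A minimal standalone check of the intended semantics (a hypothetical test, not part of
the patch; build with FMA enabled, e.g. -mfma):

  #include <immintrin.h>
  #include <cstdio>

  // The upper half of _mm_fmadd_sd(b, a, c) must come from its 1st argument (b).
  int main() {
    __m128d a = _mm_set_pd(111.0, 2.0);  // a = {hi=111.0, lo=2.0}
    __m128d b = _mm_set_pd(222.0, 3.0);  // b = {hi=222.0, lo=3.0}
    __m128d c = _mm_set_pd(333.0, 4.0);  // c = {hi=333.0, lo=4.0}
    __m128d r = _mm_fmadd_sd(b, a, c);   // lo = 3.0*2.0 + 4.0 = 10.0, hi = 222.0
    double lo = _mm_cvtsd_f64(r);
    double hi = _mm_cvtsd_f64(_mm_unpackhi_pd(r, r));
    std::printf("lo = %f (expected 10.0), hi = %f (expected 222.0)\n", lo, hi);
    return !(lo == 10.0 && hi == 222.0);
  }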
Swapping the 1st and 2nd operands of scalar FMAs may be possible and legal,
but only after a special analysis of the FMA's users; such an optimization/analysis
can be implemented separately.
Another way is to separate the FMA opcodes generated for regular FP operations
from the FMA opcodes generated for FMA intrinsics, as is done today for ADD operations
(e.g. ADDSSrr vs. ADDSSrr_Int, where *_Int opcodes are handled more conservatively).
Being more conservative about commuting the 1st and 2nd operands of scalar FMAs
seems the better choice right now, as stability/correctness has higher priority.
With regard to performance, these changes are very good for vector/packed FMAs
(all source operands become commutable)
and neutral for scalar FMAs, for which they (as illustrated in the sketch below):
a) prohibit/disable commuting the 1st and 2nd operands,
b) enable commuting the 2nd and 3rd operands.
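A hedged C++ sketch of that rule, assuming 1-based source-operand numbering where
operand 1 is also the destination (illustration only, not the patch's actual code):

  // Which pairs of FMA source operands may be swapped (together with the
  // matching 132/213/231 opcode change) without altering the computed value.
  static bool isFMACommuteAllowed(bool IsScalar, unsigned OpIdx1, unsigned OpIdx2) {
    if (!IsScalar)
      return true;  // packed: a*b+c gives no operand a special role, any swap is fine
    // Scalar: operand 1 also carries the pass-through upper bits of the result,
    // so it must stay put; only operands 2 and 3 may be swapped.
    return (OpIdx1 == 2 && OpIdx2 == 3) || (OpIdx1 == 3 && OpIdx2 == 2);
  }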
http://reviews.llvm.org/D13269
Files:
llvm/lib/Target/X86/X86InstrFMA.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86InstrInfo.h
llvm/test/CodeGen/X86/fma-commute-x86.ll
llvm/test/CodeGen/X86/fma_patterns.ll
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D13269.36032.patch
Type: text/x-patch
Size: 46087 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150929/8ae1c682/attachment.bin>