[PATCH] use conversion of builtin vector types to enable constant propagation
Matthias Kretz
kretz at kde.org
Tue Oct 1 00:55:34 PDT 2013
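Testcase ("good": the call is folded to a constant; "bad": the conversion is emitted element by element as extract/convert/insert, see the IR and x86 below):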
__m128d cvt1() { return _mm_cvtepi32_pd(_mm_set1_epi32(2)); } // good
__m128d cvt1(__m128i x) { return _mm_cvtepi32_pd(x); } // bad
__m128d cvt2() { return _mm_cvtps_pd(_mm_set1_ps(2.f)); } // good
__m128d cvt2(__m128 x) { return _mm_cvtps_pd(x); } // bad
__m128 cvt3() { return _mm_cvtpd_ps(_mm_set1_pd(2.)); } // good
__m128 cvt3(__m128d x) { return _mm_cvtpd_ps(x); } // bad
__m128i cvt4() { return _mm_cvttpd_epi32(_mm_set1_pd(2.)); } // good
__m128i cvt4(__m128d x) { return _mm_cvttpd_epi32(x); } // bad
__m128i cvt5() { return _mm_cvttps_epi32(_mm_set1_ps(2.f)); } // good
__m128i cvt5(__m128 x) { return _mm_cvttps_epi32(x); } // bad
resulting IR:
define <2 x double> @_Z4cvt1v() #0 {
entry:
ret <2 x double> <double 2.000000e+00, double 2.000000e+00>
}
define <2 x double> @_Z4cvt1Dv2_x(<2 x i64> %x) #0 {
entry:
%0 = bitcast <2 x i64> %x to <4 x i32>
%vecext.i = extractelement <4 x i32> %0, i32 0
%conv.i = sitofp i32 %vecext.i to double
%vecinit.i = insertelement <2 x double> undef, double %conv.i, i32 0
%vecext1.i = extractelement <4 x i32> %0, i32 1
%conv2.i = sitofp i32 %vecext1.i to double
%vecinit3.i = insertelement <2 x double> %vecinit.i, double %conv2.i, i32 1
ret <2 x double> %vecinit3.i
}
define <2 x double> @_Z4cvt2v() #0 {
entry:
ret <2 x double> <double 2.000000e+00, double 2.000000e+00>
}
define <2 x double> @_Z4cvt2Dv4_f(<4 x float> %x) #0 {
entry:
%vecext.i = extractelement <4 x float> %x, i32 0
%conv.i = fpext float %vecext.i to double
%vecinit.i = insertelement <2 x double> undef, double %conv.i, i32 0
%vecext1.i = extractelement <4 x float> %x, i32 1
%conv2.i = fpext float %vecext1.i to double
%vecinit3.i = insertelement <2 x double> %vecinit.i, double %conv2.i, i32 1
ret <2 x double> %vecinit3.i
}
define <4 x float> @_Z4cvt3v() #0 {
entry:
ret <4 x float> <float 2.000000e+00, float 2.000000e+00, float 0.000000e+00, float 0.000000e+00>
}
define <4 x float> @_Z4cvt3Dv2_d(<2 x double> %x) #0 {
entry:
%vecext.i = extractelement <2 x double> %x, i32 0
%conv.i = fptrunc double %vecext.i to float
%vecinit.i = insertelement <4 x float> undef, float %conv.i, i32 0
%vecext1.i = extractelement <2 x double> %x, i32 1
%conv2.i = fptrunc double %vecext1.i to float
%vecinit3.i = insertelement <4 x float> %vecinit.i, float %conv2.i, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 2
%vecinit5.i = insertelement <4 x float> %vecinit4.i, float 0.000000e+00, i32 3
ret <4 x float> %vecinit5.i
}
define <2 x i64> @_Z4cvt4v() #0 {
entry:
ret <2 x i64> <i64 8589934594, i64 0>
}
define <2 x i64> @_Z4cvt4Dv2_d(<2 x double> %x) #0 {
entry:
%vecext.i = extractelement <2 x double> %x, i32 0
%conv.i = fptosi double %vecext.i to i32
%vecinit.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
%vecext1.i = extractelement <2 x double> %x, i32 1
%conv2.i = fptosi double %vecext1.i to i32
%vecinit3.i = insertelement <4 x i32> %vecinit.i, i32 %conv2.i, i32 1
%vecinit4.i = insertelement <4 x i32> %vecinit3.i, i32 0, i32 2
%vecinit5.i = insertelement <4 x i32> %vecinit4.i, i32 0, i32 3
%0 = bitcast <4 x i32> %vecinit5.i to <2 x i64>
ret <2 x i64> %0
}
define <2 x i64> @_Z4cvt5v() #0 {
entry:
ret <2 x i64> <i64 8589934594, i64 8589934594>
}
define <2 x i64> @_Z4cvt5Dv4_f(<4 x float> %x) #0 {
entry:
%vecext.i = extractelement <4 x float> %x, i32 0
%conv.i = fptosi float %vecext.i to i32
%vecinit.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
%vecext1.i = extractelement <4 x float> %x, i32 1
%conv2.i = fptosi float %vecext1.i to i32
%vecinit3.i = insertelement <4 x i32> %vecinit.i, i32 %conv2.i, i32 1
%vecext4.i = extractelement <4 x float> %x, i32 2
%conv5.i = fptosi float %vecext4.i to i32
%vecinit6.i = insertelement <4 x i32> %vecinit3.i, i32 %conv5.i, i32 2
%vecext7.i = extractelement <4 x float> %x, i32 3
%conv8.i = fptosi float %vecext7.i to i32
%vecinit9.i = insertelement <4 x i32> %vecinit6.i, i32 %conv8.i, i32 3
%0 = bitcast <4 x i32> %vecinit9.i to <2 x i64>
ret <2 x i64> %0
}
resulting x86:
0000000000000020 <cvt1()>:
20:· c5 f8 28 05 00 00 00 00 · vmovaps 0x0(%rip),%xmm0 # 28 <cvt1()+0x8>
· 24: R_X86_64_PC32· .LCPI2_0-0x4
28:· c3 · retq
0000000000000030 <cvt1(long long __vector(2))>:
30:· c4 e3 79 16 c0 01 · vpextrd $0x1,%xmm0,%eax
36:· c5 fb 2a c8 · vcvtsi2sd %eax,%xmm0,%xmm1
3a:· c5 f9 7e c0 · vmovd %xmm0,%eax
3e:· c5 fb 2a c0 · vcvtsi2sd %eax,%xmm0,%xmm0
42:· c5 f9 14 c1 · vunpcklpd %xmm1,%xmm0,%xmm0
46:· c3 · retq
0000000000000050 <cvt2()>:
50:· c5 f8 28 05 00 00 00 00 · vmovaps 0x0(%rip),%xmm0 # 58 <cvt2()+0x8>
· 54: R_X86_64_PC32· .LCPI4_0-0x4
58:· c3 · retq
0000000000000060 <cvt2(float __vector(4))>:
60:· c5 fa 5a c8 · vcvtss2sd %xmm0,%xmm0,%xmm1
64:· c5 f9 70 c0 01 · vpshufd $0x1,%xmm0,%xmm0
69:· c5 fa 5a c0 · vcvtss2sd %xmm0,%xmm0,%xmm0
6d:· c5 f1 14 c0 · vunpcklpd %xmm0,%xmm1,%xmm0
71:· c3 · retq
0000000000000080 <cvt3()>:
80:· c5 f8 28 05 00 00 00 00 · vmovaps 0x0(%rip),%xmm0 # 88 <cvt3()+0x8>
· 84: R_X86_64_PC32· .LCPI6_0-0x4
88:· c3 · retq
0000000000000090 <cvt3(double __vector(2))>:
90:· c5 fb 5a c8 · vcvtsd2ss %xmm0,%xmm0,%xmm1
94:· c5 f9 15 c0 · vunpckhpd %xmm0,%xmm0,%xmm0
98:· c5 fb 5a c0 · vcvtsd2ss %xmm0,%xmm0,%xmm0
9c:· c5 f0 14 c0 · vunpcklps %xmm0,%xmm1,%xmm0
a0:· c5 fa 7e c0 · vmovq %xmm0,%xmm0
a4:· c3 · retq
00000000000000b0 <cvt4()>:
b0:· 48 b8 02 00 00 00 02 00 00 00 · movabs $0x200000002,%rax
ba:· c4 e1 f9 6e c0 · vmovq %rax,%xmm0
bf:· c3 · retq
00000000000000c0 <cvt4(double __vector(2))>:
c0:· c5 fb 2c c0 · vcvttsd2si %xmm0,%eax
c4:· c5 f9 6e c8 · vmovd %eax,%xmm1
c8:· c5 f9 15 c0 · vunpckhpd %xmm0,%xmm0,%xmm0
cc:· c5 fb 2c c0 · vcvttsd2si %xmm0,%eax
d0:· c5 f9 6e c0 · vmovd %eax,%xmm0
d4:· c5 f1 62 c0 · vpunpckldq %xmm0,%xmm1,%xmm0
d8:· c5 fa 7e c0 · vmovq %xmm0,%xmm0
dc:· c3 · retq
00000000000000e0 <cvt5()>:
e0:· c5 f8 28 05 00 00 00 00 · vmovaps 0x0(%rip),%xmm0 # e8 <cvt5()+0x8>
· e4: R_X86_64_PC32· .LCPI10_0-0x4
e8:· c3 · retq
00000000000000f0 <cvt5(float __vector(4))>:
f0:· c5 f9 70 c8 01 · vpshufd $0x1,%xmm0,%xmm1
f5:· c5 fa 2c c1 · vcvttss2si %xmm1,%eax
f9:· c5 fa 2c c8 · vcvttss2si %xmm0,%ecx
fd:· c5 f9 6e c9 · vmovd %ecx,%xmm1
101:· c4 e3 71 22 c8 01 · vpinsrd $0x1,%eax,%xmm1,%xmm1
107:· c5 f8 12 d0 · vmovhlps %xmm0,%xmm0,%xmm2
10b:· c5 fa 2c c2 · vcvttss2si %xmm2,%eax
10f:· c4 e3 71 22 c8 02 · vpinsrd $0x2,%eax,%xmm1,%xmm1
115:· c5 f9 70 c0 03 · vpshufd $0x3,%xmm0,%xmm0
11a:· c5 fa 2c c0 · vcvttss2si %xmm0,%eax
11e:· c4 e3 71 22 c0 03 · vpinsrd $0x3,%eax,%xmm1,%xmm0
124:· c3 · retq
http://llvm-reviews.chandlerc.com/D1793
More information about the cfe-commits mailing list