[llvm] 1ebb31b - [x86] add tests for fmax/fmin experimental intrinsics with 'fast' FMF; NFC
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 10 05:55:51 PDT 2020
Author: Sanjay Patel
Date: 2020-09-10T08:55:40-04:00
New Revision: 1ebb31b14cd175b3f272e232958d342221eb875c
URL: https://github.com/llvm/llvm-project/commit/1ebb31b14cd175b3f272e232958d342221eb875c
DIFF: https://github.com/llvm/llvm-project/commit/1ebb31b14cd175b3f272e232958d342221eb875c.diff
LOG: [x86] add tests for fmax/fmin experimental intrinsics with 'fast' FMF; NFC
D87391 proposes to change the lowerings for 'nnan'-only FMF.
That's the minimal requirement to get good codegen for x86,
but we currently have bugs that block that output unless the
full 'fast' FMF is applied. These tests provide coverage for
the ideal lowerings.
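
For context, the reductions added here carry the full 'fast' flag on the
intrinsic call; the 'nnan'-only variant that D87391 is concerned with
differs only in the FMF keyword. A rough sketch (illustrative IR, not
taken from the patch):

    ; full 'fast' FMF, as used in the new test file:
    %r1 = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %v)

    ; 'nnan'-only FMF, the minimal form D87391 targets (sketch):
    %r2 = call nnan float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %v)

If the lowering works as intended, both forms should produce the same
min/max instruction sequences shown in the checks below.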
Added:
llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
new file mode 100644
index 000000000000..50b88c2c55f5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
+
+; These tests are identical to the corresponding tests in the 'nnan'
+; versions of the files except that they use 'fast' FMF. If things are
+; working as expected, the 'nnan' codegen should be the same as 'fast'.
+
+;
+; vXf32
+;
+
+define float @test_v2f32(<2 x float> %a0) {
+; SSE2-LABEL: test_v2f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE2-NEXT: minss %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: minss %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v2f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0)
+ ret float %1
+}
+
+define float @test_v4f32(<4 x float> %a0) {
+; SSE2-LABEL: test_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: maxps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE2-NEXT: maxss %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: maxps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: maxss %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0)
+ ret float %1
+}
+
+define float @test_v8f32(<8 x float> %a0) {
+; SSE2-LABEL: test_v8f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: minps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: minps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE2-NEXT: minss %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: minps %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: minps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: minss %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0)
+ ret float %1
+}
+
+define float @test_v16f32(<16 x float> %a0) {
+; SSE2-LABEL: test_v16f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: maxps %xmm3, %xmm1
+; SSE2-NEXT: maxps %xmm2, %xmm0
+; SSE2-NEXT: maxps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: maxps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE2-NEXT: maxss %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: maxps %xmm3, %xmm1
+; SSE41-NEXT: maxps %xmm2, %xmm0
+; SSE41-NEXT: maxps %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: maxps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: maxss %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v16f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = call fast float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0)
+ ret float %1
+}
+
+;
+; vXf64
+;
+
+define double @test_v2f64(<2 x double> %a0) {
+; SSE-LABEL: test_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: minsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0)
+ ret double %1
+}
+
+define double @test_v4f64(<4 x double> %a0) {
+; SSE-LABEL: test_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: maxpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: maxsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_v4f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0)
+ ret double %1
+}
+
+define double @test_v8f64(<8 x double> %a0) {
+; SSE-LABEL: test_v8f64:
+; SSE: # %bb.0:
+; SSE-NEXT: minpd %xmm3, %xmm1
+; SSE-NEXT: minpd %xmm2, %xmm0
+; SSE-NEXT: minpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: minsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_v8f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v8f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = call fast double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0)
+ ret double %1
+}
+
+define double @test_v16f64(<16 x double> %a0) {
+; SSE-LABEL: test_v16f64:
+; SSE: # %bb.0:
+; SSE-NEXT: maxpd %xmm6, %xmm2
+; SSE-NEXT: maxpd %xmm4, %xmm0
+; SSE-NEXT: maxpd %xmm2, %xmm0
+; SSE-NEXT: maxpd %xmm7, %xmm3
+; SSE-NEXT: maxpd %xmm5, %xmm1
+; SSE-NEXT: maxpd %xmm3, %xmm1
+; SSE-NEXT: maxpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: maxsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_v16f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1
+; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v16f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = call fast double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0)
+ ret double %1
+}
+
+declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
+declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
+declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>)
+
+declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
+declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>)
+declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>)