[llvm] eb6429d - [CostModel][X86] Add uitpfp v4f32->v4i32 + v8f32->v8i32 SSE/AVX costs
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri May 21 03:31:30 PDT 2021
Author: Simon Pilgrim
Date: 2021-05-21T11:30:15+01:00
New Revision: eb6429d0fb94fd467e03d229177ae6ff3a44e3cc
URL: https://github.com/llvm/llvm-project/commit/eb6429d0fb94fd467e03d229177ae6ff3a44e3cc
DIFF: https://github.com/llvm/llvm-project/commit/eb6429d0fb94fd467e03d229177ae6ff3a44e3cc.diff
LOG: [CostModel][X86] Add uitpfp v4f32->v4i32 + v8f32->v8i32 SSE/AVX costs
These were using (default) scalarized values.
Added:
Modified:
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/test/Analysis/CostModel/X86/fptoui.ll
llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 4b8d245b3756c..af7b4202f5a2f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1877,12 +1877,12 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
{ ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 9 },
// This node is expanded into scalarized operations but BasicTTI is overly
// optimistic estimating its cost. It computes 3 per element (one
// vector-extract, one scalar conversion and one vector-insert). The
// problem is that the inserts form a read-modify-write chain so latency
// should be factored in too. Inflating the cost per element by 1.
- { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
{ ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
@@ -1985,6 +1985,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
diff --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll
index a3ec315c4eb46..85de13eda3648 100644
--- a/llvm/test/Analysis/CostModel/X86/fptoui.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll
@@ -233,28 +233,20 @@ define i32 @fptoui_float_i64(i32 %arg) {
}
define i32 @fptoui_float_i32(i32 %arg) {
-; SSE2-LABEL: 'fptoui_float_i32'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'fptoui_float_i32'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-LABEL: 'fptoui_float_i32'
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX-LABEL: 'fptoui_float_i32'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'fptoui_float_i32'
@@ -264,14 +256,6 @@ define i32 @fptoui_float_i32(i32 %arg) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SLM-LABEL: 'fptoui_float_i32'
-; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; SLM-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SLM-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SLM-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
-; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I32 = fptoui float undef to i32
%V2I32 = fptoui <2 x float> undef to <2 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
index 6233845e2a6ec..99f16a07e5626 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -61,55 +61,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
}
define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
-; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
-; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
-; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT: [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SSE-NEXT: ret <8 x i32> [[R7]]
-;
-; SLM-LABEL: @fptosi_fptoui(
-; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SLM-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SLM-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
-; SLM-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
-; SLM-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SLM-NEXT: [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SLM-NEXT: ret <8 x i32> [[R7]]
-;
-; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; AVX-NEXT: [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: ret <8 x i32> [[R72]]
-;
-; AVX512-LABEL: @fptosi_fptoui(
-; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @fptosi_fptoui(
+; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index 1c19a9a89467a..759744055bca9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -61,55 +61,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
}
define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
-; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
-; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
-; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT: [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SSE-NEXT: ret <8 x i32> [[R7]]
-;
-; SLM-LABEL: @fptosi_fptoui(
-; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SLM-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SLM-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
-; SLM-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
-; SLM-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SLM-NEXT: [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SLM-NEXT: ret <8 x i32> [[R7]]
-;
-; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; AVX-NEXT: [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT: ret <8 x i32> [[R72]]
-;
-; AVX512-LABEL: @fptosi_fptoui(
-; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @fptosi_fptoui(
+; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
index c8cb290382e33..9544a4a820c5b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256DQ
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX256DQ
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -74,11 +74,11 @@ define void @fptoui_8f64_8i64() #0 {
; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
; AVX256NODQ-NEXT: ret void
;
-; AVX512-LABEL: @fptoui_8f64_8i64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>
-; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
-; AVX512-NEXT: ret void
+; AVX512F-LABEL: @fptoui_8f64_8i64(
+; AVX512F-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512F-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>
+; AVX512F-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512F-NEXT: ret void
;
; AVX256DQ-LABEL: @fptoui_8f64_8i64(
; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
@@ -171,11 +171,11 @@ define void @fptoui_8f64_8i32() #0 {
; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
; AVX256NODQ-NEXT: ret void
;
-; AVX-LABEL: @fptoui_8f64_8i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
-; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
-; AVX-NEXT: ret void
+; AVX512-LABEL: @fptoui_8f64_8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
+; AVX512-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX512-NEXT: ret void
;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -352,11 +352,11 @@ define void @fptoui_8f32_8i64() #0 {
; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
; AVX256NODQ-NEXT: ret void
;
-; AVX512-LABEL: @fptoui_8f32_8i64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>
-; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
-; AVX512-NEXT: ret void
+; AVX512F-LABEL: @fptoui_8f32_8i64(
+; AVX512F-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX512F-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>
+; AVX512F-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512F-NEXT: ret void
;
; AVX256DQ-LABEL: @fptoui_8f32_8i64(
; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
@@ -396,59 +396,14 @@ define void @fptoui_8f32_8i64() #0 {
define void @fptoui_8f32_8i32() #0 {
; SSE-LABEL: @fptoui_8f32_8i32(
-; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32
-; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32
-; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32
-; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32
-; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32
-; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32
-; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32
-; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32>
+; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
-; AVX256NODQ-LABEL: @fptoui_8f32_8i32(
-; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32
-; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32
-; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32
-; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32
-; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32
-; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32
-; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32
-; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32
-; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT: ret void
-;
; AVX-LABEL: @fptoui_8f32_8i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32>
More information about the llvm-commits
mailing list