[llvm] r287760 - [CostModel][X86] Add missing AVX512DQ v8i64 fptosi/sitofp costs

Wed Nov 23 05:42:10 PST 2016

Author: rksimon
Date: Wed Nov 23 07:42:09 2016
New Revision: 287760

URL: http://llvm.org/viewvc/llvm-project?rev=287760&view=rev
Log:
[CostModel][X86] Add missing AVX512DQ v8i64 fptosi/sitofp costs

Modified:
    llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/trunk/test/Analysis/CostModel/X86/sitofp.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll

Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=287760&r1=287759&r2=287760&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Wed Nov 23 07:42:09 2016
@@ -716,6 +716,9 @@ int X86TTIImpl::getCastInstrCost(unsigne
   // potential massive combinations (elem_num x src_type x dst_type).
 
   static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
+
     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
@@ -723,12 +726,15 @@ int X86TTIImpl::getCastInstrCost(unsigne
     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
 
-    { ISD::FP_TO_UINT,  MVT::v2i64, MVT::v2f32, 1 },
-    { ISD::FP_TO_UINT,  MVT::v4i64, MVT::v4f32, 1 },
-    { ISD::FP_TO_UINT,  MVT::v8i64, MVT::v8f32, 1 },
-    { ISD::FP_TO_UINT,  MVT::v2i64, MVT::v2f64, 1 },
-    { ISD::FP_TO_UINT,  MVT::v4i64, MVT::v4f64, 1 },
-    { ISD::FP_TO_UINT,  MVT::v8i64, MVT::v8f64, 1 },
+    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
+    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
+
+    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
+    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
+    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
   };
 
   // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and

Modified: llvm/trunk/test/Analysis/CostModel/X86/sitofp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/sitofp.ll?rev=287760&r1=287759&r2=287760&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/sitofp.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/sitofp.ll Wed Nov 23 07:42:09 2016
@@ -117,7 +117,7 @@ define i32 @sitofp_i64_double() {
   ; AVX1: cost of 27 {{.*}} sitofp <8 x i64>
   ; AVX2: cost of 27 {{.*}} sitofp <8 x i64>
   ; AVX512F: cost of 22 {{.*}} sitofp <8 x i64>
-  ; AVX512DQ: cost of 22 {{.*}} sitofp <8 x i64>
+  ; AVX512DQ: cost of 1 {{.*}} sitofp <8 x i64>
   %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
 
   ret i32 undef
@@ -236,14 +236,14 @@ define i32 @sitofp_i64_float() {
   ; AVX1: cost of 21 {{.*}} sitofp <8 x i64>
   ; AVX2: cost of 21 {{.*}} sitofp <8 x i64>
   ; AVX512F: cost of 22 {{.*}} sitofp <8 x i64>
-  ; AVX512DQ: cost of 22 {{.*}} sitofp <8 x i64>
+  ; AVX512DQ: cost of 1 {{.*}} sitofp <8 x i64>
   %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
 
   ; SSE2: cost of 120 {{.*}} sitofp <16 x i64>
   ; AVX1: cost of 43 {{.*}} sitofp <16 x i64>
   ; AVX2: cost of 43 {{.*}} sitofp <16 x i64>
   ; AVX512F: cost of 45 {{.*}} sitofp <16 x i64>
-  ; AVX512DQ: cost of 45 {{.*}} sitofp <16 x i64>
+  ; AVX512DQ: cost of 3 {{.*}} sitofp <16 x i64>
   %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
 
   ret i32 undef

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll?rev=287760&r1=287759&r2=287760&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/sitofp.ll Wed Nov 23 07:42:09 2016
@@ -70,32 +70,65 @@ define void @sitofp_4i64_4f64() #0 {
 }
 
 define void @sitofp_8i64_8f64() #0 {
-; CHECK-LABEL: @sitofp_8i64_8f64(
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; CHECK-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; CHECK-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; CHECK-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
-; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
-; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
-; CHECK-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
-; CHECK-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
-; CHECK-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
-; CHECK-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
-; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; CHECK-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; CHECK-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; CHECK-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; CHECK-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; CHECK-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; CHECK-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @sitofp_8i64_8f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; SSE-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; SSE-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sitofp_8i64_8f64(
+; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; AVX256-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; AVX256-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; AVX256-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; AVX256-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; AVX256-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; AVX256-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_8i64_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double>
+; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -557,32 +590,65 @@ define void @sitofp_4i64_4f32() #0 {
 }
 
 define void @sitofp_8i64_8f32() #0 {
-; CHECK-LABEL: @sitofp_8i64_8f32(
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; CHECK-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; CHECK-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; CHECK-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
-; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
-; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
-; CHECK-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to float
-; CHECK-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to float
-; CHECK-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to float
-; CHECK-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to float
-; CHECK-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; CHECK-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; CHECK-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; CHECK-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; CHECK-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; CHECK-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; CHECK-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @sitofp_8i64_8f32(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; SSE-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; SSE-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; SSE-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to float
+; SSE-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to float
+; SSE-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to float
+; SSE-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to float
+; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    ret void
+;
+; AVX256-LABEL: @sitofp_8i64_8f32(
+; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256-NEXT:    [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256-NEXT:    [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256-NEXT:    [[CVT4:%.*]] = sitofp i64 [[LD4]] to float
+; AVX256-NEXT:    [[CVT5:%.*]] = sitofp i64 [[LD5]] to float
+; AVX256-NEXT:    [[CVT6:%.*]] = sitofp i64 [[LD6]] to float
+; AVX256-NEXT:    [[CVT7:%.*]] = sitofp i64 [[LD7]] to float
+; AVX256-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; AVX256-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; AVX256-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; AVX256-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; AVX256-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_8i64_8f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX512-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX512-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8