[llvm] X86: Improve cost model of fp16 conversion (PR #113195)
Matthias Braun via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 25 09:35:27 PDT 2024
https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/113195
>From 4f2a5293ea966245788cc0bc2b430510f2edca13 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 18 Oct 2024 16:24:57 -0700
Subject: [PATCH 1/2] X86: Improve cost model of fp16 conversion
Improve cost-modeling for x86 __fp16 conversions so that the SLPVectorizer
vectorizes fp16 conversion patterns. Changes:
- Use `setOperationAction` to mark `ISD::STORE` of v4f16, v8f16 and v16f16 as Custom so
`TargetTransformInfo::getStoreMinimumVF` reports them as acceptable.
- Add missing cost entries to `X86TTIImpl::getCastInstrCost` for
conversions from/to fp16. Note that conversion from f64 to f16 is not
supported by an X86 instruction.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +
.../lib/Target/X86/X86TargetTransformInfo.cpp | 25 +
.../SLPVectorizer/X86/conversion-fp16.ll | 601 ++++++++++++++++++
3 files changed, 632 insertions(+)
create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bcb84add65d83e..da88a1a0a5a3b8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1714,6 +1714,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
}
+ // trunc+store via vcvtps2ph
+ setOperationAction(ISD::STORE, MVT::v4f16, Custom);
+ setOperationAction(ISD::STORE, MVT::v8f16, Custom);
}
// This block controls legalization of the mask vector sizes that are
@@ -1784,6 +1787,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ // trunc+store via vcvtps2ph
+ setOperationAction(ISD::STORE, MVT::v16f16, Custom);
}
if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 413ef0136d5c06..2d2c804ed46e54 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2296,7 +2296,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
{ ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
+ { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
+ { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
+ { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
@@ -2973,6 +2976,14 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
};
+ static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
+ { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } }, // vcvtps2ph
+ { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } }, // vcvtps2ph
+ { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } }, // vcvtph2ps
+ { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } }, // vcvtph2ps
+ { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
+ };
+
// Attempt to map directly to (simple) MVT types to let us match custom entries.
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
@@ -3034,6 +3045,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
return *KindCost;
}
+ if (ST->hasF16C()) {
+ if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ if (auto KindCost = Entry->Cost[CostKind])
+ return *KindCost;
+ }
+
if (ST->hasSSE41()) {
if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
@@ -3107,6 +3125,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (auto KindCost = Entry->Cost[CostKind])
return std::max(LTSrc.first, LTDest.first) * *KindCost;
+ if (ST->hasF16C()) {
+ if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
+ LTDest.second, LTSrc.second))
+ if (auto KindCost = Entry->Cost[CostKind])
+ return std::max(LTSrc.first, LTDest.first) * *KindCost;
+ }
+
if (ST->hasSSE41())
if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
LTDest.second, LTSrc.second))
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
new file mode 100644
index 00000000000000..1d5dee6cb8121c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
@@ -0,0 +1,601 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=x86_64-- -passes=slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -mtriple=x86_64-- -passes=slp-vectorizer -S -mattr=+avx2 -mattr=+f16c | FileCheck %s --check-prefix=CHECK-F16C
+; RUN: opt < %s -mtriple=x86_64-- -passes=slp-vectorizer -S -mattr=+avx512f | FileCheck %s --check-prefix=CHECK-AVX512
+
+define void @fpext_v4xf16_v4xf32(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpext_v4xf16_v4xf32(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1
+; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2
+; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 3
+; CHECK-NEXT: [[L0:%.*]] = load half, ptr [[S0]], align 2
+; CHECK-NEXT: [[L1:%.*]] = load half, ptr [[S1]], align 2
+; CHECK-NEXT: [[L2:%.*]] = load half, ptr [[S2]], align 2
+; CHECK-NEXT: [[L3:%.*]] = load half, ptr [[S3]], align 2
+; CHECK-NEXT: [[E0:%.*]] = fpext half [[L0]] to float
+; CHECK-NEXT: [[E1:%.*]] = fpext half [[L1]] to float
+; CHECK-NEXT: [[E2:%.*]] = fpext half [[L2]] to float
+; CHECK-NEXT: [[E3:%.*]] = fpext half [[L3]] to float
+; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 1
+; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 2
+; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 3
+; CHECK-NEXT: store float [[E0]], ptr [[D0]], align 8
+; CHECK-NEXT: store float [[E1]], ptr [[D1]], align 8
+; CHECK-NEXT: store float [[E2]], ptr [[D2]], align 8
+; CHECK-NEXT: store float [[E3]], ptr [[D3]], align 8
+; CHECK-NEXT: ret void
+;
+; CHECK-F16C-LABEL: define void @fpext_v4xf16_v4xf32(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2
+; CHECK-F16C-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x float>
+; CHECK-F16C-NEXT: store <4 x float> [[TMP2]], ptr [[D0]], align 8
+; CHECK-F16C-NEXT: ret void
+;
+; CHECK-AVX512-LABEL: define void @fpext_v4xf16_v4xf32(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2
+; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x float>
+; CHECK-AVX512-NEXT: store <4 x float> [[TMP2]], ptr [[D0]], align 8
+; CHECK-AVX512-NEXT: ret void
+;
+ %s1 = getelementptr inbounds half, ptr %s0, i64 1
+ %s2 = getelementptr inbounds half, ptr %s0, i64 2
+ %s3 = getelementptr inbounds half, ptr %s0, i64 3
+ %l0 = load half, ptr %s0, align 2
+ %l1 = load half, ptr %s1, align 2
+ %l2 = load half, ptr %s2, align 2
+ %l3 = load half, ptr %s3, align 2
+
+ %e0 = fpext half %l0 to float
+ %e1 = fpext half %l1 to float
+ %e2 = fpext half %l2 to float
+ %e3 = fpext half %l3 to float
+
+ %d1 = getelementptr inbounds float, ptr %d0, i64 1
+ %d2 = getelementptr inbounds float, ptr %d0, i64 2
+ %d3 = getelementptr inbounds float, ptr %d0, i64 3
+ store float %e0, ptr %d0, align 8
+ store float %e1, ptr %d1, align 8
+ store float %e2, ptr %d2, align 8
+ store float %e3, ptr %d3, align 8
+ ret void
+}
+
+define void @fpext_v4xf16_v4xf64(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpext_v4xf16_v4xf64(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1
+; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2
+; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 3
+; CHECK-NEXT: [[L0:%.*]] = load half, ptr [[S0]], align 2
+; CHECK-NEXT: [[L1:%.*]] = load half, ptr [[S1]], align 2
+; CHECK-NEXT: [[L2:%.*]] = load half, ptr [[S2]], align 2
+; CHECK-NEXT: [[L3:%.*]] = load half, ptr [[S3]], align 2
+; CHECK-NEXT: [[E0:%.*]] = fpext half [[L0]] to double
+; CHECK-NEXT: [[E1:%.*]] = fpext half [[L1]] to double
+; CHECK-NEXT: [[E2:%.*]] = fpext half [[L2]] to double
+; CHECK-NEXT: [[E3:%.*]] = fpext half [[L3]] to double
+; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds double, ptr [[D0]], i64 1
+; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds double, ptr [[D0]], i64 2
+; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds double, ptr [[D0]], i64 3
+; CHECK-NEXT: store double [[E0]], ptr [[D0]], align 8
+; CHECK-NEXT: store double [[E1]], ptr [[D1]], align 8
+; CHECK-NEXT: store double [[E2]], ptr [[D2]], align 8
+; CHECK-NEXT: store double [[E3]], ptr [[D3]], align 8
+; CHECK-NEXT: ret void
+;
+; CHECK-F16C-LABEL: define void @fpext_v4xf16_v4xf64(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2
+; CHECK-F16C-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x double>
+; CHECK-F16C-NEXT: store <4 x double> [[TMP2]], ptr [[D0]], align 8
+; CHECK-F16C-NEXT: ret void
+;
+; CHECK-AVX512-LABEL: define void @fpext_v4xf16_v4xf64(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2
+; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x double>
+; CHECK-AVX512-NEXT: store <4 x double> [[TMP2]], ptr [[D0]], align 8
+; CHECK-AVX512-NEXT: ret void
+;
+ %s1 = getelementptr inbounds half, ptr %s0, i64 1
+ %s2 = getelementptr inbounds half, ptr %s0, i64 2
+ %s3 = getelementptr inbounds half, ptr %s0, i64 3
+ %l0 = load half, ptr %s0, align 2
+ %l1 = load half, ptr %s1, align 2
+ %l2 = load half, ptr %s2, align 2
+ %l3 = load half, ptr %s3, align 2
+
+ %e0 = fpext half %l0 to double
+ %e1 = fpext half %l1 to double
+ %e2 = fpext half %l2 to double
+ %e3 = fpext half %l3 to double
+
+ %d1 = getelementptr inbounds double, ptr %d0, i64 1
+ %d2 = getelementptr inbounds double, ptr %d0, i64 2
+ %d3 = getelementptr inbounds double, ptr %d0, i64 3
+ store double %e0, ptr %d0, align 8
+ store double %e1, ptr %d1, align 8
+ store double %e2, ptr %d2, align 8
+ store double %e3, ptr %d3, align 8
+ ret void
+}
+
+define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1
+; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2
+; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 3
+; CHECK-NEXT: [[S4:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 4
+; CHECK-NEXT: [[S5:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 5
+; CHECK-NEXT: [[S6:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 6
+; CHECK-NEXT: [[S7:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 7
+; CHECK-NEXT: [[S8:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 8
+; CHECK-NEXT: [[S9:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 9
+; CHECK-NEXT: [[S10:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 10
+; CHECK-NEXT: [[S11:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 11
+; CHECK-NEXT: [[S12:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 12
+; CHECK-NEXT: [[S13:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 13
+; CHECK-NEXT: [[S14:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 14
+; CHECK-NEXT: [[S15:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 15
+; CHECK-NEXT: [[L0:%.*]] = load half, ptr [[S0]], align 2
+; CHECK-NEXT: [[L1:%.*]] = load half, ptr [[S1]], align 2
+; CHECK-NEXT: [[L2:%.*]] = load half, ptr [[S2]], align 2
+; CHECK-NEXT: [[L3:%.*]] = load half, ptr [[S3]], align 2
+; CHECK-NEXT: [[L4:%.*]] = load half, ptr [[S4]], align 2
+; CHECK-NEXT: [[L5:%.*]] = load half, ptr [[S5]], align 2
+; CHECK-NEXT: [[L6:%.*]] = load half, ptr [[S6]], align 2
+; CHECK-NEXT: [[L7:%.*]] = load half, ptr [[S7]], align 2
+; CHECK-NEXT: [[L8:%.*]] = load half, ptr [[S8]], align 2
+; CHECK-NEXT: [[L9:%.*]] = load half, ptr [[S9]], align 2
+; CHECK-NEXT: [[L10:%.*]] = load half, ptr [[S10]], align 2
+; CHECK-NEXT: [[L11:%.*]] = load half, ptr [[S11]], align 2
+; CHECK-NEXT: [[L12:%.*]] = load half, ptr [[S12]], align 2
+; CHECK-NEXT: [[L13:%.*]] = load half, ptr [[S13]], align 2
+; CHECK-NEXT: [[L14:%.*]] = load half, ptr [[S14]], align 2
+; CHECK-NEXT: [[L15:%.*]] = load half, ptr [[S15]], align 2
+; CHECK-NEXT: [[E0:%.*]] = fpext half [[L0]] to float
+; CHECK-NEXT: [[E1:%.*]] = fpext half [[L1]] to float
+; CHECK-NEXT: [[E2:%.*]] = fpext half [[L2]] to float
+; CHECK-NEXT: [[E3:%.*]] = fpext half [[L3]] to float
+; CHECK-NEXT: [[E4:%.*]] = fpext half [[L4]] to float
+; CHECK-NEXT: [[E5:%.*]] = fpext half [[L5]] to float
+; CHECK-NEXT: [[E6:%.*]] = fpext half [[L6]] to float
+; CHECK-NEXT: [[E7:%.*]] = fpext half [[L7]] to float
+; CHECK-NEXT: [[E8:%.*]] = fpext half [[L8]] to float
+; CHECK-NEXT: [[E9:%.*]] = fpext half [[L9]] to float
+; CHECK-NEXT: [[E10:%.*]] = fpext half [[L10]] to float
+; CHECK-NEXT: [[E11:%.*]] = fpext half [[L11]] to float
+; CHECK-NEXT: [[E12:%.*]] = fpext half [[L12]] to float
+; CHECK-NEXT: [[E13:%.*]] = fpext half [[L13]] to float
+; CHECK-NEXT: [[E14:%.*]] = fpext half [[L14]] to float
+; CHECK-NEXT: [[E15:%.*]] = fpext half [[L15]] to float
+; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 1
+; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 2
+; CHECK-NEXT: [[D15:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 3
+; CHECK-NEXT: [[D4:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 4
+; CHECK-NEXT: [[D5:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 5
+; CHECK-NEXT: [[D6:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 6
+; CHECK-NEXT: [[D7:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 7
+; CHECK-NEXT: [[D8:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 8
+; CHECK-NEXT: [[D9:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 9
+; CHECK-NEXT: [[D10:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 10
+; CHECK-NEXT: [[D11:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 11
+; CHECK-NEXT: [[D12:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 12
+; CHECK-NEXT: [[D13:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 13
+; CHECK-NEXT: [[D14:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 14
+; CHECK-NEXT: [[D16:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 15
+; CHECK-NEXT: store float [[E0]], ptr [[D0]], align 8
+; CHECK-NEXT: store float [[E1]], ptr [[D1]], align 8
+; CHECK-NEXT: store float [[E2]], ptr [[D2]], align 8
+; CHECK-NEXT: store float [[E3]], ptr [[D15]], align 8
+; CHECK-NEXT: store float [[E4]], ptr [[D4]], align 8
+; CHECK-NEXT: store float [[E5]], ptr [[D5]], align 8
+; CHECK-NEXT: store float [[E6]], ptr [[D6]], align 8
+; CHECK-NEXT: store float [[E7]], ptr [[D7]], align 8
+; CHECK-NEXT: store float [[E8]], ptr [[D8]], align 8
+; CHECK-NEXT: store float [[E9]], ptr [[D9]], align 8
+; CHECK-NEXT: store float [[E10]], ptr [[D10]], align 8
+; CHECK-NEXT: store float [[E11]], ptr [[D11]], align 8
+; CHECK-NEXT: store float [[E12]], ptr [[D12]], align 8
+; CHECK-NEXT: store float [[E13]], ptr [[D13]], align 8
+; CHECK-NEXT: store float [[E14]], ptr [[D14]], align 8
+; CHECK-NEXT: store float [[E15]], ptr [[D16]], align 8
+; CHECK-NEXT: ret void
+;
+; CHECK-F16C-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT: [[S8:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 8
+; CHECK-F16C-NEXT: [[D8:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 8
+; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[S0]], align 2
+; CHECK-F16C-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[TMP1]] to <8 x float>
+; CHECK-F16C-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[S8]], align 2
+; CHECK-F16C-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[TMP3]] to <8 x float>
+; CHECK-F16C-NEXT: store <8 x float> [[TMP2]], ptr [[D0]], align 8
+; CHECK-F16C-NEXT: store <8 x float> [[TMP4]], ptr [[D8]], align 8
+; CHECK-F16C-NEXT: ret void
+;
+; CHECK-AVX512-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[S0]], align 2
+; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fpext <16 x half> [[TMP1]] to <16 x float>
+; CHECK-AVX512-NEXT: store <16 x float> [[TMP2]], ptr [[D0]], align 8
+; CHECK-AVX512-NEXT: ret void
+;
+ %s1 = getelementptr inbounds half, ptr %s0, i64 1
+ %s2 = getelementptr inbounds half, ptr %s0, i64 2
+ %s3 = getelementptr inbounds half, ptr %s0, i64 3
+ %s4 = getelementptr inbounds half, ptr %s0, i64 4
+ %s5 = getelementptr inbounds half, ptr %s0, i64 5
+ %s6 = getelementptr inbounds half, ptr %s0, i64 6
+ %s7 = getelementptr inbounds half, ptr %s0, i64 7
+ %s8 = getelementptr inbounds half, ptr %s0, i64 8
+ %s9 = getelementptr inbounds half, ptr %s0, i64 9
+ %s10 = getelementptr inbounds half, ptr %s0, i64 10
+ %s11 = getelementptr inbounds half, ptr %s0, i64 11
+ %s12 = getelementptr inbounds half, ptr %s0, i64 12
+ %s13 = getelementptr inbounds half, ptr %s0, i64 13
+ %s14 = getelementptr inbounds half, ptr %s0, i64 14
+ %s15 = getelementptr inbounds half, ptr %s0, i64 15
+ %l0 = load half, ptr %s0, align 2
+ %l1 = load half, ptr %s1, align 2
+ %l2 = load half, ptr %s2, align 2
+ %l3 = load half, ptr %s3, align 2
+ %l4 = load half, ptr %s4, align 2
+ %l5 = load half, ptr %s5, align 2
+ %l6 = load half, ptr %s6, align 2
+ %l7 = load half, ptr %s7, align 2
+ %l8 = load half, ptr %s8, align 2
+ %l9 = load half, ptr %s9, align 2
+ %l10 = load half, ptr %s10, align 2
+ %l11 = load half, ptr %s11, align 2
+ %l12 = load half, ptr %s12, align 2
+ %l13 = load half, ptr %s13, align 2
+ %l14 = load half, ptr %s14, align 2
+ %l15 = load half, ptr %s15, align 2
+
+ %e0 = fpext half %l0 to float
+ %e1 = fpext half %l1 to float
+ %e2 = fpext half %l2 to float
+ %e3 = fpext half %l3 to float
+ %e4 = fpext half %l4 to float
+ %e5 = fpext half %l5 to float
+ %e6 = fpext half %l6 to float
+ %e7 = fpext half %l7 to float
+ %e8 = fpext half %l8 to float
+ %e9 = fpext half %l9 to float
+ %e10 = fpext half %l10 to float
+ %e11 = fpext half %l11 to float
+ %e12 = fpext half %l12 to float
+ %e13 = fpext half %l13 to float
+ %e14 = fpext half %l14 to float
+ %e15 = fpext half %l15 to float
+
+ %d1 = getelementptr inbounds float, ptr %d0, i64 1
+ %d2 = getelementptr inbounds float, ptr %d0, i64 2
+ %d3 = getelementptr inbounds float, ptr %d0, i64 3
+ %d4 = getelementptr inbounds float, ptr %d0, i64 4
+ %d5 = getelementptr inbounds float, ptr %d0, i64 5
+ %d6 = getelementptr inbounds float, ptr %d0, i64 6
+ %d7 = getelementptr inbounds float, ptr %d0, i64 7
+ %d8 = getelementptr inbounds float, ptr %d0, i64 8
+ %d9 = getelementptr inbounds float, ptr %d0, i64 9
+ %d10 = getelementptr inbounds float, ptr %d0, i64 10
+ %d11 = getelementptr inbounds float, ptr %d0, i64 11
+ %d12 = getelementptr inbounds float, ptr %d0, i64 12
+ %d13 = getelementptr inbounds float, ptr %d0, i64 13
+ %d14 = getelementptr inbounds float, ptr %d0, i64 14
+ %d15 = getelementptr inbounds float, ptr %d0, i64 15
+ store float %e0, ptr %d0, align 8
+ store float %e1, ptr %d1, align 8
+ store float %e2, ptr %d2, align 8
+ store float %e3, ptr %d3, align 8
+ store float %e4, ptr %d4, align 8
+ store float %e5, ptr %d5, align 8
+ store float %e6, ptr %d6, align 8
+ store float %e7, ptr %d7, align 8
+ store float %e8, ptr %d8, align 8
+ store float %e9, ptr %d9, align 8
+ store float %e10, ptr %d10, align 8
+ store float %e11, ptr %d11, align 8
+ store float %e12, ptr %d12, align 8
+ store float %e13, ptr %d13, align 8
+ store float %e14, ptr %d14, align 8
+ store float %e15, ptr %d15, align 8
+ ret void
+}
+
+define void @fpround_v4xf32_v4xf16(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpround_v4xf32_v4xf16(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 1
+; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 2
+; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 3
+; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[S0]], align 4
+; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[S1]], align 4
+; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[S2]], align 4
+; CHECK-NEXT: [[L3:%.*]] = load float, ptr [[S3]], align 4
+; CHECK-NEXT: [[T0:%.*]] = fptrunc float [[L0]] to half
+; CHECK-NEXT: [[T1:%.*]] = fptrunc float [[L1]] to half
+; CHECK-NEXT: [[T2:%.*]] = fptrunc float [[L2]] to half
+; CHECK-NEXT: [[T3:%.*]] = fptrunc float [[L3]] to half
+; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 2
+; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 3
+; CHECK-NEXT: store half [[T0]], ptr [[D0]], align 2
+; CHECK-NEXT: store half [[T1]], ptr [[D1]], align 2
+; CHECK-NEXT: store half [[T2]], ptr [[D2]], align 2
+; CHECK-NEXT: store half [[T3]], ptr [[D3]], align 2
+; CHECK-NEXT: ret void
+;
+; CHECK-F16C-LABEL: define void @fpround_v4xf32_v4xf16(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[S0]], align 4
+; CHECK-F16C-NEXT: [[TMP2:%.*]] = fptrunc <4 x float> [[TMP1]] to <4 x half>
+; CHECK-F16C-NEXT: store <4 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT: ret void
+;
+; CHECK-AVX512-LABEL: define void @fpround_v4xf32_v4xf16(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[S0]], align 4
+; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fptrunc <4 x float> [[TMP1]] to <4 x half>
+; CHECK-AVX512-NEXT: store <4 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-AVX512-NEXT: ret void
+;
+ %s1 = getelementptr inbounds float, ptr %s0, i64 1
+ %s2 = getelementptr inbounds float, ptr %s0, i64 2
+ %s3 = getelementptr inbounds float, ptr %s0, i64 3
+ %l0 = load float, ptr %s0, align 4
+ %l1 = load float, ptr %s1, align 4
+ %l2 = load float, ptr %s2, align 4
+ %l3 = load float, ptr %s3, align 4
+
+ %t0 = fptrunc float %l0 to half
+ %t1 = fptrunc float %l1 to half
+ %t2 = fptrunc float %l2 to half
+ %t3 = fptrunc float %l3 to half
+
+ %d1 = getelementptr inbounds half, ptr %d0, i64 1
+ %d2 = getelementptr inbounds half, ptr %d0, i64 2
+ %d3 = getelementptr inbounds half, ptr %d0, i64 3
+ store half %t0, ptr %d0, align 2
+ store half %t1, ptr %d1, align 2
+ store half %t2, ptr %d2, align 2
+ store half %t3, ptr %d3, align 2
+ ret void
+}
+
+define void @fpround_v16xf32_v16xf16(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpround_v16xf32_v16xf16(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 1
+; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 2
+; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 3
+; CHECK-NEXT: [[S4:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 4
+; CHECK-NEXT: [[S5:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 5
+; CHECK-NEXT: [[S6:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 6
+; CHECK-NEXT: [[S7:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 7
+; CHECK-NEXT: [[S8:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 8
+; CHECK-NEXT: [[S9:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 9
+; CHECK-NEXT: [[S10:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 10
+; CHECK-NEXT: [[S11:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 11
+; CHECK-NEXT: [[S12:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 12
+; CHECK-NEXT: [[S13:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 13
+; CHECK-NEXT: [[S14:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 14
+; CHECK-NEXT: [[S15:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 15
+; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[S0]], align 4
+; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[S1]], align 4
+; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[S2]], align 4
+; CHECK-NEXT: [[L3:%.*]] = load float, ptr [[S3]], align 4
+; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[S4]], align 4
+; CHECK-NEXT: [[L5:%.*]] = load float, ptr [[S5]], align 4
+; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[S6]], align 4
+; CHECK-NEXT: [[L7:%.*]] = load float, ptr [[S7]], align 4
+; CHECK-NEXT: [[L8:%.*]] = load float, ptr [[S8]], align 4
+; CHECK-NEXT: [[L9:%.*]] = load float, ptr [[S9]], align 4
+; CHECK-NEXT: [[L10:%.*]] = load float, ptr [[S10]], align 4
+; CHECK-NEXT: [[L11:%.*]] = load float, ptr [[S11]], align 4
+; CHECK-NEXT: [[L12:%.*]] = load float, ptr [[S12]], align 4
+; CHECK-NEXT: [[L13:%.*]] = load float, ptr [[S13]], align 4
+; CHECK-NEXT: [[L14:%.*]] = load float, ptr [[S14]], align 4
+; CHECK-NEXT: [[L15:%.*]] = load float, ptr [[S15]], align 4
+; CHECK-NEXT: [[T0:%.*]] = fptrunc float [[L0]] to half
+; CHECK-NEXT: [[T1:%.*]] = fptrunc float [[L1]] to half
+; CHECK-NEXT: [[T2:%.*]] = fptrunc float [[L2]] to half
+; CHECK-NEXT: [[T3:%.*]] = fptrunc float [[L3]] to half
+; CHECK-NEXT: [[T4:%.*]] = fptrunc float [[L4]] to half
+; CHECK-NEXT: [[T5:%.*]] = fptrunc float [[L5]] to half
+; CHECK-NEXT: [[T6:%.*]] = fptrunc float [[L6]] to half
+; CHECK-NEXT: [[T7:%.*]] = fptrunc float [[L7]] to half
+; CHECK-NEXT: [[T8:%.*]] = fptrunc float [[L8]] to half
+; CHECK-NEXT: [[T9:%.*]] = fptrunc float [[L9]] to half
+; CHECK-NEXT: [[T10:%.*]] = fptrunc float [[L10]] to half
+; CHECK-NEXT: [[T11:%.*]] = fptrunc float [[L11]] to half
+; CHECK-NEXT: [[T12:%.*]] = fptrunc float [[L12]] to half
+; CHECK-NEXT: [[T13:%.*]] = fptrunc float [[L13]] to half
+; CHECK-NEXT: [[T14:%.*]] = fptrunc float [[L14]] to half
+; CHECK-NEXT: [[T15:%.*]] = fptrunc float [[L15]] to half
+; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 2
+; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 3
+; CHECK-NEXT: [[D4:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 4
+; CHECK-NEXT: [[D5:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 5
+; CHECK-NEXT: [[D6:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 6
+; CHECK-NEXT: [[D7:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 7
+; CHECK-NEXT: [[D8:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 8
+; CHECK-NEXT: [[D9:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 9
+; CHECK-NEXT: [[D10:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 10
+; CHECK-NEXT: [[D11:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 11
+; CHECK-NEXT: [[D12:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 12
+; CHECK-NEXT: [[D13:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 13
+; CHECK-NEXT: [[D14:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 14
+; CHECK-NEXT: [[D15:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 15
+; CHECK-NEXT: store half [[T0]], ptr [[D0]], align 2
+; CHECK-NEXT: store half [[T1]], ptr [[D1]], align 2
+; CHECK-NEXT: store half [[T2]], ptr [[D2]], align 2
+; CHECK-NEXT: store half [[T3]], ptr [[D3]], align 2
+; CHECK-NEXT: store half [[T4]], ptr [[D4]], align 2
+; CHECK-NEXT: store half [[T5]], ptr [[D5]], align 2
+; CHECK-NEXT: store half [[T6]], ptr [[D6]], align 2
+; CHECK-NEXT: store half [[T7]], ptr [[D7]], align 2
+; CHECK-NEXT: store half [[T8]], ptr [[D8]], align 2
+; CHECK-NEXT: store half [[T9]], ptr [[D9]], align 2
+; CHECK-NEXT: store half [[T10]], ptr [[D10]], align 2
+; CHECK-NEXT: store half [[T11]], ptr [[D11]], align 2
+; CHECK-NEXT: store half [[T12]], ptr [[D12]], align 2
+; CHECK-NEXT: store half [[T13]], ptr [[D13]], align 2
+; CHECK-NEXT: store half [[T14]], ptr [[D14]], align 2
+; CHECK-NEXT: store half [[T15]], ptr [[D15]], align 2
+; CHECK-NEXT: ret void
+;
+; CHECK-F16C-LABEL: define void @fpround_v16xf32_v16xf16(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[S0]], align 4
+; CHECK-F16C-NEXT: [[TMP2:%.*]] = fptrunc <16 x float> [[TMP1]] to <16 x half>
+; CHECK-F16C-NEXT: store <16 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT: ret void
+;
+; CHECK-AVX512-LABEL: define void @fpround_v16xf32_v16xf16(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[S0]], align 4
+; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fptrunc <16 x float> [[TMP1]] to <16 x half>
+; CHECK-AVX512-NEXT: store <16 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-AVX512-NEXT: ret void
+;
+ %s1 = getelementptr inbounds float, ptr %s0, i64 1
+ %s2 = getelementptr inbounds float, ptr %s0, i64 2
+ %s3 = getelementptr inbounds float, ptr %s0, i64 3
+ %s4 = getelementptr inbounds float, ptr %s0, i64 4
+ %s5 = getelementptr inbounds float, ptr %s0, i64 5
+ %s6 = getelementptr inbounds float, ptr %s0, i64 6
+ %s7 = getelementptr inbounds float, ptr %s0, i64 7
+ %s8 = getelementptr inbounds float, ptr %s0, i64 8
+ %s9 = getelementptr inbounds float, ptr %s0, i64 9
+ %s10 = getelementptr inbounds float, ptr %s0, i64 10
+ %s11 = getelementptr inbounds float, ptr %s0, i64 11
+ %s12 = getelementptr inbounds float, ptr %s0, i64 12
+ %s13 = getelementptr inbounds float, ptr %s0, i64 13
+ %s14 = getelementptr inbounds float, ptr %s0, i64 14
+ %s15 = getelementptr inbounds float, ptr %s0, i64 15
+ %l0 = load float, ptr %s0, align 4
+ %l1 = load float, ptr %s1, align 4
+ %l2 = load float, ptr %s2, align 4
+ %l3 = load float, ptr %s3, align 4
+ %l4 = load float, ptr %s4, align 4
+ %l5 = load float, ptr %s5, align 4
+ %l6 = load float, ptr %s6, align 4
+ %l7 = load float, ptr %s7, align 4
+ %l8 = load float, ptr %s8, align 4
+ %l9 = load float, ptr %s9, align 4
+ %l10 = load float, ptr %s10, align 4
+ %l11 = load float, ptr %s11, align 4
+ %l12 = load float, ptr %s12, align 4
+ %l13 = load float, ptr %s13, align 4
+ %l14 = load float, ptr %s14, align 4
+ %l15 = load float, ptr %s15, align 4
+
+ %t0 = fptrunc float %l0 to half
+ %t1 = fptrunc float %l1 to half
+ %t2 = fptrunc float %l2 to half
+ %t3 = fptrunc float %l3 to half
+ %t4 = fptrunc float %l4 to half
+ %t5 = fptrunc float %l5 to half
+ %t6 = fptrunc float %l6 to half
+ %t7 = fptrunc float %l7 to half
+ %t8 = fptrunc float %l8 to half
+ %t9 = fptrunc float %l9 to half
+ %t10 = fptrunc float %l10 to half
+ %t11 = fptrunc float %l11 to half
+ %t12 = fptrunc float %l12 to half
+ %t13 = fptrunc float %l13 to half
+ %t14 = fptrunc float %l14 to half
+ %t15 = fptrunc float %l15 to half
+
+ %d1 = getelementptr inbounds half, ptr %d0, i64 1
+ %d2 = getelementptr inbounds half, ptr %d0, i64 2
+ %d3 = getelementptr inbounds half, ptr %d0, i64 3
+ %d4 = getelementptr inbounds half, ptr %d0, i64 4
+ %d5 = getelementptr inbounds half, ptr %d0, i64 5
+ %d6 = getelementptr inbounds half, ptr %d0, i64 6
+ %d7 = getelementptr inbounds half, ptr %d0, i64 7
+ %d8 = getelementptr inbounds half, ptr %d0, i64 8
+ %d9 = getelementptr inbounds half, ptr %d0, i64 9
+ %d10 = getelementptr inbounds half, ptr %d0, i64 10
+ %d11 = getelementptr inbounds half, ptr %d0, i64 11
+ %d12 = getelementptr inbounds half, ptr %d0, i64 12
+ %d13 = getelementptr inbounds half, ptr %d0, i64 13
+ %d14 = getelementptr inbounds half, ptr %d0, i64 14
+ %d15 = getelementptr inbounds half, ptr %d0, i64 15
+ store half %t0, ptr %d0, align 2
+ store half %t1, ptr %d1, align 2
+ store half %t2, ptr %d2, align 2
+ store half %t3, ptr %d3, align 2
+ store half %t4, ptr %d4, align 2
+ store half %t5, ptr %d5, align 2
+ store half %t6, ptr %d6, align 2
+ store half %t7, ptr %d7, align 2
+ store half %t8, ptr %d8, align 2
+ store half %t9, ptr %d9, align 2
+ store half %t10, ptr %d10, align 2
+ store half %t11, ptr %d11, align 2
+ store half %t12, ptr %d12, align 2
+ store half %t13, ptr %d13, align 2
+ store half %t14, ptr %d14, align 2
+ store half %t15, ptr %d15, align 2
+ ret void
+
+}
+
+; There is no instruction to round f64 to f16; this should not get vectorized!
+define void @fpround_v2xf64_v2xf16(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpround_v2xf64_v2xf16(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds double, ptr [[S0]], i64 1
+; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[S0]], align 4
+; CHECK-NEXT: [[L1:%.*]] = load double, ptr [[S1]], align 4
+; CHECK-NEXT: [[T0:%.*]] = fptrunc double [[L0]] to half
+; CHECK-NEXT: [[T1:%.*]] = fptrunc double [[L1]] to half
+; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-NEXT: store half [[T0]], ptr [[D0]], align 2
+; CHECK-NEXT: store half [[T1]], ptr [[D1]], align 2
+; CHECK-NEXT: ret void
+;
+; CHECK-F16C-LABEL: define void @fpround_v2xf64_v2xf16(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT: [[S1:%.*]] = getelementptr inbounds double, ptr [[S0]], i64 1
+; CHECK-F16C-NEXT: [[L0:%.*]] = load double, ptr [[S0]], align 4
+; CHECK-F16C-NEXT: [[L1:%.*]] = load double, ptr [[S1]], align 4
+; CHECK-F16C-NEXT: [[T0:%.*]] = fptrunc double [[L0]] to half
+; CHECK-F16C-NEXT: [[T1:%.*]] = fptrunc double [[L1]] to half
+; CHECK-F16C-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-F16C-NEXT: store half [[T0]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT: store half [[T1]], ptr [[D1]], align 2
+; CHECK-F16C-NEXT: ret void
+;
+; CHECK-AVX512-LABEL: define void @fpround_v2xf64_v2xf16(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT: [[S1:%.*]] = getelementptr inbounds double, ptr [[S0]], i64 1
+; CHECK-AVX512-NEXT: [[L0:%.*]] = load double, ptr [[S0]], align 4
+; CHECK-AVX512-NEXT: [[L1:%.*]] = load double, ptr [[S1]], align 4
+; CHECK-AVX512-NEXT: [[T0:%.*]] = fptrunc double [[L0]] to half
+; CHECK-AVX512-NEXT: [[T1:%.*]] = fptrunc double [[L1]] to half
+; CHECK-AVX512-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-AVX512-NEXT: store half [[T0]], ptr [[D0]], align 2
+; CHECK-AVX512-NEXT: store half [[T1]], ptr [[D1]], align 2
+; CHECK-AVX512-NEXT: ret void
+;
+ %s1 = getelementptr inbounds double, ptr %s0, i64 1
+ %l0 = load double, ptr %s0, align 4
+ %l1 = load double, ptr %s1, align 4
+
+ %t0 = fptrunc double %l0 to half
+ %t1 = fptrunc double %l1 to half
+
+ %d1 = getelementptr inbounds half, ptr %d0, i64 1
+ store half %t0, ptr %d0, align 2
+ store half %t1, ptr %d1, align 2
+ ret void
+}
From 84a22ef1a5ced6fc2383c38b99fb040df17db76e Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Tue, 22 Oct 2024 17:19:29 -0700
Subject: [PATCH 2/2] Override X86TTIImpl::getStoreMinimumVF instead of tweaking codegen tables.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 6 -----
.../lib/Target/X86/X86TargetTransformInfo.cpp | 24 +++++++++++++++----
llvm/lib/Target/X86/X86TargetTransformInfo.h | 3 +++
.../SLPVectorizer/X86/conversion-fp16.ll | 19 +++++++++------
4 files changed, 35 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index da88a1a0a5a3b8..bcb84add65d83e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1714,9 +1714,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
}
- // trunc+store via vcvtps2ph
- setOperationAction(ISD::STORE, MVT::v4f16, Custom);
- setOperationAction(ISD::STORE, MVT::v8f16, Custom);
}
// This block controls legalization of the mask vector sizes that are
@@ -1787,9 +1784,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
- // trunc+store via vcvtps2ph
- setOperationAction(ISD::STORE, MVT::v16f16, Custom);
}
if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 2d2c804ed46e54..bae223243b3dc9 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2977,10 +2977,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
};
static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
- { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } }, // vcvtps2ph
- { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } }, // vcvtps2ph
- { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } }, // vcvtph2ps
- { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } }, // vcvtph2ps
+ { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
+ { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
+ { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
+ { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
+ { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
+ { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
+ { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
{ ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
};
@@ -3171,6 +3174,11 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
TTI::CastContextHint::None, CostKind);
}
+ if (ISD == ISD::FP_ROUND && LTDest.second.getScalarType() == MVT::f16) {
+ // Conversion requires a libcall.
+ return InstructionCost::getInvalid();
+ }
+
// TODO: Allow non-throughput costs that aren't binary.
auto AdjustCost = [&CostKind](InstructionCost Cost,
InstructionCost N = 1) -> InstructionCost {
@@ -6948,6 +6956,14 @@ bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
return true;
}
+unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+ Type *ScalarValTy) const {
+ if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
+ return 4;
+ }
+ return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
+}
+
bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const {
using namespace llvm::PatternMatch;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 0100f328ab4bd3..36d00cee0d18b5 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -302,6 +302,9 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
bool isVectorShiftByScalarCheap(Type *Ty) const;
+ unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+ Type *ScalarValTy) const;
+
private:
bool supportsGather() const;
InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
index 1d5dee6cb8121c..bcea147d724f53 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
@@ -123,8 +123,8 @@ define void @fpext_v4xf16_v4xf64(ptr %s0, ptr %d0) {
ret void
}
-define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
-; CHECK-LABEL: define void @fpext_v16xf15_v16xf32(
+define void @fpext_v16xf16_v16xf32(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpext_v16xf16_v16xf32(
; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1
; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2
@@ -206,7 +206,7 @@ define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
; CHECK-NEXT: store float [[E15]], ptr [[D16]], align 8
; CHECK-NEXT: ret void
;
-; CHECK-F16C-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-F16C-LABEL: define void @fpext_v16xf16_v16xf32(
; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
; CHECK-F16C-NEXT: [[S8:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 8
; CHECK-F16C-NEXT: [[D8:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 8
@@ -218,7 +218,7 @@ define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
; CHECK-F16C-NEXT: store <8 x float> [[TMP4]], ptr [[D8]], align 8
; CHECK-F16C-NEXT: ret void
;
-; CHECK-AVX512-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-AVX512-LABEL: define void @fpext_v16xf16_v16xf32(
; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[S0]], align 2
; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fpext <16 x half> [[TMP1]] to <16 x float>
@@ -453,9 +453,14 @@ define void @fpround_v16xf32_v16xf16(ptr %s0, ptr %d0) {
;
; CHECK-F16C-LABEL: define void @fpround_v16xf32_v16xf16(
; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
-; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[S0]], align 4
-; CHECK-F16C-NEXT: [[TMP2:%.*]] = fptrunc <16 x float> [[TMP1]] to <16 x half>
-; CHECK-F16C-NEXT: store <16 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT: [[S8:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 8
+; CHECK-F16C-NEXT: [[D8:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 8
+; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[S0]], align 4
+; CHECK-F16C-NEXT: [[TMP2:%.*]] = fptrunc <8 x float> [[TMP1]] to <8 x half>
+; CHECK-F16C-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr [[S8]], align 4
+; CHECK-F16C-NEXT: [[TMP4:%.*]] = fptrunc <8 x float> [[TMP3]] to <8 x half>
+; CHECK-F16C-NEXT: store <8 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT: store <8 x half> [[TMP4]], ptr [[D8]], align 2
; CHECK-F16C-NEXT: ret void
;
; CHECK-AVX512-LABEL: define void @fpround_v16xf32_v16xf16(
More information about the llvm-commits mailing list