[llvm] X86: Improve cost model of fp16 conversion (PR #113195)

Matthias Braun via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 25 09:35:27 PDT 2024


https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/113195

>From 4f2a5293ea966245788cc0bc2b430510f2edca13 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 18 Oct 2024 16:24:57 -0700
Subject: [PATCH 1/2] X86: Improve cost model of fp16 conversion

Improve cost modeling of x86 __fp16 conversions so the SLPVectorizer
vectorizes fp16 conversion patterns; a source-level sketch follows the
list of changes:

- Mark `ISD::STORE` for v4f16, v8f16 and v16f16 as Custom via
  `setOperationAction` so that `TargetTransformInfo::getStoreMinimumVF`
  reports them as acceptable.
- Add missing cost entries to `X86TTIImpl::getCastInstrCost` for
  conversions from/to fp16. Note that conversion from f64 to f16 is not
  supported by an X86 instruction.
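
For illustration, a minimal source-level sketch of the kind of pattern
this enables (the function and parameter names are made up for this
example; `__fp16` is the Clang extension type, and the sketch assumes
compilation with -mf16c):

  // Four scalar half->float conversions: with the improved costs the
  // SLPVectorizer can turn these into a single <4 x half> load, a vector
  // fpext and a <4 x float> store, lowering to one vcvtph2ps.
  void fpext4(const __fp16 *s, float *d) {
    d[0] = s[0];
    d[1] = s[1];
    d[2] = s[2];
    d[3] = s[3];
  }

This mirrors the fpext_v4xf16_v4xf32 test added below.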
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   6 +
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  25 +
 .../SLPVectorizer/X86/conversion-fp16.ll      | 601 ++++++++++++++++++
 3 files changed, 632 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bcb84add65d83e..da88a1a0a5a3b8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1714,6 +1714,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
       setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
     }
+    // trunc+store via vcvtps2ph
+    setOperationAction(ISD::STORE, MVT::v4f16, Custom);
+    setOperationAction(ISD::STORE, MVT::v8f16, Custom);
   }
 
   // This block controls legalization of the mask vector sizes that are
@@ -1784,6 +1787,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+    // trunc+store via vcvtps2ph
+    setOperationAction(ISD::STORE, MVT::v16f16, Custom);
   }
   if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
     for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 413ef0136d5c06..2d2c804ed46e54 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2296,7 +2296,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,   { 1, 1, 1, 1 } },
     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32,  { 3, 1, 1, 1 } },
     { ISD::FP_EXTEND, MVT::v16f64,  MVT::v16f32,  { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
+    { ISD::FP_EXTEND, MVT::v16f32,  MVT::v16f16,  { 1, 1, 1, 1 } }, // vcvtph2ps
+    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f16,   { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
     { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,   { 1, 1, 1, 1 } },
+    { ISD::FP_ROUND,  MVT::v16f16,  MVT::v16f32,  { 1, 1, 1, 1 } }, // vcvtps2ph
 
     { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
     { ISD::TRUNCATE,  MVT::v4i1,    MVT::v4i8,    { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
@@ -2973,6 +2976,14 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::TRUNCATE,    MVT::v4i32,  MVT::v2i64,  { 1, 1, 1, 1 } }, // PSHUFD
   };
 
+  static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
+    { ISD::FP_ROUND,  MVT::v8f16,   MVT::v8f32,   { 1, 1, 1, 1 } }, // vcvtps2ph
+    { ISD::FP_ROUND,  MVT::v4f16,   MVT::v4f32,   { 1, 1, 1, 1 } }, // vcvtps2ph
+    { ISD::FP_EXTEND, MVT::v8f32,   MVT::v8f16,   { 1, 1, 1, 1 } }, // vcvtph2ps
+    { ISD::FP_EXTEND, MVT::v4f32,   MVT::v4f16,   { 1, 1, 1, 1 } }, // vcvtph2ps
+    { ISD::FP_EXTEND, MVT::v4f64,   MVT::v4f16,   { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
+  };
+
   // Attempt to map directly to (simple) MVT types to let us match custom entries.
   EVT SrcTy = TLI->getValueType(DL, Src);
   EVT DstTy = TLI->getValueType(DL, Dst);
@@ -3034,6 +3045,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
           return *KindCost;
     }
 
+    if (ST->hasF16C()) {
+      if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
+                                                     SimpleDstTy, SimpleSrcTy))
+        if (auto KindCost = Entry->Cost[CostKind])
+          return *KindCost;
+    }
+
     if (ST->hasSSE41()) {
       if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
                                                      SimpleDstTy, SimpleSrcTy))
@@ -3107,6 +3125,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
       if (auto KindCost = Entry->Cost[CostKind])
         return std::max(LTSrc.first, LTDest.first) * *KindCost;
 
+  if (ST->hasF16C()) {
+    if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
+                                                   LTDest.second, LTSrc.second))
+      if (auto KindCost = Entry->Cost[CostKind])
+        return std::max(LTSrc.first, LTDest.first) * *KindCost;
+  }
+
   if (ST->hasSSE41())
     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
new file mode 100644
index 00000000000000..1d5dee6cb8121c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
@@ -0,0 +1,601 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=x86_64-- -passes=slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -mtriple=x86_64-- -passes=slp-vectorizer -S -mattr=+avx2 -mattr=+f16c | FileCheck %s --check-prefix=CHECK-F16C
+; RUN: opt < %s -mtriple=x86_64-- -passes=slp-vectorizer -S -mattr=+avx512f | FileCheck %s --check-prefix=CHECK-AVX512
+
+define void @fpext_v4xf16_v4xf32(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpext_v4xf16_v4xf32(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1
+; CHECK-NEXT:    [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2
+; CHECK-NEXT:    [[S3:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 3
+; CHECK-NEXT:    [[L0:%.*]] = load half, ptr [[S0]], align 2
+; CHECK-NEXT:    [[L1:%.*]] = load half, ptr [[S1]], align 2
+; CHECK-NEXT:    [[L2:%.*]] = load half, ptr [[S2]], align 2
+; CHECK-NEXT:    [[L3:%.*]] = load half, ptr [[S3]], align 2
+; CHECK-NEXT:    [[E0:%.*]] = fpext half [[L0]] to float
+; CHECK-NEXT:    [[E1:%.*]] = fpext half [[L1]] to float
+; CHECK-NEXT:    [[E2:%.*]] = fpext half [[L2]] to float
+; CHECK-NEXT:    [[E3:%.*]] = fpext half [[L3]] to float
+; CHECK-NEXT:    [[D1:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 1
+; CHECK-NEXT:    [[D2:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 2
+; CHECK-NEXT:    [[D3:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 3
+; CHECK-NEXT:    store float [[E0]], ptr [[D0]], align 8
+; CHECK-NEXT:    store float [[E1]], ptr [[D1]], align 8
+; CHECK-NEXT:    store float [[E2]], ptr [[D2]], align 8
+; CHECK-NEXT:    store float [[E3]], ptr [[D3]], align 8
+; CHECK-NEXT:    ret void
+;
+; CHECK-F16C-LABEL: define void @fpext_v4xf16_v4xf32(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-F16C-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2
+; CHECK-F16C-NEXT:    [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x float>
+; CHECK-F16C-NEXT:    store <4 x float> [[TMP2]], ptr [[D0]], align 8
+; CHECK-F16C-NEXT:    ret void
+;
+; CHECK-AVX512-LABEL: define void @fpext_v4xf16_v4xf32(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-AVX512-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2
+; CHECK-AVX512-NEXT:    [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x float>
+; CHECK-AVX512-NEXT:    store <4 x float> [[TMP2]], ptr [[D0]], align 8
+; CHECK-AVX512-NEXT:    ret void
+;
+  %s1 = getelementptr inbounds half, ptr %s0, i64 1
+  %s2 = getelementptr inbounds half, ptr %s0, i64 2
+  %s3 = getelementptr inbounds half, ptr %s0, i64 3
+  %l0 = load half, ptr %s0, align 2
+  %l1 = load half, ptr %s1, align 2
+  %l2 = load half, ptr %s2, align 2
+  %l3 = load half, ptr %s3, align 2
+
+  %e0 = fpext half %l0 to float
+  %e1 = fpext half %l1 to float
+  %e2 = fpext half %l2 to float
+  %e3 = fpext half %l3 to float
+
+  %d1 = getelementptr inbounds float, ptr %d0, i64 1
+  %d2 = getelementptr inbounds float, ptr %d0, i64 2
+  %d3 = getelementptr inbounds float, ptr %d0, i64 3
+  store float %e0, ptr %d0, align 8
+  store float %e1, ptr %d1, align 8
+  store float %e2, ptr %d2, align 8
+  store float %e3, ptr %d3, align 8
+  ret void
+}
+
+define void @fpext_v4xf16_v4xf64(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpext_v4xf16_v4xf64(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1
+; CHECK-NEXT:    [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2
+; CHECK-NEXT:    [[S3:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 3
+; CHECK-NEXT:    [[L0:%.*]] = load half, ptr [[S0]], align 2
+; CHECK-NEXT:    [[L1:%.*]] = load half, ptr [[S1]], align 2
+; CHECK-NEXT:    [[L2:%.*]] = load half, ptr [[S2]], align 2
+; CHECK-NEXT:    [[L3:%.*]] = load half, ptr [[S3]], align 2
+; CHECK-NEXT:    [[E0:%.*]] = fpext half [[L0]] to double
+; CHECK-NEXT:    [[E1:%.*]] = fpext half [[L1]] to double
+; CHECK-NEXT:    [[E2:%.*]] = fpext half [[L2]] to double
+; CHECK-NEXT:    [[E3:%.*]] = fpext half [[L3]] to double
+; CHECK-NEXT:    [[D1:%.*]] = getelementptr inbounds double, ptr [[D0]], i64 1
+; CHECK-NEXT:    [[D2:%.*]] = getelementptr inbounds double, ptr [[D0]], i64 2
+; CHECK-NEXT:    [[D3:%.*]] = getelementptr inbounds double, ptr [[D0]], i64 3
+; CHECK-NEXT:    store double [[E0]], ptr [[D0]], align 8
+; CHECK-NEXT:    store double [[E1]], ptr [[D1]], align 8
+; CHECK-NEXT:    store double [[E2]], ptr [[D2]], align 8
+; CHECK-NEXT:    store double [[E3]], ptr [[D3]], align 8
+; CHECK-NEXT:    ret void
+;
+; CHECK-F16C-LABEL: define void @fpext_v4xf16_v4xf64(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2
+; CHECK-F16C-NEXT:    [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x double>
+; CHECK-F16C-NEXT:    store <4 x double> [[TMP2]], ptr [[D0]], align 8
+; CHECK-F16C-NEXT:    ret void
+;
+; CHECK-AVX512-LABEL: define void @fpext_v4xf16_v4xf64(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2
+; CHECK-AVX512-NEXT:    [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x double>
+; CHECK-AVX512-NEXT:    store <4 x double> [[TMP2]], ptr [[D0]], align 8
+; CHECK-AVX512-NEXT:    ret void
+;
+  %s1 = getelementptr inbounds half, ptr %s0, i64 1
+  %s2 = getelementptr inbounds half, ptr %s0, i64 2
+  %s3 = getelementptr inbounds half, ptr %s0, i64 3
+  %l0 = load half, ptr %s0, align 2
+  %l1 = load half, ptr %s1, align 2
+  %l2 = load half, ptr %s2, align 2
+  %l3 = load half, ptr %s3, align 2
+
+  %e0 = fpext half %l0 to double
+  %e1 = fpext half %l1 to double
+  %e2 = fpext half %l2 to double
+  %e3 = fpext half %l3 to double
+
+  %d1 = getelementptr inbounds double, ptr %d0, i64 1
+  %d2 = getelementptr inbounds double, ptr %d0, i64 2
+  %d3 = getelementptr inbounds double, ptr %d0, i64 3
+  store double %e0, ptr %d0, align 8
+  store double %e1, ptr %d1, align 8
+  store double %e2, ptr %d2, align 8
+  store double %e3, ptr %d3, align 8
+  ret void
+}
+
+define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1
+; CHECK-NEXT:    [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2
+; CHECK-NEXT:    [[S3:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 3
+; CHECK-NEXT:    [[S4:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 4
+; CHECK-NEXT:    [[S5:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 5
+; CHECK-NEXT:    [[S6:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 6
+; CHECK-NEXT:    [[S7:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 7
+; CHECK-NEXT:    [[S8:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 8
+; CHECK-NEXT:    [[S9:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 9
+; CHECK-NEXT:    [[S10:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 10
+; CHECK-NEXT:    [[S11:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 11
+; CHECK-NEXT:    [[S12:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 12
+; CHECK-NEXT:    [[S13:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 13
+; CHECK-NEXT:    [[S14:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 14
+; CHECK-NEXT:    [[S15:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 15
+; CHECK-NEXT:    [[L0:%.*]] = load half, ptr [[S0]], align 2
+; CHECK-NEXT:    [[L1:%.*]] = load half, ptr [[S1]], align 2
+; CHECK-NEXT:    [[L2:%.*]] = load half, ptr [[S2]], align 2
+; CHECK-NEXT:    [[L3:%.*]] = load half, ptr [[S3]], align 2
+; CHECK-NEXT:    [[L4:%.*]] = load half, ptr [[S4]], align 2
+; CHECK-NEXT:    [[L5:%.*]] = load half, ptr [[S5]], align 2
+; CHECK-NEXT:    [[L6:%.*]] = load half, ptr [[S6]], align 2
+; CHECK-NEXT:    [[L7:%.*]] = load half, ptr [[S7]], align 2
+; CHECK-NEXT:    [[L8:%.*]] = load half, ptr [[S8]], align 2
+; CHECK-NEXT:    [[L9:%.*]] = load half, ptr [[S9]], align 2
+; CHECK-NEXT:    [[L10:%.*]] = load half, ptr [[S10]], align 2
+; CHECK-NEXT:    [[L11:%.*]] = load half, ptr [[S11]], align 2
+; CHECK-NEXT:    [[L12:%.*]] = load half, ptr [[S12]], align 2
+; CHECK-NEXT:    [[L13:%.*]] = load half, ptr [[S13]], align 2
+; CHECK-NEXT:    [[L14:%.*]] = load half, ptr [[S14]], align 2
+; CHECK-NEXT:    [[L15:%.*]] = load half, ptr [[S15]], align 2
+; CHECK-NEXT:    [[E0:%.*]] = fpext half [[L0]] to float
+; CHECK-NEXT:    [[E1:%.*]] = fpext half [[L1]] to float
+; CHECK-NEXT:    [[E2:%.*]] = fpext half [[L2]] to float
+; CHECK-NEXT:    [[E3:%.*]] = fpext half [[L3]] to float
+; CHECK-NEXT:    [[E4:%.*]] = fpext half [[L4]] to float
+; CHECK-NEXT:    [[E5:%.*]] = fpext half [[L5]] to float
+; CHECK-NEXT:    [[E6:%.*]] = fpext half [[L6]] to float
+; CHECK-NEXT:    [[E7:%.*]] = fpext half [[L7]] to float
+; CHECK-NEXT:    [[E8:%.*]] = fpext half [[L8]] to float
+; CHECK-NEXT:    [[E9:%.*]] = fpext half [[L9]] to float
+; CHECK-NEXT:    [[E10:%.*]] = fpext half [[L10]] to float
+; CHECK-NEXT:    [[E11:%.*]] = fpext half [[L11]] to float
+; CHECK-NEXT:    [[E12:%.*]] = fpext half [[L12]] to float
+; CHECK-NEXT:    [[E13:%.*]] = fpext half [[L13]] to float
+; CHECK-NEXT:    [[E14:%.*]] = fpext half [[L14]] to float
+; CHECK-NEXT:    [[E15:%.*]] = fpext half [[L15]] to float
+; CHECK-NEXT:    [[D1:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 1
+; CHECK-NEXT:    [[D2:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 2
+; CHECK-NEXT:    [[D15:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 3
+; CHECK-NEXT:    [[D4:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 4
+; CHECK-NEXT:    [[D5:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 5
+; CHECK-NEXT:    [[D6:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 6
+; CHECK-NEXT:    [[D7:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 7
+; CHECK-NEXT:    [[D8:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 8
+; CHECK-NEXT:    [[D9:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 9
+; CHECK-NEXT:    [[D10:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 10
+; CHECK-NEXT:    [[D11:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 11
+; CHECK-NEXT:    [[D12:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 12
+; CHECK-NEXT:    [[D13:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 13
+; CHECK-NEXT:    [[D14:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 14
+; CHECK-NEXT:    [[D16:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 15
+; CHECK-NEXT:    store float [[E0]], ptr [[D0]], align 8
+; CHECK-NEXT:    store float [[E1]], ptr [[D1]], align 8
+; CHECK-NEXT:    store float [[E2]], ptr [[D2]], align 8
+; CHECK-NEXT:    store float [[E3]], ptr [[D15]], align 8
+; CHECK-NEXT:    store float [[E4]], ptr [[D4]], align 8
+; CHECK-NEXT:    store float [[E5]], ptr [[D5]], align 8
+; CHECK-NEXT:    store float [[E6]], ptr [[D6]], align 8
+; CHECK-NEXT:    store float [[E7]], ptr [[D7]], align 8
+; CHECK-NEXT:    store float [[E8]], ptr [[D8]], align 8
+; CHECK-NEXT:    store float [[E9]], ptr [[D9]], align 8
+; CHECK-NEXT:    store float [[E10]], ptr [[D10]], align 8
+; CHECK-NEXT:    store float [[E11]], ptr [[D11]], align 8
+; CHECK-NEXT:    store float [[E12]], ptr [[D12]], align 8
+; CHECK-NEXT:    store float [[E13]], ptr [[D13]], align 8
+; CHECK-NEXT:    store float [[E14]], ptr [[D14]], align 8
+; CHECK-NEXT:    store float [[E15]], ptr [[D16]], align 8
+; CHECK-NEXT:    ret void
+;
+; CHECK-F16C-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT:    [[S8:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 8
+; CHECK-F16C-NEXT:    [[D8:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 8
+; CHECK-F16C-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[S0]], align 2
+; CHECK-F16C-NEXT:    [[TMP2:%.*]] = fpext <8 x half> [[TMP1]] to <8 x float>
+; CHECK-F16C-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[S8]], align 2
+; CHECK-F16C-NEXT:    [[TMP4:%.*]] = fpext <8 x half> [[TMP3]] to <8 x float>
+; CHECK-F16C-NEXT:    store <8 x float> [[TMP2]], ptr [[D0]], align 8
+; CHECK-F16C-NEXT:    store <8 x float> [[TMP4]], ptr [[D8]], align 8
+; CHECK-F16C-NEXT:    ret void
+;
+; CHECK-AVX512-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[S0]], align 2
+; CHECK-AVX512-NEXT:    [[TMP2:%.*]] = fpext <16 x half> [[TMP1]] to <16 x float>
+; CHECK-AVX512-NEXT:    store <16 x float> [[TMP2]], ptr [[D0]], align 8
+; CHECK-AVX512-NEXT:    ret void
+;
+  %s1 = getelementptr inbounds half, ptr %s0, i64 1
+  %s2 = getelementptr inbounds half, ptr %s0, i64 2
+  %s3 = getelementptr inbounds half, ptr %s0, i64 3
+  %s4 = getelementptr inbounds half, ptr %s0, i64 4
+  %s5 = getelementptr inbounds half, ptr %s0, i64 5
+  %s6 = getelementptr inbounds half, ptr %s0, i64 6
+  %s7 = getelementptr inbounds half, ptr %s0, i64 7
+  %s8 = getelementptr inbounds half, ptr %s0, i64 8
+  %s9 = getelementptr inbounds half, ptr %s0, i64 9
+  %s10 = getelementptr inbounds half, ptr %s0, i64 10
+  %s11 = getelementptr inbounds half, ptr %s0, i64 11
+  %s12 = getelementptr inbounds half, ptr %s0, i64 12
+  %s13 = getelementptr inbounds half, ptr %s0, i64 13
+  %s14 = getelementptr inbounds half, ptr %s0, i64 14
+  %s15 = getelementptr inbounds half, ptr %s0, i64 15
+  %l0 = load half, ptr %s0, align 2
+  %l1 = load half, ptr %s1, align 2
+  %l2 = load half, ptr %s2, align 2
+  %l3 = load half, ptr %s3, align 2
+  %l4 = load half, ptr %s4, align 2
+  %l5 = load half, ptr %s5, align 2
+  %l6 = load half, ptr %s6, align 2
+  %l7 = load half, ptr %s7, align 2
+  %l8 = load half, ptr %s8, align 2
+  %l9 = load half, ptr %s9, align 2
+  %l10 = load half, ptr %s10, align 2
+  %l11 = load half, ptr %s11, align 2
+  %l12 = load half, ptr %s12, align 2
+  %l13 = load half, ptr %s13, align 2
+  %l14 = load half, ptr %s14, align 2
+  %l15 = load half, ptr %s15, align 2
+
+  %e0 = fpext half %l0 to float
+  %e1 = fpext half %l1 to float
+  %e2 = fpext half %l2 to float
+  %e3 = fpext half %l3 to float
+  %e4 = fpext half %l4 to float
+  %e5 = fpext half %l5 to float
+  %e6 = fpext half %l6 to float
+  %e7 = fpext half %l7 to float
+  %e8 = fpext half %l8 to float
+  %e9 = fpext half %l9 to float
+  %e10 = fpext half %l10 to float
+  %e11 = fpext half %l11 to float
+  %e12 = fpext half %l12 to float
+  %e13 = fpext half %l13 to float
+  %e14 = fpext half %l14 to float
+  %e15 = fpext half %l15 to float
+
+  %d1 = getelementptr inbounds float, ptr %d0, i64 1
+  %d2 = getelementptr inbounds float, ptr %d0, i64 2
+  %d3 = getelementptr inbounds float, ptr %d0, i64 3
+  %d4 = getelementptr inbounds float, ptr %d0, i64 4
+  %d5 = getelementptr inbounds float, ptr %d0, i64 5
+  %d6 = getelementptr inbounds float, ptr %d0, i64 6
+  %d7 = getelementptr inbounds float, ptr %d0, i64 7
+  %d8 = getelementptr inbounds float, ptr %d0, i64 8
+  %d9 = getelementptr inbounds float, ptr %d0, i64 9
+  %d10 = getelementptr inbounds float, ptr %d0, i64 10
+  %d11 = getelementptr inbounds float, ptr %d0, i64 11
+  %d12 = getelementptr inbounds float, ptr %d0, i64 12
+  %d13 = getelementptr inbounds float, ptr %d0, i64 13
+  %d14 = getelementptr inbounds float, ptr %d0, i64 14
+  %d15 = getelementptr inbounds float, ptr %d0, i64 15
+  store float %e0, ptr %d0, align 8
+  store float %e1, ptr %d1, align 8
+  store float %e2, ptr %d2, align 8
+  store float %e3, ptr %d3, align 8
+  store float %e4, ptr %d4, align 8
+  store float %e5, ptr %d5, align 8
+  store float %e6, ptr %d6, align 8
+  store float %e7, ptr %d7, align 8
+  store float %e8, ptr %d8, align 8
+  store float %e9, ptr %d9, align 8
+  store float %e10, ptr %d10, align 8
+  store float %e11, ptr %d11, align 8
+  store float %e12, ptr %d12, align 8
+  store float %e13, ptr %d13, align 8
+  store float %e14, ptr %d14, align 8
+  store float %e15, ptr %d15, align 8
+  ret void
+}
+
+define void @fpround_v4xf32_v4xf16(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpround_v4xf32_v4xf16(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S1:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 1
+; CHECK-NEXT:    [[S2:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 2
+; CHECK-NEXT:    [[S3:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 3
+; CHECK-NEXT:    [[L0:%.*]] = load float, ptr [[S0]], align 4
+; CHECK-NEXT:    [[L1:%.*]] = load float, ptr [[S1]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = load float, ptr [[S2]], align 4
+; CHECK-NEXT:    [[L3:%.*]] = load float, ptr [[S3]], align 4
+; CHECK-NEXT:    [[T0:%.*]] = fptrunc float [[L0]] to half
+; CHECK-NEXT:    [[T1:%.*]] = fptrunc float [[L1]] to half
+; CHECK-NEXT:    [[T2:%.*]] = fptrunc float [[L2]] to half
+; CHECK-NEXT:    [[T3:%.*]] = fptrunc float [[L3]] to half
+; CHECK-NEXT:    [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-NEXT:    [[D2:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 2
+; CHECK-NEXT:    [[D3:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 3
+; CHECK-NEXT:    store half [[T0]], ptr [[D0]], align 2
+; CHECK-NEXT:    store half [[T1]], ptr [[D1]], align 2
+; CHECK-NEXT:    store half [[T2]], ptr [[D2]], align 2
+; CHECK-NEXT:    store half [[T3]], ptr [[D3]], align 2
+; CHECK-NEXT:    ret void
+;
+; CHECK-F16C-LABEL: define void @fpround_v4xf32_v4xf16(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[S0]], align 4
+; CHECK-F16C-NEXT:    [[TMP2:%.*]] = fptrunc <4 x float> [[TMP1]] to <4 x half>
+; CHECK-F16C-NEXT:    store <4 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT:    ret void
+;
+; CHECK-AVX512-LABEL: define void @fpround_v4xf32_v4xf16(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[S0]], align 4
+; CHECK-AVX512-NEXT:    [[TMP2:%.*]] = fptrunc <4 x float> [[TMP1]] to <4 x half>
+; CHECK-AVX512-NEXT:    store <4 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-AVX512-NEXT:    ret void
+;
+  %s1 = getelementptr inbounds float, ptr %s0, i64 1
+  %s2 = getelementptr inbounds float, ptr %s0, i64 2
+  %s3 = getelementptr inbounds float, ptr %s0, i64 3
+  %l0 = load float, ptr %s0, align 4
+  %l1 = load float, ptr %s1, align 4
+  %l2 = load float, ptr %s2, align 4
+  %l3 = load float, ptr %s3, align 4
+
+  %t0 = fptrunc float %l0 to half
+  %t1 = fptrunc float %l1 to half
+  %t2 = fptrunc float %l2 to half
+  %t3 = fptrunc float %l3 to half
+
+  %d1 = getelementptr inbounds half, ptr %d0, i64 1
+  %d2 = getelementptr inbounds half, ptr %d0, i64 2
+  %d3 = getelementptr inbounds half, ptr %d0, i64 3
+  store half %t0, ptr %d0, align 2
+  store half %t1, ptr %d1, align 2
+  store half %t2, ptr %d2, align 2
+  store half %t3, ptr %d3, align 2
+  ret void
+}
+
+define void @fpround_v16xf32_v16xf16(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpround_v16xf32_v16xf16(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S1:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 1
+; CHECK-NEXT:    [[S2:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 2
+; CHECK-NEXT:    [[S3:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 3
+; CHECK-NEXT:    [[S4:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 4
+; CHECK-NEXT:    [[S5:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 5
+; CHECK-NEXT:    [[S6:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 6
+; CHECK-NEXT:    [[S7:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 7
+; CHECK-NEXT:    [[S8:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 8
+; CHECK-NEXT:    [[S9:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 9
+; CHECK-NEXT:    [[S10:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 10
+; CHECK-NEXT:    [[S11:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 11
+; CHECK-NEXT:    [[S12:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 12
+; CHECK-NEXT:    [[S13:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 13
+; CHECK-NEXT:    [[S14:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 14
+; CHECK-NEXT:    [[S15:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 15
+; CHECK-NEXT:    [[L0:%.*]] = load float, ptr [[S0]], align 4
+; CHECK-NEXT:    [[L1:%.*]] = load float, ptr [[S1]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = load float, ptr [[S2]], align 4
+; CHECK-NEXT:    [[L3:%.*]] = load float, ptr [[S3]], align 4
+; CHECK-NEXT:    [[L4:%.*]] = load float, ptr [[S4]], align 4
+; CHECK-NEXT:    [[L5:%.*]] = load float, ptr [[S5]], align 4
+; CHECK-NEXT:    [[L6:%.*]] = load float, ptr [[S6]], align 4
+; CHECK-NEXT:    [[L7:%.*]] = load float, ptr [[S7]], align 4
+; CHECK-NEXT:    [[L8:%.*]] = load float, ptr [[S8]], align 4
+; CHECK-NEXT:    [[L9:%.*]] = load float, ptr [[S9]], align 4
+; CHECK-NEXT:    [[L10:%.*]] = load float, ptr [[S10]], align 4
+; CHECK-NEXT:    [[L11:%.*]] = load float, ptr [[S11]], align 4
+; CHECK-NEXT:    [[L12:%.*]] = load float, ptr [[S12]], align 4
+; CHECK-NEXT:    [[L13:%.*]] = load float, ptr [[S13]], align 4
+; CHECK-NEXT:    [[L14:%.*]] = load float, ptr [[S14]], align 4
+; CHECK-NEXT:    [[L15:%.*]] = load float, ptr [[S15]], align 4
+; CHECK-NEXT:    [[T0:%.*]] = fptrunc float [[L0]] to half
+; CHECK-NEXT:    [[T1:%.*]] = fptrunc float [[L1]] to half
+; CHECK-NEXT:    [[T2:%.*]] = fptrunc float [[L2]] to half
+; CHECK-NEXT:    [[T3:%.*]] = fptrunc float [[L3]] to half
+; CHECK-NEXT:    [[T4:%.*]] = fptrunc float [[L4]] to half
+; CHECK-NEXT:    [[T5:%.*]] = fptrunc float [[L5]] to half
+; CHECK-NEXT:    [[T6:%.*]] = fptrunc float [[L6]] to half
+; CHECK-NEXT:    [[T7:%.*]] = fptrunc float [[L7]] to half
+; CHECK-NEXT:    [[T8:%.*]] = fptrunc float [[L8]] to half
+; CHECK-NEXT:    [[T9:%.*]] = fptrunc float [[L9]] to half
+; CHECK-NEXT:    [[T10:%.*]] = fptrunc float [[L10]] to half
+; CHECK-NEXT:    [[T11:%.*]] = fptrunc float [[L11]] to half
+; CHECK-NEXT:    [[T12:%.*]] = fptrunc float [[L12]] to half
+; CHECK-NEXT:    [[T13:%.*]] = fptrunc float [[L13]] to half
+; CHECK-NEXT:    [[T14:%.*]] = fptrunc float [[L14]] to half
+; CHECK-NEXT:    [[T15:%.*]] = fptrunc float [[L15]] to half
+; CHECK-NEXT:    [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-NEXT:    [[D2:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 2
+; CHECK-NEXT:    [[D3:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 3
+; CHECK-NEXT:    [[D4:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 4
+; CHECK-NEXT:    [[D5:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 5
+; CHECK-NEXT:    [[D6:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 6
+; CHECK-NEXT:    [[D7:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 7
+; CHECK-NEXT:    [[D8:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 8
+; CHECK-NEXT:    [[D9:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 9
+; CHECK-NEXT:    [[D10:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 10
+; CHECK-NEXT:    [[D11:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 11
+; CHECK-NEXT:    [[D12:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 12
+; CHECK-NEXT:    [[D13:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 13
+; CHECK-NEXT:    [[D14:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 14
+; CHECK-NEXT:    [[D15:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 15
+; CHECK-NEXT:    store half [[T0]], ptr [[D0]], align 2
+; CHECK-NEXT:    store half [[T1]], ptr [[D1]], align 2
+; CHECK-NEXT:    store half [[T2]], ptr [[D2]], align 2
+; CHECK-NEXT:    store half [[T3]], ptr [[D3]], align 2
+; CHECK-NEXT:    store half [[T4]], ptr [[D4]], align 2
+; CHECK-NEXT:    store half [[T5]], ptr [[D5]], align 2
+; CHECK-NEXT:    store half [[T6]], ptr [[D6]], align 2
+; CHECK-NEXT:    store half [[T7]], ptr [[D7]], align 2
+; CHECK-NEXT:    store half [[T8]], ptr [[D8]], align 2
+; CHECK-NEXT:    store half [[T9]], ptr [[D9]], align 2
+; CHECK-NEXT:    store half [[T10]], ptr [[D10]], align 2
+; CHECK-NEXT:    store half [[T11]], ptr [[D11]], align 2
+; CHECK-NEXT:    store half [[T12]], ptr [[D12]], align 2
+; CHECK-NEXT:    store half [[T13]], ptr [[D13]], align 2
+; CHECK-NEXT:    store half [[T14]], ptr [[D14]], align 2
+; CHECK-NEXT:    store half [[T15]], ptr [[D15]], align 2
+; CHECK-NEXT:    ret void
+;
+; CHECK-F16C-LABEL: define void @fpround_v16xf32_v16xf16(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[S0]], align 4
+; CHECK-F16C-NEXT:    [[TMP2:%.*]] = fptrunc <16 x float> [[TMP1]] to <16 x half>
+; CHECK-F16C-NEXT:    store <16 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT:    ret void
+;
+; CHECK-AVX512-LABEL: define void @fpround_v16xf32_v16xf16(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[S0]], align 4
+; CHECK-AVX512-NEXT:    [[TMP2:%.*]] = fptrunc <16 x float> [[TMP1]] to <16 x half>
+; CHECK-AVX512-NEXT:    store <16 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-AVX512-NEXT:    ret void
+;
+  %s1 = getelementptr inbounds float, ptr %s0, i64 1
+  %s2 = getelementptr inbounds float, ptr %s0, i64 2
+  %s3 = getelementptr inbounds float, ptr %s0, i64 3
+  %s4 = getelementptr inbounds float, ptr %s0, i64 4
+  %s5 = getelementptr inbounds float, ptr %s0, i64 5
+  %s6 = getelementptr inbounds float, ptr %s0, i64 6
+  %s7 = getelementptr inbounds float, ptr %s0, i64 7
+  %s8 = getelementptr inbounds float, ptr %s0, i64 8
+  %s9 = getelementptr inbounds float, ptr %s0, i64 9
+  %s10 = getelementptr inbounds float, ptr %s0, i64 10
+  %s11 = getelementptr inbounds float, ptr %s0, i64 11
+  %s12 = getelementptr inbounds float, ptr %s0, i64 12
+  %s13 = getelementptr inbounds float, ptr %s0, i64 13
+  %s14 = getelementptr inbounds float, ptr %s0, i64 14
+  %s15 = getelementptr inbounds float, ptr %s0, i64 15
+  %l0 = load float, ptr %s0, align 4
+  %l1 = load float, ptr %s1, align 4
+  %l2 = load float, ptr %s2, align 4
+  %l3 = load float, ptr %s3, align 4
+  %l4 = load float, ptr %s4, align 4
+  %l5 = load float, ptr %s5, align 4
+  %l6 = load float, ptr %s6, align 4
+  %l7 = load float, ptr %s7, align 4
+  %l8 = load float, ptr %s8, align 4
+  %l9 = load float, ptr %s9, align 4
+  %l10 = load float, ptr %s10, align 4
+  %l11 = load float, ptr %s11, align 4
+  %l12 = load float, ptr %s12, align 4
+  %l13 = load float, ptr %s13, align 4
+  %l14 = load float, ptr %s14, align 4
+  %l15 = load float, ptr %s15, align 4
+
+  %t0 = fptrunc float %l0 to half
+  %t1 = fptrunc float %l1 to half
+  %t2 = fptrunc float %l2 to half
+  %t3 = fptrunc float %l3 to half
+  %t4 = fptrunc float %l4 to half
+  %t5 = fptrunc float %l5 to half
+  %t6 = fptrunc float %l6 to half
+  %t7 = fptrunc float %l7 to half
+  %t8 = fptrunc float %l8 to half
+  %t9 = fptrunc float %l9 to half
+  %t10 = fptrunc float %l10 to half
+  %t11 = fptrunc float %l11 to half
+  %t12 = fptrunc float %l12 to half
+  %t13 = fptrunc float %l13 to half
+  %t14 = fptrunc float %l14 to half
+  %t15 = fptrunc float %l15 to half
+
+  %d1 = getelementptr inbounds half, ptr %d0, i64 1
+  %d2 = getelementptr inbounds half, ptr %d0, i64 2
+  %d3 = getelementptr inbounds half, ptr %d0, i64 3
+  %d4 = getelementptr inbounds half, ptr %d0, i64 4
+  %d5 = getelementptr inbounds half, ptr %d0, i64 5
+  %d6 = getelementptr inbounds half, ptr %d0, i64 6
+  %d7 = getelementptr inbounds half, ptr %d0, i64 7
+  %d8 = getelementptr inbounds half, ptr %d0, i64 8
+  %d9 = getelementptr inbounds half, ptr %d0, i64 9
+  %d10 = getelementptr inbounds half, ptr %d0, i64 10
+  %d11 = getelementptr inbounds half, ptr %d0, i64 11
+  %d12 = getelementptr inbounds half, ptr %d0, i64 12
+  %d13 = getelementptr inbounds half, ptr %d0, i64 13
+  %d14 = getelementptr inbounds half, ptr %d0, i64 14
+  %d15 = getelementptr inbounds half, ptr %d0, i64 15
+  store half %t0, ptr %d0, align 2
+  store half %t1, ptr %d1, align 2
+  store half %t2, ptr %d2, align 2
+  store half %t3, ptr %d3, align 2
+  store half %t4, ptr %d4, align 2
+  store half %t5, ptr %d5, align 2
+  store half %t6, ptr %d6, align 2
+  store half %t7, ptr %d7, align 2
+  store half %t8, ptr %d8, align 2
+  store half %t9, ptr %d9, align 2
+  store half %t10, ptr %d10, align 2
+  store half %t11, ptr %d11, align 2
+  store half %t12, ptr %d12, align 2
+  store half %t13, ptr %d13, align 2
+  store half %t14, ptr %d14, align 2
+  store half %t15, ptr %d15, align 2
+  ret void
+
+}
+
+; There is no instruction to round f64 to f16; this should not get vectorized!
+define void @fpround_v2xf64_v2xf16(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpround_v2xf64_v2xf16(
+; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[S1:%.*]] = getelementptr inbounds double, ptr [[S0]], i64 1
+; CHECK-NEXT:    [[L0:%.*]] = load double, ptr [[S0]], align 4
+; CHECK-NEXT:    [[L1:%.*]] = load double, ptr [[S1]], align 4
+; CHECK-NEXT:    [[T0:%.*]] = fptrunc double [[L0]] to half
+; CHECK-NEXT:    [[T1:%.*]] = fptrunc double [[L1]] to half
+; CHECK-NEXT:    [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-NEXT:    store half [[T0]], ptr [[D0]], align 2
+; CHECK-NEXT:    store half [[T1]], ptr [[D1]], align 2
+; CHECK-NEXT:    ret void
+;
+; CHECK-F16C-LABEL: define void @fpround_v2xf64_v2xf16(
+; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-F16C-NEXT:    [[S1:%.*]] = getelementptr inbounds double, ptr [[S0]], i64 1
+; CHECK-F16C-NEXT:    [[L0:%.*]] = load double, ptr [[S0]], align 4
+; CHECK-F16C-NEXT:    [[L1:%.*]] = load double, ptr [[S1]], align 4
+; CHECK-F16C-NEXT:    [[T0:%.*]] = fptrunc double [[L0]] to half
+; CHECK-F16C-NEXT:    [[T1:%.*]] = fptrunc double [[L1]] to half
+; CHECK-F16C-NEXT:    [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-F16C-NEXT:    store half [[T0]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT:    store half [[T1]], ptr [[D1]], align 2
+; CHECK-F16C-NEXT:    ret void
+;
+; CHECK-AVX512-LABEL: define void @fpround_v2xf64_v2xf16(
+; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT:    [[S1:%.*]] = getelementptr inbounds double, ptr [[S0]], i64 1
+; CHECK-AVX512-NEXT:    [[L0:%.*]] = load double, ptr [[S0]], align 4
+; CHECK-AVX512-NEXT:    [[L1:%.*]] = load double, ptr [[S1]], align 4
+; CHECK-AVX512-NEXT:    [[T0:%.*]] = fptrunc double [[L0]] to half
+; CHECK-AVX512-NEXT:    [[T1:%.*]] = fptrunc double [[L1]] to half
+; CHECK-AVX512-NEXT:    [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1
+; CHECK-AVX512-NEXT:    store half [[T0]], ptr [[D0]], align 2
+; CHECK-AVX512-NEXT:    store half [[T1]], ptr [[D1]], align 2
+; CHECK-AVX512-NEXT:    ret void
+;
+  %s1 = getelementptr inbounds double, ptr %s0, i64 1
+  %l0 = load double, ptr %s0, align 4
+  %l1 = load double, ptr %s1, align 4
+
+  %t0 = fptrunc double %l0 to half
+  %t1 = fptrunc double %l1 to half
+
+  %d1 = getelementptr inbounds half, ptr %d0, i64 1
+  store half %t0, ptr %d0, align 2
+  store half %t1, ptr %d1, align 2
+  ret void
+}
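
For context on fpround_v2xf64_v2xf16 above: since no x86 instruction
rounds f64 to f16 directly, each scalar fptrunc is lowered to a runtime
library call, so a vectorized form would only have to scalarize again.
A hedged sketch of the scalar lowering (assuming the usual compiler-rt
helper; F16C only converts between f16 and f32, so it does not help
here):

  // Hypothetical illustration: the conversion becomes a libcall such as
  // __truncdfhf2, so keeping this code scalar is the profitable choice.
  __fp16 round_one(double x) { return (__fp16)x; }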

>From 84a22ef1a5ced6fc2383c38b99fb040df17db76e Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Tue, 22 Oct 2024 17:19:29 -0700
Subject: [PATCH 2/2] Override X86TTIImpl::getStoreMinimumVF instead of
 tweaking codegen tables.
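
In outline, the override added below behaves as follows (shown here
without diff markers and with comments on the rationale; see the actual
hunk for the authoritative change):

  unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                                         Type *ScalarValTy) const {
    // With F16C a <4 x half> store can be implemented as vcvtps2ph plus
    // a vector store, so half stores are acceptable from four lanes up.
    if (ST->hasF16C() && ScalarMemTy->isHalfTy())
      return 4;
    // Otherwise keep the generic minimum from the base implementation.
    return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
  }

This keeps the ISelLowering action tables untouched and expresses the
constraint at the point where the vectorizer actually queries it.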

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  6 -----
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 24 +++++++++++++++----
 llvm/lib/Target/X86/X86TargetTransformInfo.h  |  3 +++
 .../SLPVectorizer/X86/conversion-fp16.ll      | 19 +++++++++------
 4 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index da88a1a0a5a3b8..bcb84add65d83e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1714,9 +1714,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
       setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
     }
-    // trunc+store via vcvtps2ph
-    setOperationAction(ISD::STORE, MVT::v4f16, Custom);
-    setOperationAction(ISD::STORE, MVT::v8f16, Custom);
   }
 
   // This block controls legalization of the mask vector sizes that are
@@ -1787,9 +1784,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
-    // trunc+store via vcvtps2ph
-    setOperationAction(ISD::STORE, MVT::v16f16, Custom);
   }
   if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
     for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 2d2c804ed46e54..bae223243b3dc9 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2977,10 +2977,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   };
 
   static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
-    { ISD::FP_ROUND,  MVT::v8f16,   MVT::v8f32,   { 1, 1, 1, 1 } }, // vcvtps2ph
-    { ISD::FP_ROUND,  MVT::v4f16,   MVT::v4f32,   { 1, 1, 1, 1 } }, // vcvtps2ph
-    { ISD::FP_EXTEND, MVT::v8f32,   MVT::v8f16,   { 1, 1, 1, 1 } }, // vcvtph2ps
-    { ISD::FP_EXTEND, MVT::v4f32,   MVT::v4f16,   { 1, 1, 1, 1 } }, // vcvtph2ps
+    { ISD::FP_ROUND,  MVT::f16,     MVT::f32,     { 1, 1, 1, 1 } },
+    { ISD::FP_ROUND,  MVT::v8f16,   MVT::v8f32,   { 1, 1, 1, 1 } },
+    { ISD::FP_ROUND,  MVT::v4f16,   MVT::v4f32,   { 1, 1, 1, 1 } },
+    { ISD::FP_EXTEND, MVT::f32,     MVT::f16,     { 1, 1, 1, 1 } },
+    { ISD::FP_EXTEND, MVT::f64,     MVT::f16,     { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
+    { ISD::FP_EXTEND, MVT::v8f32,   MVT::v8f16,   { 1, 1, 1, 1 } },
+    { ISD::FP_EXTEND, MVT::v4f32,   MVT::v4f16,   { 1, 1, 1, 1 } },
     { ISD::FP_EXTEND, MVT::v4f64,   MVT::v4f16,   { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
   };
 
@@ -3171,6 +3174,11 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                             TTI::CastContextHint::None, CostKind);
   }
 
+  if (ISD == ISD::FP_ROUND && LTDest.second.getScalarType() == MVT::f16) {
+    // Conversion requires a libcall.
+    return InstructionCost::getInvalid();
+  }
+
   // TODO: Allow non-throughput costs that aren't binary.
   auto AdjustCost = [&CostKind](InstructionCost Cost,
                                 InstructionCost N = 1) -> InstructionCost {
@@ -6948,6 +6956,14 @@ bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
   return true;
 }
 
+unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+                                       Type *ScalarValTy) const {
+  if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
+    return 4;
+  }
+  return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
+}
+
 bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
                                             SmallVectorImpl<Use *> &Ops) const {
   using namespace llvm::PatternMatch;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 0100f328ab4bd3..36d00cee0d18b5 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -302,6 +302,9 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
 
   bool isVectorShiftByScalarCheap(Type *Ty) const;
 
+  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+                             Type *ScalarValTy) const;
+
 private:
   bool supportsGather() const;
   InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
index 1d5dee6cb8121c..bcea147d724f53 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
@@ -123,8 +123,8 @@ define void @fpext_v4xf16_v4xf64(ptr %s0, ptr %d0) {
   ret void
 }
 
-define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
-; CHECK-LABEL: define void @fpext_v16xf15_v16xf32(
+define void @fpext_v16xf16_v16xf32(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpext_v16xf16_v16xf32(
 ; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1
 ; CHECK-NEXT:    [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2
@@ -206,7 +206,7 @@ define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
 ; CHECK-NEXT:    store float [[E15]], ptr [[D16]], align 8
 ; CHECK-NEXT:    ret void
 ;
-; CHECK-F16C-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-F16C-LABEL: define void @fpext_v16xf16_v16xf32(
 ; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
 ; CHECK-F16C-NEXT:    [[S8:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 8
 ; CHECK-F16C-NEXT:    [[D8:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 8
@@ -218,7 +218,7 @@ define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
 ; CHECK-F16C-NEXT:    store <8 x float> [[TMP4]], ptr [[D8]], align 8
 ; CHECK-F16C-NEXT:    ret void
 ;
-; CHECK-AVX512-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-AVX512-LABEL: define void @fpext_v16xf16_v16xf32(
 ; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
 ; CHECK-AVX512-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[S0]], align 2
 ; CHECK-AVX512-NEXT:    [[TMP2:%.*]] = fpext <16 x half> [[TMP1]] to <16 x float>
@@ -453,9 +453,14 @@ define void @fpround_v16xf32_v16xf16(ptr %s0, ptr %d0) {
 ;
 ; CHECK-F16C-LABEL: define void @fpround_v16xf32_v16xf16(
 ; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
-; CHECK-F16C-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[S0]], align 4
-; CHECK-F16C-NEXT:    [[TMP2:%.*]] = fptrunc <16 x float> [[TMP1]] to <16 x half>
-; CHECK-F16C-NEXT:    store <16 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT:    [[S8:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 8
+; CHECK-F16C-NEXT:    [[D8:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 8
+; CHECK-F16C-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[S0]], align 4
+; CHECK-F16C-NEXT:    [[TMP2:%.*]] = fptrunc <8 x float> [[TMP1]] to <8 x half>
+; CHECK-F16C-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[S8]], align 4
+; CHECK-F16C-NEXT:    [[TMP4:%.*]] = fptrunc <8 x float> [[TMP3]] to <8 x half>
+; CHECK-F16C-NEXT:    store <8 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT:    store <8 x half> [[TMP4]], ptr [[D8]], align 2
 ; CHECK-F16C-NEXT:    ret void
 ;
 ; CHECK-AVX512-LABEL: define void @fpround_v16xf32_v16xf16(


