[clang] [Clang] Correctly enable the f16 type for offloading (PR #98331)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Jul 10 07:50:16 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
Author: Joseph Huber (jhuber6)
<details>
<summary>Changes</summary>
Summary:
There's an extra argument that's required to *actually* enable f16
usage. For whatever reason there's a difference between fp16 and f16,
where fp16 is some weird version that converts between the two. Long
story short, without this the math builtins are blatantly broken.
---
Patch is 49.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/98331.diff
2 Files Affected:
- (modified) clang/lib/Basic/Targets/NVPTX.h (+2)
- (modified) clang/test/CodeGen/builtins-nvptx-native-half-type-err.c (+596-54)
``````````diff
diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index 9a985e46e22da..be43bb04fa2ed 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -75,6 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
ArrayRef<Builtin::Info> getTargetBuiltins() const override;
+ bool useFP16ConversionIntrinsics() const override { return false; }
+
bool
initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
StringRef CPU,
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
index 3b9413ddd4a4b..63acf25b8fe90 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
@@ -1,12 +1,605 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// REQUIRES: nvptx-registered-target
//
-// RUN: not %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN: sm_86 -target-feature +ptx72 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK_ERROR %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
+// RUN: sm_86 -target-feature +ptx72 -fcuda-is-device -x cuda -emit-llvm -o - %s \
+// RUN: | FileCheck %s
#define __device__ __attribute__((device))
typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
+// CHECK-LABEL: define dso_local void @_Z22nvvm_native_half_typesPvS_S_PDh(
+// CHECK-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT: [[RESV2:%.*]] = alloca <2 x half>, align 4
+// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: store <2 x half> zeroinitializer, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[TMP0]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.nvvm.ex2.approx.f16(half [[TMP1]])
+// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP2]] to float
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[TMP3]], align 2
+// CHECK-NEXT: [[CONV1:%.*]] = fpext half [[TMP4]] to float
+// CHECK-NEXT: [[ADD:%.*]] = fadd float [[CONV1]], [[CONV]]
+// CHECK-NEXT: [[TMP5:%.*]] = fptrunc float [[ADD]] to half
+// CHECK-NEXT: store half [[TMP5]], ptr [[TMP3]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> [[TMP7]])
+// CHECK-NEXT: store <2 x half> [[TMP8]], ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[TMP9]], align 2
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load half, ptr [[TMP11]], align 2
+// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load half, ptr [[TMP13]], align 2
+// CHECK-NEXT: [[TMP15:%.*]] = call half @llvm.nvvm.fma.rn.relu.f16(half [[TMP10]], half [[TMP12]], half [[TMP14]])
+// CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP15]] to float
+// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load half, ptr [[TMP16]], align 2
+// CHECK-NEXT: [[CONV3:%.*]] = fpext half [[TMP17]] to float
+// CHECK-NEXT: [[ADD4:%.*]] = fadd float [[CONV3]], [[CONV2]]
+// CHECK-NEXT: [[TMP18:%.*]] = fptrunc float [[ADD4]] to half
+// CHECK-NEXT: store half [[TMP18]], ptr [[TMP16]], align 2
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = load half, ptr [[TMP19]], align 2
+// CHECK-NEXT: [[TMP21:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = load half, ptr [[TMP21]], align 2
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load half, ptr [[TMP23]], align 2
+// CHECK-NEXT: [[TMP25:%.*]] = call half @llvm.nvvm.fma.rn.ftz.relu.f16(half [[TMP20]], half [[TMP22]], half [[TMP24]])
+// CHECK-NEXT: [[CONV5:%.*]] = fpext half [[TMP25]] to float
+// CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load half, ptr [[TMP26]], align 2
+// CHECK-NEXT: [[CONV6:%.*]] = fpext half [[TMP27]] to float
+// CHECK-NEXT: [[ADD7:%.*]] = fadd float [[CONV6]], [[CONV5]]
+// CHECK-NEXT: [[TMP28:%.*]] = fptrunc float [[ADD7]] to half
+// CHECK-NEXT: store half [[TMP28]], ptr [[TMP26]], align 2
+// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP30:%.*]] = load <2 x half>, ptr [[TMP29]], align 4
+// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP32:%.*]] = load <2 x half>, ptr [[TMP31]], align 4
+// CHECK-NEXT: [[TMP33:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP34:%.*]] = load <2 x half>, ptr [[TMP33]], align 4
+// CHECK-NEXT: [[TMP35:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2(<2 x half> [[TMP30]], <2 x half> [[TMP32]], <2 x half> [[TMP34]])
+// CHECK-NEXT: [[CONV8:%.*]] = fpext <2 x half> [[TMP35]] to <2 x float>
+// CHECK-NEXT: [[TMP36:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[CONV9:%.*]] = fpext <2 x half> [[TMP36]] to <2 x float>
+// CHECK-NEXT: [[ADD10:%.*]] = fadd <2 x float> [[CONV9]], [[CONV8]]
+// CHECK-NEXT: [[CONV11:%.*]] = fptrunc <2 x float> [[ADD10]] to <2 x half>
+// CHECK-NEXT: store <2 x half> [[CONV11]], ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP38:%.*]] = load <2 x half>, ptr [[TMP37]], align 4
+// CHECK-NEXT: [[TMP39:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP40:%.*]] = load <2 x half>, ptr [[TMP39]], align 4
+// CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP42:%.*]] = load <2 x half>, ptr [[TMP41]], align 4
+// CHECK-NEXT: [[TMP43:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.ftz.relu.f16x2(<2 x half> [[TMP38]], <2 x half> [[TMP40]], <2 x half> [[TMP42]])
+// CHECK-NEXT: [[CONV12:%.*]] = fpext <2 x half> [[TMP43]] to <2 x float>
+// CHECK-NEXT: [[TMP44:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[CONV13:%.*]] = fpext <2 x half> [[TMP44]] to <2 x float>
+// CHECK-NEXT: [[ADD14:%.*]] = fadd <2 x float> [[CONV13]], [[CONV12]]
+// CHECK-NEXT: [[CONV15:%.*]] = fptrunc <2 x float> [[ADD14]] to <2 x half>
+// CHECK-NEXT: store <2 x half> [[CONV15]], ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP46:%.*]] = load half, ptr [[TMP45]], align 2
+// CHECK-NEXT: [[TMP47:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP48:%.*]] = load half, ptr [[TMP47]], align 2
+// CHECK-NEXT: [[TMP49:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP50:%.*]] = load half, ptr [[TMP49]], align 2
+// CHECK-NEXT: [[TMP51:%.*]] = call half @llvm.nvvm.fma.rn.ftz.f16(half [[TMP46]], half [[TMP48]], half [[TMP50]])
+// CHECK-NEXT: [[CONV16:%.*]] = fpext half [[TMP51]] to float
+// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP53:%.*]] = load half, ptr [[TMP52]], align 2
+// CHECK-NEXT: [[CONV17:%.*]] = fpext half [[TMP53]] to float
+// CHECK-NEXT: [[ADD18:%.*]] = fadd float [[CONV17]], [[CONV16]]
+// CHECK-NEXT: [[TMP54:%.*]] = fptrunc float [[ADD18]] to half
+// CHECK-NEXT: store half [[TMP54]], ptr [[TMP52]], align 2
+// CHECK-NEXT: [[TMP55:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP56:%.*]] = load half, ptr [[TMP55]], align 2
+// CHECK-NEXT: [[TMP57:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP58:%.*]] = load half, ptr [[TMP57]], align 2
+// CHECK-NEXT: [[TMP59:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP60:%.*]] = load half, ptr [[TMP59]], align 2
+// CHECK-NEXT: [[TMP61:%.*]] = call half @llvm.nvvm.fma.rn.sat.f16(half [[TMP56]], half [[TMP58]], half [[TMP60]])
+// CHECK-NEXT: [[CONV19:%.*]] = fpext half [[TMP61]] to float
+// CHECK-NEXT: [[TMP62:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP63:%.*]] = load half, ptr [[TMP62]], align 2
+// CHECK-NEXT: [[CONV20:%.*]] = fpext half [[TMP63]] to float
+// CHECK-NEXT: [[ADD21:%.*]] = fadd float [[CONV20]], [[CONV19]]
+// CHECK-NEXT: [[TMP64:%.*]] = fptrunc float [[ADD21]] to half
+// CHECK-NEXT: store half [[TMP64]], ptr [[TMP62]], align 2
+// CHECK-NEXT: [[TMP65:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP66:%.*]] = load half, ptr [[TMP65]], align 2
+// CHECK-NEXT: [[TMP67:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP68:%.*]] = load half, ptr [[TMP67]], align 2
+// CHECK-NEXT: [[TMP69:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP70:%.*]] = load half, ptr [[TMP69]], align 2
+// CHECK-NEXT: [[TMP71:%.*]] = call half @llvm.nvvm.fma.rn.ftz.sat.f16(half [[TMP66]], half [[TMP68]], half [[TMP70]])
+// CHECK-NEXT: [[CONV22:%.*]] = fpext half [[TMP71]] to float
+// CHECK-NEXT: [[TMP72:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP73:%.*]] = load half, ptr [[TMP72]], align 2
+// CHECK-NEXT: [[CONV23:%.*]] = fpext half [[TMP73]] to float
+// CHECK-NEXT: [[ADD24:%.*]] = fadd float [[CONV23]], [[CONV22]]
+// CHECK-NEXT: [[TMP74:%.*]] = fptrunc float [[ADD24]] to half
+// CHECK-NEXT: store half [[TMP74]], ptr [[TMP72]], align 2
+// CHECK-NEXT: [[TMP75:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP76:%.*]] = load <2 x half>, ptr [[TMP75]], align 4
+// CHECK-NEXT: [[TMP77:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP78:%.*]] = load <2 x half>, ptr [[TMP77]], align 4
+// CHECK-NEXT: [[TMP79:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP80:%.*]] = load <2 x half>, ptr [[TMP79]], align 4
+// CHECK-NEXT: [[TMP81:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half> [[TMP76]], <2 x half> [[TMP78]], <2 x half> [[TMP80]])
+// CHECK-NEXT: [[CONV25:%.*]] = fpext <2 x half> [[TMP81]] to <2 x float>
+// CHECK-NEXT: [[TMP82:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[CONV26:%.*]] = fpext <2 x half> [[TMP82]] to <2 x float>
+// CHECK-NEXT: [[ADD27:%.*]] = fadd <2 x float> [[CONV26]], [[CONV25]]
+// CHECK-NEXT: [[CONV28:%.*]] = fptrunc <2 x float> [[ADD27]] to <2 x half>
+// CHECK-NEXT: store <2 x half> [[CONV28]], ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP83:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP84:%.*]] = load <2 x half>, ptr [[TMP83]], align 4
+// CHECK-NEXT: [[TMP85:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP86:%.*]] = load <2 x half>, ptr [[TMP85]], align 4
+// CHECK-NEXT: [[TMP87:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP88:%.*]] = load <2 x half>, ptr [[TMP87]], align 4
+// CHECK-NEXT: [[TMP89:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half> [[TMP84]], <2 x half> [[TMP86]], <2 x half> [[TMP88]])
+// CHECK-NEXT: [[CONV29:%.*]] = fpext <2 x half> [[TMP89]] to <2 x float>
+// CHECK-NEXT: [[TMP90:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[CONV30:%.*]] = fpext <2 x half> [[TMP90]] to <2 x float>
+// CHECK-NEXT: [[ADD31:%.*]] = fadd <2 x float> [[CONV30]], [[CONV29]]
+// CHECK-NEXT: [[CONV32:%.*]] = fptrunc <2 x float> [[ADD31]] to <2 x half>
+// CHECK-NEXT: store <2 x half> [[CONV32]], ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP91:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP92:%.*]] = load <2 x half>, ptr [[TMP91]], align 4
+// CHECK-NEXT: [[TMP93:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP94:%.*]] = load <2 x half>, ptr [[TMP93]], align 4
+// CHECK-NEXT: [[TMP95:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP96:%.*]] = load <2 x half>, ptr [[TMP95]], align 4
+// CHECK-NEXT: [[TMP97:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half> [[TMP92]], <2 x half> [[TMP94]], <2 x half> [[TMP96]])
+// CHECK-NEXT: [[CONV33:%.*]] = fpext <2 x half> [[TMP97]] to <2 x float>
+// CHECK-NEXT: [[TMP98:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[CONV34:%.*]] = fpext <2 x half> [[TMP98]] to <2 x float>
+// CHECK-NEXT: [[ADD35:%.*]] = fadd <2 x float> [[CONV34]], [[CONV33]]
+// CHECK-NEXT: [[CONV36:%.*]] = fptrunc <2 x float> [[ADD35]] to <2 x half>
+// CHECK-NEXT: store <2 x half> [[CONV36]], ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP99:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP100:%.*]] = load <2 x half>, ptr [[TMP99]], align 4
+// CHECK-NEXT: [[TMP101:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP102:%.*]] = load <2 x half>, ptr [[TMP101]], align 4
+// CHECK-NEXT: [[TMP103:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT: [[TMP104:%.*]] = load <2 x half>, ptr [[TMP103]], align 4
+// CHECK-NEXT: [[TMP105:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2(<2 x half> [[TMP100]], <2 x half> [[TMP102]], <2 x half> [[TMP104]])
+// CHECK-NEXT: [[CONV37:%.*]] = fpext <2 x half> [[TMP105]] to <2 x float>
+// CHECK-NEXT: [[TMP106:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[CONV38:%.*]] = fpext <2 x half> [[TMP106]] to <2 x float>
+// CHECK-NEXT: [[ADD39:%.*]] = fadd <2 x float> [[CONV38]], [[CONV37]]
+// CHECK-NEXT: [[CONV40:%.*]] = fptrunc <2 x float> [[ADD39]] to <2 x half>
+// CHECK-NEXT: store <2 x half> [[CONV40]], ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP107:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP108:%.*]] = load half, ptr [[TMP107]], align 2
+// CHECK-NEXT: [[TMP109:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP110:%.*]] = load half, ptr [[TMP109]], align 2
+// CHECK-NEXT: [[TMP111:%.*]] = call half @llvm.nvvm.fmin.f16(half [[TMP108]], half [[TMP110]])
+// CHECK-NEXT: [[CONV41:%.*]] = fpext half [[TMP111]] to float
+// CHECK-NEXT: [[TMP112:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP113:%.*]] = load half, ptr [[TMP112]], align 2
+// CHECK-NEXT: [[CONV42:%.*]] = fpext half [[TMP113]] to float
+// CHECK-NEXT: [[ADD43:%.*]] = fadd float [[CONV42]], [[CONV41]]
+// CHECK-NEXT: [[TMP114:%.*]] = fptrunc float [[ADD43]] to half
+// CHECK-NEXT: store half [[TMP114]], ptr [[TMP112]], align 2
+// CHECK-NEXT: [[TMP115:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP116:%.*]] = load half, ptr [[TMP115]], align 2
+// CHECK-NEXT: [[TMP117:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP118:%.*]] = load half, ptr [[TMP117]], align 2
+// CHECK-NEXT: [[TMP119:%.*]] = call half @llvm.nvvm.fmin.ftz.f16(half [[TMP116]], half [[TMP118]])
+// CHECK-NEXT: [[CONV44:%.*]] = fpext half [[TMP119]] to float
+// CHECK-NEXT: [[TMP120:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP121:%.*]] = load half, ptr [[TMP120]], align 2
+// CHECK-NEXT: [[CONV45:%.*]] = fpext half [[TMP121]] to float
+// CHECK-NEXT: [[ADD46:%.*]] = fadd float [[CONV45]], [[CONV44]]
+// CHECK-NEXT: [[TMP122:%.*]] = fptrunc float [[ADD46]] to half
+// CHECK-NEXT: store half [[TMP122]], ptr [[TMP120]], align 2
+// CHECK-NEXT: [[TMP123:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP124:%.*]] = load half, ptr [[TMP123]], align 2
+// CHECK-NEXT: [[TMP125:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP126:%.*]] = load half, ptr [[TMP125]], align 2
+// CHECK-NEXT: [[TMP127:%.*]] = call half @llvm.nvvm.fmin.nan.f16(half [[TMP124]], half [[TMP126]])
+// CHECK-NEXT: [[CONV47:%.*]] = fpext half [[TMP127]] to float
+// CHECK-NEXT: [[TMP128:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP129:%.*]] = load half, ptr [[TMP128]], align 2
+// CHECK-NEXT: [[CONV48:%.*]] = fpext half [[TMP129]] to float
+// CHECK-NEXT: [[ADD49:%.*]] = fadd float [[CONV48]], [[CONV47]]
+// CHECK-NEXT: [[TMP130:%.*]] = fptrunc float [[ADD49]] to half
+// CHECK-NEXT: store half [[TMP130]], ptr [[TMP128]], align 2
+// CHECK-NEXT: [[TMP131:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP132:%.*]] = load half, ptr [[TMP131]], align 2
+// CHECK-NEXT: [[TMP133:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP134:%.*]] = load half, ptr [[TMP133]], align 2
+// CHECK-NEXT: [[TMP135:%.*]] = call half @llvm.nvvm.fmin.ftz.nan.f16(half [[TMP132]], half [[TMP134]])
+// CHECK-NEXT: [[CONV50:%.*]] = fpext half [[TMP135]] to float
+// CHECK-NEXT: [[TMP136:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT: [[TMP137:%.*]] = load half, ptr [[TMP136]], align 2
+// CHECK-NEXT: [[CONV51:%.*]] = fpext half [[TMP137]] to float
+// CHECK-NEXT: [[ADD52:%.*]] = fadd float [[CONV51]], [[CONV50]]
+// CHECK-NEXT: [[TMP138:%.*]] = fptrunc float [[ADD52]] to half
+// CHECK-NEXT: store half [[TMP138]], ptr [[TMP136]], align 2
+// CHECK-NEXT: [[TMP139:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP140:%.*]] = load <2 x half>, ptr [[TMP139]], align 4
+// CHECK-NEXT: [[TMP141:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP142:%.*]] = load <2 x half>, ptr [[TMP141]], align 4
+// CHECK-NEXT: [[TMP143:%.*]] = call <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half> [[TMP140]], <2 x half> [[TMP142]])
+// CHECK-NEXT: [[CONV53:%.*]] = fpext <2 x half> [[TMP143]] to <2 x float>
+// CHECK-NEXT: [[TMP144:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[CONV54:%.*]] = fpext <2 x half> [[TMP144]] to <2 x float>
+// CHECK-NEXT: [[ADD55:%.*]] = fadd <2 x float> [[CONV54]], [[CONV53]]
+// CHECK-NEXT: [[CONV56:%.*]] = fptrunc <2 x float> [[ADD55]] to <2 x half>
+// CHECK-NEXT: store <2 x half> [[CONV56]], ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP145:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP146:%.*]] = load <2 x half>, ptr [[TMP145]], align 4
+// CHECK-NEXT: [[TMP147:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP148:%.*]] = load <2 x half>, ptr [[TMP147]], align 4
+// CHECK-NEXT: [[TMP149:%.*]] = call <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half> [[TMP146]], <2 x half> [[TMP148]])
+// CHECK-NEXT: [[CONV57:%.*]] = fpext <2 x half> [[TMP149]] to <2 x float>
+// CHECK-NEXT: [[TMP150:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[CONV58:%.*]] = fpext <2 x half> [[TMP150]] to <2 x float>
+// CHECK-NEXT: [[ADD59:%.*]] = fadd <2 x float> [[CONV58]], [[CONV57]]
+// CHECK-NEXT: [[CONV60:%.*]] = fptrunc <2 x float> [[ADD59]] to <2 x half>
+// CHECK-NEXT: store <2 x half> [[CONV60]], ptr [[RESV2]], align 4
+// CHECK-NEXT: [[TMP151:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP152:%.*]] = load <2 x half>, ptr [[TMP151]], align 4
+// CHECK-NEXT: [[TMP153:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP154:%.*]] = load <2 x half>, ptr [[TMP153]], align 4
+// CHECK-NEXT: [[TMP155:%.*]] = call <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half> [[TMP152]], <2 x half> [[TMP154]])
+// CHECK-NEXT: [[CONV61:%.*]] = fpext <2 x half> [[TMP155]] to <2 x float>
+// CHECK-NEXT: [[TMP156:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT: [[CONV62:%.*]] = fpext <2 x half> [[TMP156]] to <2 x float>
+// CHECK...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/98331
More information about the cfe-commits
mailing list