[clang] [Clang] Do not emit intrinsic math functions on GPU targets (PR #98209)

Joseph Huber via cfe-commits cfe-commits at lists.llvm.org
Wed Jul 10 08:23:23 PDT 2024


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/98209

From 605e9e78c1cba3b1947a538c566ffedbb9525be0 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 10 Jul 2024 09:39:44 -0500
Subject: [PATCH 1/2] [Clang] Correctly enable the f16 type for offloading

Summary:
An extra argument is required to *actually* enable f16 usage. The fp16
and f16 modes are distinct: fp16 is a storage-only format that performs
its arithmetic by converting to and from float, while f16 is a native
arithmetic type. Without this argument, the math builtins are broken.
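
For reference, a minimal sketch (not part of this patch) of the
distinction between the two forms, assuming standard Clang semantics:
```
// Sketch only: __fp16 is a storage-only format, while _Float16 (the
// "native" f16 type) is a true arithmetic type.
__fp16 storage_add(__fp16 a, __fp16 b) {
  return a + b; // lowered as fpext to float, fadd float, fptrunc to half
}
_Float16 native_add(_Float16 a, _Float16 b) {
  return a + b; // lowered as a single `fadd half` with native f16 support
}
```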
---
 .../builtins-nvptx-native-half-type-err.c     | 650 ++++++++++++++++--
 1 file changed, 596 insertions(+), 54 deletions(-)

diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
index 3b9413ddd4a4b..63acf25b8fe90 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
@@ -1,12 +1,605 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // REQUIRES: nvptx-registered-target
 //
-// RUN: not %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN:   sm_86 -target-feature +ptx72 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK_ERROR %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
+// RUN:   sm_86 -target-feature +ptx72 -fcuda-is-device -x cuda -emit-llvm -o - %s \
+// RUN:   | FileCheck %s
 
 #define __device__ __attribute__((device))
 typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 
+// CHECK-LABEL: define dso_local void @_Z22nvvm_native_half_typesPvS_S_PDh(
+// CHECK-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[RESV2:%.*]] = alloca <2 x half>, align 4
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    store <2 x half> zeroinitializer, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[TMP0]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.nvvm.ex2.approx.f16(half [[TMP1]])
+// CHECK-NEXT:    [[CONV:%.*]] = fpext half [[TMP2]] to float
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr [[TMP3]], align 2
+// CHECK-NEXT:    [[CONV1:%.*]] = fpext half [[TMP4]] to float
+// CHECK-NEXT:    [[ADD:%.*]] = fadd float [[CONV1]], [[CONV]]
+// CHECK-NEXT:    [[TMP5:%.*]] = fptrunc float [[ADD]] to half
+// CHECK-NEXT:    store half [[TMP5]], ptr [[TMP3]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x half>, ptr [[TMP6]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> [[TMP7]])
+// CHECK-NEXT:    store <2 x half> [[TMP8]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr [[TMP9]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr [[TMP11]], align 2
+// CHECK-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP14:%.*]] = load half, ptr [[TMP13]], align 2
+// CHECK-NEXT:    [[TMP15:%.*]] = call half @llvm.nvvm.fma.rn.relu.f16(half [[TMP10]], half [[TMP12]], half [[TMP14]])
+// CHECK-NEXT:    [[CONV2:%.*]] = fpext half [[TMP15]] to float
+// CHECK-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load half, ptr [[TMP16]], align 2
+// CHECK-NEXT:    [[CONV3:%.*]] = fpext half [[TMP17]] to float
+// CHECK-NEXT:    [[ADD4:%.*]] = fadd float [[CONV3]], [[CONV2]]
+// CHECK-NEXT:    [[TMP18:%.*]] = fptrunc float [[ADD4]] to half
+// CHECK-NEXT:    store half [[TMP18]], ptr [[TMP16]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP20:%.*]] = load half, ptr [[TMP19]], align 2
+// CHECK-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr [[TMP21]], align 2
+// CHECK-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr [[TMP23]], align 2
+// CHECK-NEXT:    [[TMP25:%.*]] = call half @llvm.nvvm.fma.rn.ftz.relu.f16(half [[TMP20]], half [[TMP22]], half [[TMP24]])
+// CHECK-NEXT:    [[CONV5:%.*]] = fpext half [[TMP25]] to float
+// CHECK-NEXT:    [[TMP26:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP27:%.*]] = load half, ptr [[TMP26]], align 2
+// CHECK-NEXT:    [[CONV6:%.*]] = fpext half [[TMP27]] to float
+// CHECK-NEXT:    [[ADD7:%.*]] = fadd float [[CONV6]], [[CONV5]]
+// CHECK-NEXT:    [[TMP28:%.*]] = fptrunc float [[ADD7]] to half
+// CHECK-NEXT:    store half [[TMP28]], ptr [[TMP26]], align 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP30:%.*]] = load <2 x half>, ptr [[TMP29]], align 4
+// CHECK-NEXT:    [[TMP31:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP32:%.*]] = load <2 x half>, ptr [[TMP31]], align 4
+// CHECK-NEXT:    [[TMP33:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP34:%.*]] = load <2 x half>, ptr [[TMP33]], align 4
+// CHECK-NEXT:    [[TMP35:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2(<2 x half> [[TMP30]], <2 x half> [[TMP32]], <2 x half> [[TMP34]])
+// CHECK-NEXT:    [[CONV8:%.*]] = fpext <2 x half> [[TMP35]] to <2 x float>
+// CHECK-NEXT:    [[TMP36:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV9:%.*]] = fpext <2 x half> [[TMP36]] to <2 x float>
+// CHECK-NEXT:    [[ADD10:%.*]] = fadd <2 x float> [[CONV9]], [[CONV8]]
+// CHECK-NEXT:    [[CONV11:%.*]] = fptrunc <2 x float> [[ADD10]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV11]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP37:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP38:%.*]] = load <2 x half>, ptr [[TMP37]], align 4
+// CHECK-NEXT:    [[TMP39:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP40:%.*]] = load <2 x half>, ptr [[TMP39]], align 4
+// CHECK-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP42:%.*]] = load <2 x half>, ptr [[TMP41]], align 4
+// CHECK-NEXT:    [[TMP43:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.ftz.relu.f16x2(<2 x half> [[TMP38]], <2 x half> [[TMP40]], <2 x half> [[TMP42]])
+// CHECK-NEXT:    [[CONV12:%.*]] = fpext <2 x half> [[TMP43]] to <2 x float>
+// CHECK-NEXT:    [[TMP44:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV13:%.*]] = fpext <2 x half> [[TMP44]] to <2 x float>
+// CHECK-NEXT:    [[ADD14:%.*]] = fadd <2 x float> [[CONV13]], [[CONV12]]
+// CHECK-NEXT:    [[CONV15:%.*]] = fptrunc <2 x float> [[ADD14]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV15]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP45:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP46:%.*]] = load half, ptr [[TMP45]], align 2
+// CHECK-NEXT:    [[TMP47:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP48:%.*]] = load half, ptr [[TMP47]], align 2
+// CHECK-NEXT:    [[TMP49:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP50:%.*]] = load half, ptr [[TMP49]], align 2
+// CHECK-NEXT:    [[TMP51:%.*]] = call half @llvm.nvvm.fma.rn.ftz.f16(half [[TMP46]], half [[TMP48]], half [[TMP50]])
+// CHECK-NEXT:    [[CONV16:%.*]] = fpext half [[TMP51]] to float
+// CHECK-NEXT:    [[TMP52:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP53:%.*]] = load half, ptr [[TMP52]], align 2
+// CHECK-NEXT:    [[CONV17:%.*]] = fpext half [[TMP53]] to float
+// CHECK-NEXT:    [[ADD18:%.*]] = fadd float [[CONV17]], [[CONV16]]
+// CHECK-NEXT:    [[TMP54:%.*]] = fptrunc float [[ADD18]] to half
+// CHECK-NEXT:    store half [[TMP54]], ptr [[TMP52]], align 2
+// CHECK-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP56:%.*]] = load half, ptr [[TMP55]], align 2
+// CHECK-NEXT:    [[TMP57:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP58:%.*]] = load half, ptr [[TMP57]], align 2
+// CHECK-NEXT:    [[TMP59:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP60:%.*]] = load half, ptr [[TMP59]], align 2
+// CHECK-NEXT:    [[TMP61:%.*]] = call half @llvm.nvvm.fma.rn.sat.f16(half [[TMP56]], half [[TMP58]], half [[TMP60]])
+// CHECK-NEXT:    [[CONV19:%.*]] = fpext half [[TMP61]] to float
+// CHECK-NEXT:    [[TMP62:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP63:%.*]] = load half, ptr [[TMP62]], align 2
+// CHECK-NEXT:    [[CONV20:%.*]] = fpext half [[TMP63]] to float
+// CHECK-NEXT:    [[ADD21:%.*]] = fadd float [[CONV20]], [[CONV19]]
+// CHECK-NEXT:    [[TMP64:%.*]] = fptrunc float [[ADD21]] to half
+// CHECK-NEXT:    store half [[TMP64]], ptr [[TMP62]], align 2
+// CHECK-NEXT:    [[TMP65:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP66:%.*]] = load half, ptr [[TMP65]], align 2
+// CHECK-NEXT:    [[TMP67:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP68:%.*]] = load half, ptr [[TMP67]], align 2
+// CHECK-NEXT:    [[TMP69:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP70:%.*]] = load half, ptr [[TMP69]], align 2
+// CHECK-NEXT:    [[TMP71:%.*]] = call half @llvm.nvvm.fma.rn.ftz.sat.f16(half [[TMP66]], half [[TMP68]], half [[TMP70]])
+// CHECK-NEXT:    [[CONV22:%.*]] = fpext half [[TMP71]] to float
+// CHECK-NEXT:    [[TMP72:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP73:%.*]] = load half, ptr [[TMP72]], align 2
+// CHECK-NEXT:    [[CONV23:%.*]] = fpext half [[TMP73]] to float
+// CHECK-NEXT:    [[ADD24:%.*]] = fadd float [[CONV23]], [[CONV22]]
+// CHECK-NEXT:    [[TMP74:%.*]] = fptrunc float [[ADD24]] to half
+// CHECK-NEXT:    store half [[TMP74]], ptr [[TMP72]], align 2
+// CHECK-NEXT:    [[TMP75:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP76:%.*]] = load <2 x half>, ptr [[TMP75]], align 4
+// CHECK-NEXT:    [[TMP77:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP78:%.*]] = load <2 x half>, ptr [[TMP77]], align 4
+// CHECK-NEXT:    [[TMP79:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP80:%.*]] = load <2 x half>, ptr [[TMP79]], align 4
+// CHECK-NEXT:    [[TMP81:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.f16x2(<2 x half> [[TMP76]], <2 x half> [[TMP78]], <2 x half> [[TMP80]])
+// CHECK-NEXT:    [[CONV25:%.*]] = fpext <2 x half> [[TMP81]] to <2 x float>
+// CHECK-NEXT:    [[TMP82:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV26:%.*]] = fpext <2 x half> [[TMP82]] to <2 x float>
+// CHECK-NEXT:    [[ADD27:%.*]] = fadd <2 x float> [[CONV26]], [[CONV25]]
+// CHECK-NEXT:    [[CONV28:%.*]] = fptrunc <2 x float> [[ADD27]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV28]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP83:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP84:%.*]] = load <2 x half>, ptr [[TMP83]], align 4
+// CHECK-NEXT:    [[TMP85:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP86:%.*]] = load <2 x half>, ptr [[TMP85]], align 4
+// CHECK-NEXT:    [[TMP87:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP88:%.*]] = load <2 x half>, ptr [[TMP87]], align 4
+// CHECK-NEXT:    [[TMP89:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2(<2 x half> [[TMP84]], <2 x half> [[TMP86]], <2 x half> [[TMP88]])
+// CHECK-NEXT:    [[CONV29:%.*]] = fpext <2 x half> [[TMP89]] to <2 x float>
+// CHECK-NEXT:    [[TMP90:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV30:%.*]] = fpext <2 x half> [[TMP90]] to <2 x float>
+// CHECK-NEXT:    [[ADD31:%.*]] = fadd <2 x float> [[CONV30]], [[CONV29]]
+// CHECK-NEXT:    [[CONV32:%.*]] = fptrunc <2 x float> [[ADD31]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV32]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP91:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP92:%.*]] = load <2 x half>, ptr [[TMP91]], align 4
+// CHECK-NEXT:    [[TMP93:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP94:%.*]] = load <2 x half>, ptr [[TMP93]], align 4
+// CHECK-NEXT:    [[TMP95:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP96:%.*]] = load <2 x half>, ptr [[TMP95]], align 4
+// CHECK-NEXT:    [[TMP97:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2(<2 x half> [[TMP92]], <2 x half> [[TMP94]], <2 x half> [[TMP96]])
+// CHECK-NEXT:    [[CONV33:%.*]] = fpext <2 x half> [[TMP97]] to <2 x float>
+// CHECK-NEXT:    [[TMP98:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV34:%.*]] = fpext <2 x half> [[TMP98]] to <2 x float>
+// CHECK-NEXT:    [[ADD35:%.*]] = fadd <2 x float> [[CONV34]], [[CONV33]]
+// CHECK-NEXT:    [[CONV36:%.*]] = fptrunc <2 x float> [[ADD35]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV36]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP99:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP100:%.*]] = load <2 x half>, ptr [[TMP99]], align 4
+// CHECK-NEXT:    [[TMP101:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP102:%.*]] = load <2 x half>, ptr [[TMP101]], align 4
+// CHECK-NEXT:    [[TMP103:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-NEXT:    [[TMP104:%.*]] = load <2 x half>, ptr [[TMP103]], align 4
+// CHECK-NEXT:    [[TMP105:%.*]] = call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2(<2 x half> [[TMP100]], <2 x half> [[TMP102]], <2 x half> [[TMP104]])
+// CHECK-NEXT:    [[CONV37:%.*]] = fpext <2 x half> [[TMP105]] to <2 x float>
+// CHECK-NEXT:    [[TMP106:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV38:%.*]] = fpext <2 x half> [[TMP106]] to <2 x float>
+// CHECK-NEXT:    [[ADD39:%.*]] = fadd <2 x float> [[CONV38]], [[CONV37]]
+// CHECK-NEXT:    [[CONV40:%.*]] = fptrunc <2 x float> [[ADD39]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV40]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP107:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP108:%.*]] = load half, ptr [[TMP107]], align 2
+// CHECK-NEXT:    [[TMP109:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP110:%.*]] = load half, ptr [[TMP109]], align 2
+// CHECK-NEXT:    [[TMP111:%.*]] = call half @llvm.nvvm.fmin.f16(half [[TMP108]], half [[TMP110]])
+// CHECK-NEXT:    [[CONV41:%.*]] = fpext half [[TMP111]] to float
+// CHECK-NEXT:    [[TMP112:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP113:%.*]] = load half, ptr [[TMP112]], align 2
+// CHECK-NEXT:    [[CONV42:%.*]] = fpext half [[TMP113]] to float
+// CHECK-NEXT:    [[ADD43:%.*]] = fadd float [[CONV42]], [[CONV41]]
+// CHECK-NEXT:    [[TMP114:%.*]] = fptrunc float [[ADD43]] to half
+// CHECK-NEXT:    store half [[TMP114]], ptr [[TMP112]], align 2
+// CHECK-NEXT:    [[TMP115:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP116:%.*]] = load half, ptr [[TMP115]], align 2
+// CHECK-NEXT:    [[TMP117:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP118:%.*]] = load half, ptr [[TMP117]], align 2
+// CHECK-NEXT:    [[TMP119:%.*]] = call half @llvm.nvvm.fmin.ftz.f16(half [[TMP116]], half [[TMP118]])
+// CHECK-NEXT:    [[CONV44:%.*]] = fpext half [[TMP119]] to float
+// CHECK-NEXT:    [[TMP120:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP121:%.*]] = load half, ptr [[TMP120]], align 2
+// CHECK-NEXT:    [[CONV45:%.*]] = fpext half [[TMP121]] to float
+// CHECK-NEXT:    [[ADD46:%.*]] = fadd float [[CONV45]], [[CONV44]]
+// CHECK-NEXT:    [[TMP122:%.*]] = fptrunc float [[ADD46]] to half
+// CHECK-NEXT:    store half [[TMP122]], ptr [[TMP120]], align 2
+// CHECK-NEXT:    [[TMP123:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP124:%.*]] = load half, ptr [[TMP123]], align 2
+// CHECK-NEXT:    [[TMP125:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP126:%.*]] = load half, ptr [[TMP125]], align 2
+// CHECK-NEXT:    [[TMP127:%.*]] = call half @llvm.nvvm.fmin.nan.f16(half [[TMP124]], half [[TMP126]])
+// CHECK-NEXT:    [[CONV47:%.*]] = fpext half [[TMP127]] to float
+// CHECK-NEXT:    [[TMP128:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP129:%.*]] = load half, ptr [[TMP128]], align 2
+// CHECK-NEXT:    [[CONV48:%.*]] = fpext half [[TMP129]] to float
+// CHECK-NEXT:    [[ADD49:%.*]] = fadd float [[CONV48]], [[CONV47]]
+// CHECK-NEXT:    [[TMP130:%.*]] = fptrunc float [[ADD49]] to half
+// CHECK-NEXT:    store half [[TMP130]], ptr [[TMP128]], align 2
+// CHECK-NEXT:    [[TMP131:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP132:%.*]] = load half, ptr [[TMP131]], align 2
+// CHECK-NEXT:    [[TMP133:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP134:%.*]] = load half, ptr [[TMP133]], align 2
+// CHECK-NEXT:    [[TMP135:%.*]] = call half @llvm.nvvm.fmin.ftz.nan.f16(half [[TMP132]], half [[TMP134]])
+// CHECK-NEXT:    [[CONV50:%.*]] = fpext half [[TMP135]] to float
+// CHECK-NEXT:    [[TMP136:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP137:%.*]] = load half, ptr [[TMP136]], align 2
+// CHECK-NEXT:    [[CONV51:%.*]] = fpext half [[TMP137]] to float
+// CHECK-NEXT:    [[ADD52:%.*]] = fadd float [[CONV51]], [[CONV50]]
+// CHECK-NEXT:    [[TMP138:%.*]] = fptrunc float [[ADD52]] to half
+// CHECK-NEXT:    store half [[TMP138]], ptr [[TMP136]], align 2
+// CHECK-NEXT:    [[TMP139:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP140:%.*]] = load <2 x half>, ptr [[TMP139]], align 4
+// CHECK-NEXT:    [[TMP141:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP142:%.*]] = load <2 x half>, ptr [[TMP141]], align 4
+// CHECK-NEXT:    [[TMP143:%.*]] = call <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half> [[TMP140]], <2 x half> [[TMP142]])
+// CHECK-NEXT:    [[CONV53:%.*]] = fpext <2 x half> [[TMP143]] to <2 x float>
+// CHECK-NEXT:    [[TMP144:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV54:%.*]] = fpext <2 x half> [[TMP144]] to <2 x float>
+// CHECK-NEXT:    [[ADD55:%.*]] = fadd <2 x float> [[CONV54]], [[CONV53]]
+// CHECK-NEXT:    [[CONV56:%.*]] = fptrunc <2 x float> [[ADD55]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV56]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP145:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP146:%.*]] = load <2 x half>, ptr [[TMP145]], align 4
+// CHECK-NEXT:    [[TMP147:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP148:%.*]] = load <2 x half>, ptr [[TMP147]], align 4
+// CHECK-NEXT:    [[TMP149:%.*]] = call <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half> [[TMP146]], <2 x half> [[TMP148]])
+// CHECK-NEXT:    [[CONV57:%.*]] = fpext <2 x half> [[TMP149]] to <2 x float>
+// CHECK-NEXT:    [[TMP150:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV58:%.*]] = fpext <2 x half> [[TMP150]] to <2 x float>
+// CHECK-NEXT:    [[ADD59:%.*]] = fadd <2 x float> [[CONV58]], [[CONV57]]
+// CHECK-NEXT:    [[CONV60:%.*]] = fptrunc <2 x float> [[ADD59]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV60]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP151:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP152:%.*]] = load <2 x half>, ptr [[TMP151]], align 4
+// CHECK-NEXT:    [[TMP153:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP154:%.*]] = load <2 x half>, ptr [[TMP153]], align 4
+// CHECK-NEXT:    [[TMP155:%.*]] = call <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half> [[TMP152]], <2 x half> [[TMP154]])
+// CHECK-NEXT:    [[CONV61:%.*]] = fpext <2 x half> [[TMP155]] to <2 x float>
+// CHECK-NEXT:    [[TMP156:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV62:%.*]] = fpext <2 x half> [[TMP156]] to <2 x float>
+// CHECK-NEXT:    [[ADD63:%.*]] = fadd <2 x float> [[CONV62]], [[CONV61]]
+// CHECK-NEXT:    [[CONV64:%.*]] = fptrunc <2 x float> [[ADD63]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV64]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP157:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP158:%.*]] = load <2 x half>, ptr [[TMP157]], align 4
+// CHECK-NEXT:    [[TMP159:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP160:%.*]] = load <2 x half>, ptr [[TMP159]], align 4
+// CHECK-NEXT:    [[TMP161:%.*]] = call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half> [[TMP158]], <2 x half> [[TMP160]])
+// CHECK-NEXT:    [[CONV65:%.*]] = fpext <2 x half> [[TMP161]] to <2 x float>
+// CHECK-NEXT:    [[TMP162:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV66:%.*]] = fpext <2 x half> [[TMP162]] to <2 x float>
+// CHECK-NEXT:    [[ADD67:%.*]] = fadd <2 x float> [[CONV66]], [[CONV65]]
+// CHECK-NEXT:    [[CONV68:%.*]] = fptrunc <2 x float> [[ADD67]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV68]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP163:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP164:%.*]] = load half, ptr [[TMP163]], align 2
+// CHECK-NEXT:    [[TMP165:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP166:%.*]] = load half, ptr [[TMP165]], align 2
+// CHECK-NEXT:    [[TMP167:%.*]] = call half @llvm.nvvm.fmin.xorsign.abs.f16(half [[TMP164]], half [[TMP166]])
+// CHECK-NEXT:    [[CONV69:%.*]] = fpext half [[TMP167]] to float
+// CHECK-NEXT:    [[TMP168:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP169:%.*]] = load half, ptr [[TMP168]], align 2
+// CHECK-NEXT:    [[CONV70:%.*]] = fpext half [[TMP169]] to float
+// CHECK-NEXT:    [[ADD71:%.*]] = fadd float [[CONV70]], [[CONV69]]
+// CHECK-NEXT:    [[TMP170:%.*]] = fptrunc float [[ADD71]] to half
+// CHECK-NEXT:    store half [[TMP170]], ptr [[TMP168]], align 2
+// CHECK-NEXT:    [[TMP171:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP172:%.*]] = load half, ptr [[TMP171]], align 2
+// CHECK-NEXT:    [[TMP173:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP174:%.*]] = load half, ptr [[TMP173]], align 2
+// CHECK-NEXT:    [[TMP175:%.*]] = call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half [[TMP172]], half [[TMP174]])
+// CHECK-NEXT:    [[CONV72:%.*]] = fpext half [[TMP175]] to float
+// CHECK-NEXT:    [[TMP176:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP177:%.*]] = load half, ptr [[TMP176]], align 2
+// CHECK-NEXT:    [[CONV73:%.*]] = fpext half [[TMP177]] to float
+// CHECK-NEXT:    [[ADD74:%.*]] = fadd float [[CONV73]], [[CONV72]]
+// CHECK-NEXT:    [[TMP178:%.*]] = fptrunc float [[ADD74]] to half
+// CHECK-NEXT:    store half [[TMP178]], ptr [[TMP176]], align 2
+// CHECK-NEXT:    [[TMP179:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP180:%.*]] = load half, ptr [[TMP179]], align 2
+// CHECK-NEXT:    [[TMP181:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP182:%.*]] = load half, ptr [[TMP181]], align 2
+// CHECK-NEXT:    [[TMP183:%.*]] = call half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half [[TMP180]], half [[TMP182]])
+// CHECK-NEXT:    [[CONV75:%.*]] = fpext half [[TMP183]] to float
+// CHECK-NEXT:    [[TMP184:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP185:%.*]] = load half, ptr [[TMP184]], align 2
+// CHECK-NEXT:    [[CONV76:%.*]] = fpext half [[TMP185]] to float
+// CHECK-NEXT:    [[ADD77:%.*]] = fadd float [[CONV76]], [[CONV75]]
+// CHECK-NEXT:    [[TMP186:%.*]] = fptrunc float [[ADD77]] to half
+// CHECK-NEXT:    store half [[TMP186]], ptr [[TMP184]], align 2
+// CHECK-NEXT:    [[TMP187:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP188:%.*]] = load half, ptr [[TMP187]], align 2
+// CHECK-NEXT:    [[TMP189:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP190:%.*]] = load half, ptr [[TMP189]], align 2
+// CHECK-NEXT:    [[TMP191:%.*]] = call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16(half [[TMP188]], half [[TMP190]])
+// CHECK-NEXT:    [[CONV78:%.*]] = fpext half [[TMP191]] to float
+// CHECK-NEXT:    [[TMP192:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP193:%.*]] = load half, ptr [[TMP192]], align 2
+// CHECK-NEXT:    [[CONV79:%.*]] = fpext half [[TMP193]] to float
+// CHECK-NEXT:    [[ADD80:%.*]] = fadd float [[CONV79]], [[CONV78]]
+// CHECK-NEXT:    [[TMP194:%.*]] = fptrunc float [[ADD80]] to half
+// CHECK-NEXT:    store half [[TMP194]], ptr [[TMP192]], align 2
+// CHECK-NEXT:    [[TMP195:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP196:%.*]] = load <2 x half>, ptr [[TMP195]], align 4
+// CHECK-NEXT:    [[TMP197:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP198:%.*]] = load <2 x half>, ptr [[TMP197]], align 4
+// CHECK-NEXT:    [[TMP199:%.*]] = call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2(<2 x half> [[TMP196]], <2 x half> [[TMP198]])
+// CHECK-NEXT:    [[CONV81:%.*]] = fpext <2 x half> [[TMP199]] to <2 x float>
+// CHECK-NEXT:    [[TMP200:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV82:%.*]] = fpext <2 x half> [[TMP200]] to <2 x float>
+// CHECK-NEXT:    [[ADD83:%.*]] = fadd <2 x float> [[CONV82]], [[CONV81]]
+// CHECK-NEXT:    [[CONV84:%.*]] = fptrunc <2 x float> [[ADD83]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV84]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP201:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP202:%.*]] = load <2 x half>, ptr [[TMP201]], align 4
+// CHECK-NEXT:    [[TMP203:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP204:%.*]] = load <2 x half>, ptr [[TMP203]], align 4
+// CHECK-NEXT:    [[TMP205:%.*]] = call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> [[TMP202]], <2 x half> [[TMP204]])
+// CHECK-NEXT:    [[CONV85:%.*]] = fpext <2 x half> [[TMP205]] to <2 x float>
+// CHECK-NEXT:    [[TMP206:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV86:%.*]] = fpext <2 x half> [[TMP206]] to <2 x float>
+// CHECK-NEXT:    [[ADD87:%.*]] = fadd <2 x float> [[CONV86]], [[CONV85]]
+// CHECK-NEXT:    [[CONV88:%.*]] = fptrunc <2 x float> [[ADD87]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV88]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP207:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP208:%.*]] = load <2 x half>, ptr [[TMP207]], align 4
+// CHECK-NEXT:    [[TMP209:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP210:%.*]] = load <2 x half>, ptr [[TMP209]], align 4
+// CHECK-NEXT:    [[TMP211:%.*]] = call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> [[TMP208]], <2 x half> [[TMP210]])
+// CHECK-NEXT:    [[CONV89:%.*]] = fpext <2 x half> [[TMP211]] to <2 x float>
+// CHECK-NEXT:    [[TMP212:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV90:%.*]] = fpext <2 x half> [[TMP212]] to <2 x float>
+// CHECK-NEXT:    [[ADD91:%.*]] = fadd <2 x float> [[CONV90]], [[CONV89]]
+// CHECK-NEXT:    [[CONV92:%.*]] = fptrunc <2 x float> [[ADD91]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV92]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP213:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP214:%.*]] = load <2 x half>, ptr [[TMP213]], align 4
+// CHECK-NEXT:    [[TMP215:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP216:%.*]] = load <2 x half>, ptr [[TMP215]], align 4
+// CHECK-NEXT:    [[TMP217:%.*]] = call <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2(<2 x half> [[TMP214]], <2 x half> [[TMP216]])
+// CHECK-NEXT:    [[CONV93:%.*]] = fpext <2 x half> [[TMP217]] to <2 x float>
+// CHECK-NEXT:    [[TMP218:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV94:%.*]] = fpext <2 x half> [[TMP218]] to <2 x float>
+// CHECK-NEXT:    [[ADD95:%.*]] = fadd <2 x float> [[CONV94]], [[CONV93]]
+// CHECK-NEXT:    [[CONV96:%.*]] = fptrunc <2 x float> [[ADD95]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV96]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP219:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP220:%.*]] = load half, ptr [[TMP219]], align 2
+// CHECK-NEXT:    [[TMP221:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP222:%.*]] = load half, ptr [[TMP221]], align 2
+// CHECK-NEXT:    [[TMP223:%.*]] = call half @llvm.nvvm.fmax.f16(half [[TMP220]], half [[TMP222]])
+// CHECK-NEXT:    [[CONV97:%.*]] = fpext half [[TMP223]] to float
+// CHECK-NEXT:    [[TMP224:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP225:%.*]] = load half, ptr [[TMP224]], align 2
+// CHECK-NEXT:    [[CONV98:%.*]] = fpext half [[TMP225]] to float
+// CHECK-NEXT:    [[ADD99:%.*]] = fadd float [[CONV98]], [[CONV97]]
+// CHECK-NEXT:    [[TMP226:%.*]] = fptrunc float [[ADD99]] to half
+// CHECK-NEXT:    store half [[TMP226]], ptr [[TMP224]], align 2
+// CHECK-NEXT:    [[TMP227:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP228:%.*]] = load half, ptr [[TMP227]], align 2
+// CHECK-NEXT:    [[TMP229:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP230:%.*]] = load half, ptr [[TMP229]], align 2
+// CHECK-NEXT:    [[TMP231:%.*]] = call half @llvm.nvvm.fmax.ftz.f16(half [[TMP228]], half [[TMP230]])
+// CHECK-NEXT:    [[CONV100:%.*]] = fpext half [[TMP231]] to float
+// CHECK-NEXT:    [[TMP232:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP233:%.*]] = load half, ptr [[TMP232]], align 2
+// CHECK-NEXT:    [[CONV101:%.*]] = fpext half [[TMP233]] to float
+// CHECK-NEXT:    [[ADD102:%.*]] = fadd float [[CONV101]], [[CONV100]]
+// CHECK-NEXT:    [[TMP234:%.*]] = fptrunc float [[ADD102]] to half
+// CHECK-NEXT:    store half [[TMP234]], ptr [[TMP232]], align 2
+// CHECK-NEXT:    [[TMP235:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP236:%.*]] = load half, ptr [[TMP235]], align 2
+// CHECK-NEXT:    [[TMP237:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP238:%.*]] = load half, ptr [[TMP237]], align 2
+// CHECK-NEXT:    [[TMP239:%.*]] = call half @llvm.nvvm.fmax.nan.f16(half [[TMP236]], half [[TMP238]])
+// CHECK-NEXT:    [[CONV103:%.*]] = fpext half [[TMP239]] to float
+// CHECK-NEXT:    [[TMP240:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP241:%.*]] = load half, ptr [[TMP240]], align 2
+// CHECK-NEXT:    [[CONV104:%.*]] = fpext half [[TMP241]] to float
+// CHECK-NEXT:    [[ADD105:%.*]] = fadd float [[CONV104]], [[CONV103]]
+// CHECK-NEXT:    [[TMP242:%.*]] = fptrunc float [[ADD105]] to half
+// CHECK-NEXT:    store half [[TMP242]], ptr [[TMP240]], align 2
+// CHECK-NEXT:    [[TMP243:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP244:%.*]] = load half, ptr [[TMP243]], align 2
+// CHECK-NEXT:    [[TMP245:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP246:%.*]] = load half, ptr [[TMP245]], align 2
+// CHECK-NEXT:    [[TMP247:%.*]] = call half @llvm.nvvm.fmax.ftz.nan.f16(half [[TMP244]], half [[TMP246]])
+// CHECK-NEXT:    [[CONV106:%.*]] = fpext half [[TMP247]] to float
+// CHECK-NEXT:    [[TMP248:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP249:%.*]] = load half, ptr [[TMP248]], align 2
+// CHECK-NEXT:    [[CONV107:%.*]] = fpext half [[TMP249]] to float
+// CHECK-NEXT:    [[ADD108:%.*]] = fadd float [[CONV107]], [[CONV106]]
+// CHECK-NEXT:    [[TMP250:%.*]] = fptrunc float [[ADD108]] to half
+// CHECK-NEXT:    store half [[TMP250]], ptr [[TMP248]], align 2
+// CHECK-NEXT:    [[TMP251:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP252:%.*]] = load <2 x half>, ptr [[TMP251]], align 4
+// CHECK-NEXT:    [[TMP253:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP254:%.*]] = load <2 x half>, ptr [[TMP253]], align 4
+// CHECK-NEXT:    [[TMP255:%.*]] = call <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half> [[TMP252]], <2 x half> [[TMP254]])
+// CHECK-NEXT:    [[CONV109:%.*]] = fpext <2 x half> [[TMP255]] to <2 x float>
+// CHECK-NEXT:    [[TMP256:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV110:%.*]] = fpext <2 x half> [[TMP256]] to <2 x float>
+// CHECK-NEXT:    [[ADD111:%.*]] = fadd <2 x float> [[CONV110]], [[CONV109]]
+// CHECK-NEXT:    [[CONV112:%.*]] = fptrunc <2 x float> [[ADD111]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV112]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP257:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP258:%.*]] = load <2 x half>, ptr [[TMP257]], align 4
+// CHECK-NEXT:    [[TMP259:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP260:%.*]] = load <2 x half>, ptr [[TMP259]], align 4
+// CHECK-NEXT:    [[TMP261:%.*]] = call <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half> [[TMP258]], <2 x half> [[TMP260]])
+// CHECK-NEXT:    [[CONV113:%.*]] = fpext <2 x half> [[TMP261]] to <2 x float>
+// CHECK-NEXT:    [[TMP262:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV114:%.*]] = fpext <2 x half> [[TMP262]] to <2 x float>
+// CHECK-NEXT:    [[ADD115:%.*]] = fadd <2 x float> [[CONV114]], [[CONV113]]
+// CHECK-NEXT:    [[CONV116:%.*]] = fptrunc <2 x float> [[ADD115]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV116]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP263:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP264:%.*]] = load <2 x half>, ptr [[TMP263]], align 4
+// CHECK-NEXT:    [[TMP265:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP266:%.*]] = load <2 x half>, ptr [[TMP265]], align 4
+// CHECK-NEXT:    [[TMP267:%.*]] = call <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half> [[TMP264]], <2 x half> [[TMP266]])
+// CHECK-NEXT:    [[CONV117:%.*]] = fpext <2 x half> [[TMP267]] to <2 x float>
+// CHECK-NEXT:    [[TMP268:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV118:%.*]] = fpext <2 x half> [[TMP268]] to <2 x float>
+// CHECK-NEXT:    [[ADD119:%.*]] = fadd <2 x float> [[CONV118]], [[CONV117]]
+// CHECK-NEXT:    [[CONV120:%.*]] = fptrunc <2 x float> [[ADD119]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV120]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP269:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP270:%.*]] = load <2 x half>, ptr [[TMP269]], align 4
+// CHECK-NEXT:    [[TMP271:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP272:%.*]] = load <2 x half>, ptr [[TMP271]], align 4
+// CHECK-NEXT:    [[TMP273:%.*]] = call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half> [[TMP270]], <2 x half> [[TMP272]])
+// CHECK-NEXT:    [[CONV121:%.*]] = fpext <2 x half> [[TMP273]] to <2 x float>
+// CHECK-NEXT:    [[TMP274:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV122:%.*]] = fpext <2 x half> [[TMP274]] to <2 x float>
+// CHECK-NEXT:    [[ADD123:%.*]] = fadd <2 x float> [[CONV122]], [[CONV121]]
+// CHECK-NEXT:    [[CONV124:%.*]] = fptrunc <2 x float> [[ADD123]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV124]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP275:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP276:%.*]] = load half, ptr [[TMP275]], align 2
+// CHECK-NEXT:    [[TMP277:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP278:%.*]] = load half, ptr [[TMP277]], align 2
+// CHECK-NEXT:    [[TMP279:%.*]] = call half @llvm.nvvm.fmax.xorsign.abs.f16(half [[TMP276]], half [[TMP278]])
+// CHECK-NEXT:    [[CONV125:%.*]] = fpext half [[TMP279]] to float
+// CHECK-NEXT:    [[TMP280:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP281:%.*]] = load half, ptr [[TMP280]], align 2
+// CHECK-NEXT:    [[CONV126:%.*]] = fpext half [[TMP281]] to float
+// CHECK-NEXT:    [[ADD127:%.*]] = fadd float [[CONV126]], [[CONV125]]
+// CHECK-NEXT:    [[TMP282:%.*]] = fptrunc float [[ADD127]] to half
+// CHECK-NEXT:    store half [[TMP282]], ptr [[TMP280]], align 2
+// CHECK-NEXT:    [[TMP283:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP284:%.*]] = load half, ptr [[TMP283]], align 2
+// CHECK-NEXT:    [[TMP285:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP286:%.*]] = load half, ptr [[TMP285]], align 2
+// CHECK-NEXT:    [[TMP287:%.*]] = call half @llvm.nvvm.fmax.ftz.xorsign.abs.f16(half [[TMP284]], half [[TMP286]])
+// CHECK-NEXT:    [[CONV128:%.*]] = fpext half [[TMP287]] to float
+// CHECK-NEXT:    [[TMP288:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP289:%.*]] = load half, ptr [[TMP288]], align 2
+// CHECK-NEXT:    [[CONV129:%.*]] = fpext half [[TMP289]] to float
+// CHECK-NEXT:    [[ADD130:%.*]] = fadd float [[CONV129]], [[CONV128]]
+// CHECK-NEXT:    [[TMP290:%.*]] = fptrunc float [[ADD130]] to half
+// CHECK-NEXT:    store half [[TMP290]], ptr [[TMP288]], align 2
+// CHECK-NEXT:    [[TMP291:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP292:%.*]] = load half, ptr [[TMP291]], align 2
+// CHECK-NEXT:    [[TMP293:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP294:%.*]] = load half, ptr [[TMP293]], align 2
+// CHECK-NEXT:    [[TMP295:%.*]] = call half @llvm.nvvm.fmax.nan.xorsign.abs.f16(half [[TMP292]], half [[TMP294]])
+// CHECK-NEXT:    [[CONV131:%.*]] = fpext half [[TMP295]] to float
+// CHECK-NEXT:    [[TMP296:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP297:%.*]] = load half, ptr [[TMP296]], align 2
+// CHECK-NEXT:    [[CONV132:%.*]] = fpext half [[TMP297]] to float
+// CHECK-NEXT:    [[ADD133:%.*]] = fadd float [[CONV132]], [[CONV131]]
+// CHECK-NEXT:    [[TMP298:%.*]] = fptrunc float [[ADD133]] to half
+// CHECK-NEXT:    store half [[TMP298]], ptr [[TMP296]], align 2
+// CHECK-NEXT:    [[TMP299:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP300:%.*]] = load half, ptr [[TMP299]], align 2
+// CHECK-NEXT:    [[TMP301:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP302:%.*]] = load half, ptr [[TMP301]], align 2
+// CHECK-NEXT:    [[TMP303:%.*]] = call half @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16(half [[TMP300]], half [[TMP302]])
+// CHECK-NEXT:    [[CONV134:%.*]] = fpext half [[TMP303]] to float
+// CHECK-NEXT:    [[TMP304:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP305:%.*]] = load half, ptr [[TMP304]], align 2
+// CHECK-NEXT:    [[CONV135:%.*]] = fpext half [[TMP305]] to float
+// CHECK-NEXT:    [[ADD136:%.*]] = fadd float [[CONV135]], [[CONV134]]
+// CHECK-NEXT:    [[TMP306:%.*]] = fptrunc float [[ADD136]] to half
+// CHECK-NEXT:    store half [[TMP306]], ptr [[TMP304]], align 2
+// CHECK-NEXT:    [[TMP307:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP308:%.*]] = load <2 x half>, ptr [[TMP307]], align 4
+// CHECK-NEXT:    [[TMP309:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP310:%.*]] = load <2 x half>, ptr [[TMP309]], align 4
+// CHECK-NEXT:    [[TMP311:%.*]] = call <2 x half> @llvm.nvvm.fmax.xorsign.abs.f16x2(<2 x half> [[TMP308]], <2 x half> [[TMP310]])
+// CHECK-NEXT:    [[CONV137:%.*]] = fpext <2 x half> [[TMP311]] to <2 x float>
+// CHECK-NEXT:    [[TMP312:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV138:%.*]] = fpext <2 x half> [[TMP312]] to <2 x float>
+// CHECK-NEXT:    [[ADD139:%.*]] = fadd <2 x float> [[CONV138]], [[CONV137]]
+// CHECK-NEXT:    [[CONV140:%.*]] = fptrunc <2 x float> [[ADD139]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV140]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP313:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP314:%.*]] = load <2 x half>, ptr [[TMP313]], align 4
+// CHECK-NEXT:    [[TMP315:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP316:%.*]] = load <2 x half>, ptr [[TMP315]], align 4
+// CHECK-NEXT:    [[TMP317:%.*]] = call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> [[TMP314]], <2 x half> [[TMP316]])
+// CHECK-NEXT:    [[CONV141:%.*]] = fpext <2 x half> [[TMP317]] to <2 x float>
+// CHECK-NEXT:    [[TMP318:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV142:%.*]] = fpext <2 x half> [[TMP318]] to <2 x float>
+// CHECK-NEXT:    [[ADD143:%.*]] = fadd <2 x float> [[CONV142]], [[CONV141]]
+// CHECK-NEXT:    [[CONV144:%.*]] = fptrunc <2 x float> [[ADD143]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV144]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP319:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP320:%.*]] = load <2 x half>, ptr [[TMP319]], align 4
+// CHECK-NEXT:    [[TMP321:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP322:%.*]] = load <2 x half>, ptr [[TMP321]], align 4
+// CHECK-NEXT:    [[TMP323:%.*]] = call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> [[TMP320]], <2 x half> [[TMP322]])
+// CHECK-NEXT:    [[CONV145:%.*]] = fpext <2 x half> [[TMP323]] to <2 x float>
+// CHECK-NEXT:    [[TMP324:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV146:%.*]] = fpext <2 x half> [[TMP324]] to <2 x float>
+// CHECK-NEXT:    [[ADD147:%.*]] = fadd <2 x float> [[CONV146]], [[CONV145]]
+// CHECK-NEXT:    [[CONV148:%.*]] = fptrunc <2 x float> [[ADD147]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV148]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP325:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP326:%.*]] = load <2 x half>, ptr [[TMP325]], align 4
+// CHECK-NEXT:    [[TMP327:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP328:%.*]] = load <2 x half>, ptr [[TMP327]], align 4
+// CHECK-NEXT:    [[TMP329:%.*]] = call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> [[TMP326]], <2 x half> [[TMP328]])
+// CHECK-NEXT:    [[CONV149:%.*]] = fpext <2 x half> [[TMP329]] to <2 x float>
+// CHECK-NEXT:    [[TMP330:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV150:%.*]] = fpext <2 x half> [[TMP330]] to <2 x float>
+// CHECK-NEXT:    [[ADD151:%.*]] = fadd <2 x float> [[CONV150]], [[CONV149]]
+// CHECK-NEXT:    [[CONV152:%.*]] = fptrunc <2 x float> [[ADD151]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV152]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP331:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP332:%.*]] = call half @llvm.nvvm.ldg.global.f.f16.p0(ptr [[TMP331]], i32 2)
+// CHECK-NEXT:    [[CONV153:%.*]] = fpext half [[TMP332]] to float
+// CHECK-NEXT:    [[TMP333:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP334:%.*]] = load half, ptr [[TMP333]], align 2
+// CHECK-NEXT:    [[CONV154:%.*]] = fpext half [[TMP334]] to float
+// CHECK-NEXT:    [[ADD155:%.*]] = fadd float [[CONV154]], [[CONV153]]
+// CHECK-NEXT:    [[TMP335:%.*]] = fptrunc float [[ADD155]] to half
+// CHECK-NEXT:    store half [[TMP335]], ptr [[TMP333]], align 2
+// CHECK-NEXT:    [[TMP336:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP337:%.*]] = call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr [[TMP336]], i32 4)
+// CHECK-NEXT:    [[CONV156:%.*]] = fpext <2 x half> [[TMP337]] to <2 x float>
+// CHECK-NEXT:    [[TMP338:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV157:%.*]] = fpext <2 x half> [[TMP338]] to <2 x float>
+// CHECK-NEXT:    [[ADD158:%.*]] = fadd <2 x float> [[CONV157]], [[CONV156]]
+// CHECK-NEXT:    [[CONV159:%.*]] = fptrunc <2 x float> [[ADD158]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV159]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP339:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP340:%.*]] = call half @llvm.nvvm.ldu.global.f.f16.p0(ptr [[TMP339]], i32 2)
+// CHECK-NEXT:    [[CONV160:%.*]] = fpext half [[TMP340]] to float
+// CHECK-NEXT:    [[TMP341:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP342:%.*]] = load half, ptr [[TMP341]], align 2
+// CHECK-NEXT:    [[CONV161:%.*]] = fpext half [[TMP342]] to float
+// CHECK-NEXT:    [[ADD162:%.*]] = fadd float [[CONV161]], [[CONV160]]
+// CHECK-NEXT:    [[TMP343:%.*]] = fptrunc float [[ADD162]] to half
+// CHECK-NEXT:    store half [[TMP343]], ptr [[TMP341]], align 2
+// CHECK-NEXT:    [[TMP344:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP345:%.*]] = call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p0(ptr [[TMP344]], i32 4)
+// CHECK-NEXT:    [[CONV163:%.*]] = fpext <2 x half> [[TMP345]] to <2 x float>
+// CHECK-NEXT:    [[TMP346:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[CONV164:%.*]] = fpext <2 x half> [[TMP346]] to <2 x float>
+// CHECK-NEXT:    [[ADD165:%.*]] = fadd <2 x float> [[CONV164]], [[CONV163]]
+// CHECK-NEXT:    [[CONV166:%.*]] = fptrunc <2 x float> [[ADD165]] to <2 x half>
+// CHECK-NEXT:    store <2 x half> [[CONV166]], ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[TMP347:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x half> [[TMP347]], i32 0
+// CHECK-NEXT:    [[CONV167:%.*]] = fpext half [[VECEXT]] to float
+// CHECK-NEXT:    [[TMP348:%.*]] = load <2 x half>, ptr [[RESV2]], align 4
+// CHECK-NEXT:    [[VECEXT168:%.*]] = extractelement <2 x half> [[TMP348]], i32 1
+// CHECK-NEXT:    [[CONV169:%.*]] = fpext half [[VECEXT168]] to float
+// CHECK-NEXT:    [[ADD170:%.*]] = fadd float [[CONV167]], [[CONV169]]
+// CHECK-NEXT:    [[TMP349:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4
+// CHECK-NEXT:    [[TMP350:%.*]] = load half, ptr [[TMP349]], align 2
+// CHECK-NEXT:    [[CONV171:%.*]] = fpext half [[TMP350]] to float
+// CHECK-NEXT:    [[ADD172:%.*]] = fadd float [[CONV171]], [[ADD170]]
+// CHECK-NEXT:    [[TMP351:%.*]] = fptrunc float [[ADD172]] to half
+// CHECK-NEXT:    store half [[TMP351]], ptr [[TMP349]], align 2
+// CHECK-NEXT:    ret void
+//
 __device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
   __fp16v2 resv2 = {0, 0};
   *out += __nvvm_ex2_approx_f16(*(__fp16 *)a);
@@ -66,54 +659,3 @@ __device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
 
   *out += resv2[0] + resv2[1];
 }
-
-// CHECK_ERROR: error: __nvvm_ex2_approx_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_ex2_approx_f16x2 requires native half type support.
-
-// CHECK_ERROR: error: __nvvm_fma_rn_relu_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_ftz_relu_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_relu_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_ftz_relu_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_ftz_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_sat_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_ftz_sat_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_ftz_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_sat_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fma_rn_ftz_sat_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_ftz_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_nan_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_ftz_nan_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_ftz_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_nan_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_ftz_nan_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_xorsign_abs_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_ftz_xorsign_abs_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_nan_xorsign_abs_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_ftz_nan_xorsign_abs_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_xorsign_abs_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_ftz_xorsign_abs_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_nan_xorsign_abs_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmin_ftz_nan_xorsign_abs_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_ftz_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_nan_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_ftz_nan_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_ftz_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_nan_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_ftz_nan_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_xorsign_abs_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_ftz_xorsign_abs_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_nan_xorsign_abs_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_ftz_nan_xorsign_abs_f16 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_xorsign_abs_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_ftz_xorsign_abs_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_nan_xorsign_abs_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_fmax_ftz_nan_xorsign_abs_f16x2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_ldg_h requires native half type support.
-// CHECK_ERROR: error: __nvvm_ldg_h2 requires native half type support.
-// CHECK_ERROR: error: __nvvm_ldu_h requires native half type support.
-// CHECK_ERROR: error: __nvvm_ldu_h2 requires native half type support.

From 3a1cd6b5a7923d7845b0cf5664cdc08cae7aa402 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 9 Jul 2024 14:43:55 -0500
Subject: [PATCH 2/2] [Clang] Do not emit intrinsic math functions on GPU
 targets

Summary:
Currently, the GPU gets its math support from wrapper headers that
eagerly replace libcalls with calls to the vendor's math library, e.g.
```
// __clang_cuda_math.h
[[gnu::always_inline]] double sin(double __x) { return __nv_sin(__x); }
```

However, we want to move away from including these headers. When they
are not included, the lack of `errno` on GPU targets allows these
libcalls to be transformed into intrinsic calls. The backend may then
fail to support those intrinsics; see https://godbolt.org/z/oKvTevaE1.
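
As an illustration (a sketch, not taken from the patch), the failure
mode looks like this:
```
double sin(double);

// With no `errno` on the device and no wrapper header mapping sin() to
// __nv_sin(), Clang could fold this libcall into an intrinsic call such
// as `call double @llvm.sin.f64(...)`, which the GPU backend may then
// fail to select.
double device_sin(double x) { return sin(x); }
```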

Even when these functions are supported, we still want to use regular
libcalls for now so that LTO linking can replace the calls before they
reach the backend.

This patch changes the logic so that intrinsics are not emitted for the
standard math library functions. This means that `sin` will not become
an intrinsic, but `__builtin_sin` still will. A better long-term
solution would be a pass that custom-lowers all of these before LTO
linking, if possible.
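
A sketch of the resulting behavior on GPU targets (illustrative; the
new test below exercises the same distinction):
```
double sin(double);

double uses_libcall(double x) {
  return sin(x);           // remains `call double @sin(...)`
}

double uses_intrinsic(double x) {
  return __builtin_sin(x); // still becomes `call double @llvm.sin.f64(...)`
}
```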
---
 clang/lib/Basic/Targets/NVPTX.h        |   2 +
 clang/lib/CodeGen/CGBuiltin.cpp        |   6 +
 clang/test/CodeGen/gpu-math-libcalls.c | 814 +++++++++++++++++++++++++
 3 files changed, 822 insertions(+)
 create mode 100644 clang/test/CodeGen/gpu-math-libcalls.c

diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index 9a985e46e22da..be43bb04fa2ed 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -75,6 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
 
   ArrayRef<Builtin::Info> getTargetBuiltins() const override;
 
+  bool useFP16ConversionIntrinsics() const override { return false; }
+
   bool
   initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
                  StringRef CPU,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6cc0d9485720c..89c27147a2bd9 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2637,6 +2637,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       GenerateIntrinsics =
           ConstWithoutErrnoOrExceptions && ErrnoOverridenToFalseWithOpt;
   }
+  // The GPU targets do not want math intrinsics to reach the backend.
+  // TODO: We should add a custom pass to lower these early enough for LTO.
+  if (getTarget().getTriple().isNVPTX() || getTarget().getTriple().isAMDGPU())
+    GenerateIntrinsics = !getContext().BuiltinInfo.isPredefinedLibFunction(
+        BuiltinIDIfNoAsmLabel);
+
   if (GenerateIntrinsics) {
     switch (BuiltinIDIfNoAsmLabel) {
     case Builtin::BIceil:
diff --git a/clang/test/CodeGen/gpu-math-libcalls.c b/clang/test/CodeGen/gpu-math-libcalls.c
new file mode 100644
index 0000000000000..c7f819efd4d6e
--- /dev/null
+++ b/clang/test/CodeGen/gpu-math-libcalls.c
@@ -0,0 +1,814 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa %s -emit-llvm -o - | FileCheck %s --check-prefix AMDGPU
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda %s -emit-llvm -o - | FileCheck %s --check-prefix NVPTX
+
+double ceil(double);
+float ceilf(float);
+double copysign(double, double);
+float copysignf(float, float);
+double cos(double);
+float cosf(float);
+float coshf(float);
+double exp(double);
+double exp2(double);
+float exp2f(float);
+double exp10(double);
+float exp10f(float);
+float expf(float);
+double fabs(double);
+float fabsf(float);
+double floor(double);
+float floorf(float);
+double fma(double, double, double);
+float fmaf(float, float, float);
+double fmax(double, double);
+float fmaxf(float, float);
+double fmin(double, double);
+float fminf(float, float);
+double fmod(double, double);
+float fmodf(float, float);
+double ldexp(double, int);
+float ldexpf(float, int);
+long long llround(double);
+long long llroundf(float);
+double log(double);
+double log10(double);
+float log10f(float);
+double log2(double);
+float log2f(float);
+float logf(float);
+long lrint(double);
+long lrintf(float);
+long lround(double);
+long lroundf(float);
+double nearbyint(double);
+float nearbyintf(float);
+double pow(double, double);
+float powf(float, float);
+double rint(double);
+float rintf(float);
+double round(double);
+float roundf(float);
+double roundeven(double);
+float roundevenf(float);
+double sin(double);
+float sinf(float);
+double sqrt(double);
+float sqrtf(float);
+double tan(double);
+float tanf(float);
+float tanhf(float);
+double trunc(double);
+float truncf(float);
+
+// AMDGPU-LABEL: define dso_local void @ceil_test(
+// AMDGPU-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @ceil(double noundef 0.000000e+00) #[[ATTR4:[0-9]+]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @ceilf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.ceil.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.ceil.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.ceil.f16(half 0xH0000)
+// AMDGPU-NEXT:    [[TMP3:%.*]] = call double @llvm.ceil.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @ceil_test(
+// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @ceil(double noundef 0.000000e+00) #[[ATTR4:[0-9]+]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @ceilf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.ceil.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.ceil.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.ceil.f16(half 0xH0000)
+// NVPTX-NEXT:    [[TMP3:%.*]] = call double @llvm.ceil.f64(double 0.000000e+00)
+// NVPTX-NEXT:    ret void
+//
+void ceil_test(void) {
+  (void)ceil(0.0);
+  (void)ceilf(0.f);
+  (void)__builtin_ceil(0.);
+  (void)__builtin_ceilf(0.f);
+  (void)__builtin_ceilf16((_Float16)0.);
+  (void)__builtin_ceill(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @copysign_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @copysign(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @copysignf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @copysign_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @copysign(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @copysignf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void copysign_test(void) {
+  (void)copysign(0., 0.);
+  (void)copysignf(0.f, 0.f);
+  (void)__builtin_copysign(0., 0.);
+  (void)__builtin_copysignf(0.f, 0.f);
+  (void)__builtin_copysignf16(0., 0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @cos_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @cos(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @cosf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.cos.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.cos.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.cos.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @cos_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @cos(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @cosf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.cos.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.cos.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.cos.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void cos_test(void) {
+  (void)cos(0.);
+  (void)cosf(0.f);
+  (void)__builtin_cos(0.);
+  (void)__builtin_cosf(0.f);
+  (void)__builtin_cosf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @exp_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @exp(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @expf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.exp.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.exp.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.exp.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @exp_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @exp(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @expf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.exp.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.exp.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.exp.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void exp_test(void) {
+  (void)exp(0.);
+  (void)expf(0.f);
+  (void)__builtin_exp(0.);
+  (void)__builtin_expf(0.f);
+  (void)__builtin_expf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @exp2_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @exp2(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @exp2f(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.exp2.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.exp2.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.exp2.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @exp2_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @exp2(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @exp2f(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.exp2.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.exp2.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.exp2.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void exp2_test(void) {
+  (void)exp2(0.);
+  (void)exp2f(0.f);
+  (void)__builtin_exp2(0.);
+  (void)__builtin_exp2f(0.f);
+  (void)__builtin_exp2f16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @exp10_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @exp10(double noundef 0.000000e+00)
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @exp10f(float noundef 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.exp10.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.exp10.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.exp10.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @exp10_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @exp10(double noundef 0.000000e+00)
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @exp10f(float noundef 0.000000e+00)
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.exp10.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.exp10.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.exp10.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void exp10_test(void) {
+  (void)exp10(0.);
+  (void)exp10f(0.f);
+  (void)__builtin_exp10(0.);
+  (void)__builtin_exp10f(0.f);
+  (void)__builtin_exp10f16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @fabs_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @fabs(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @fabsf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call half @llvm.fabs.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @fabs_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @fabs(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @fabsf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call half @llvm.fabs.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void fabs_test(void) {
+  (void)fabs(0.);
+  (void)fabsf(0.f);
+  (void)__builtin_fabs(0.);
+  (void)__builtin_fabsf(0.f);
+  (void)__builtin_fabsf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @floor_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @floor(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @floorf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.floor.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.floor.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.floor.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @floor_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @floor(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @floorf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.floor.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.floor.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.floor.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void floor_test(void) {
+  (void)floor(0.);
+  (void)floorf(0.f);
+  (void)__builtin_floor(0.);
+  (void)__builtin_floorf(0.f);
+  (void)__builtin_floorf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @fma_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @fma(double noundef 0.000000e+00, double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @fmaf(float noundef 0.000000e+00, float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.fma.f32(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.fma.f16(half 0xH0000, half 0xH0000, half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @fma_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @fma(double noundef 0.000000e+00, double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @fmaf(float noundef 0.000000e+00, float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.fma.f32(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.fma.f16(half 0xH0000, half 0xH0000, half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void fma_test(void) {
+  (void)fma(0., 0., 0.);
+  (void)fmaf(0.f, 0.f, 0.f);
+  (void)__builtin_fma(0., 0., 0.);
+  (void)__builtin_fmaf(0.f, 0.f, 0.f);
+  (void)__builtin_fmaf16(0., 0., 0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @fmax_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @fmax(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @fmaxf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @fmax_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @fmax(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @fmaxf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    ret void
+//
+void fmax_test(void) {
+  (void)fmax(0., 0.);
+  (void)fmaxf(0.f, 0.f);
+  (void)__builtin_fmax(0., 0.);
+  (void)__builtin_fmaxf(0.f, 0.f);
+  (void)__builtin_fmaxf16(0., 0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @fmin_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @fmin(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @fminf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @fmin_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @fmin(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @fminf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    ret void
+//
+void fmin_test(void) {
+  (void)fmin(0., 0.);
+  (void)fminf(0.f, 0.f);
+  (void)__builtin_fmin(0., 0.);
+  (void)__builtin_fminf(0.f, 0.f);
+  (void)__builtin_fminf16(0., 0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @fmod_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @fmod(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @fmodf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @fmod_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @fmod(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @fmodf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    ret void
+//
+void fmod_test(void) {
+  (void)fmod(0., 0.);
+  (void)fmodf(0.f, 0.f);
+  (void)__builtin_fmod(0., 0.);
+  (void)__builtin_fmodf(0.f, 0.f);
+  (void)__builtin_fmodf16(0., 0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @log_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @log(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @logf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.log.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.log.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.log.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @log_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @log(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @logf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.log.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.log.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.log.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void log_test(void) {
+  (void)log(0.);
+  (void)logf(0.f);
+  (void)__builtin_log(0.);
+  (void)__builtin_logf(0.f);
+  (void)__builtin_logf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @log10_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @log10(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @log10f(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.log10.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.log10.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.log10.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @log10_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @log10(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @log10f(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.log10.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.log10.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.log10.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void log10_test(void) {
+  (void)log10(0.);
+  (void)log10f(0.f);
+  (void)__builtin_log10(0.);
+  (void)__builtin_log10f(0.f);
+  (void)__builtin_log10f16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @log2_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @log2(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @log2f(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.log2.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.log2.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.log2.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @log2_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @log2(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @log2f(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.log2.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.log2.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.log2.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void log2_test(void) {
+  (void)log2(0.);
+  (void)log2f(0.f);
+  (void)__builtin_log2(0.);
+  (void)__builtin_log2f(0.f);
+  (void)__builtin_log2f16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @nearbyint_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @nearbyint(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @nearbyintf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.nearbyint.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.nearbyint.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @nearbyint_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @nearbyint(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @nearbyintf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.nearbyint.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.nearbyint.f32(float 0.000000e+00)
+// NVPTX-NEXT:    ret void
+//
+void nearbyint_test(void) {
+  (void)nearbyint(0.);
+  (void)nearbyintf(0.f);
+  (void)__builtin_nearbyint(0.);
+  (void)__builtin_nearbyintf(0.f);
+}
+
+// AMDGPU-LABEL: define dso_local void @pow_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @pow(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @powf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.pow.f64(double 0.000000e+00, double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.pow.f32(float 0.000000e+00, float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.pow.f16(half 0xH0000, half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @pow_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @pow(double noundef 0.000000e+00, double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @powf(float noundef 0.000000e+00, float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.pow.f64(double 0.000000e+00, double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.pow.f32(float 0.000000e+00, float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.pow.f16(half 0xH0000, half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void pow_test(void) {
+  (void)pow(0., 0.);
+  (void)powf(0.f, 0.f);
+  (void)__builtin_pow(0., 0.);
+  (void)__builtin_powf(0.f, 0.f);
+  (void)__builtin_powf16(0., 0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @rint_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @rint(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @rintf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.rint.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.rint.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.rint.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @rint_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @rint(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @rintf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.rint.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.rint.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.rint.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void rint_test(void) {
+  (void)rint(0.);
+  (void)rintf(0.f);
+  (void)__builtin_rint(0.);
+  (void)__builtin_rintf(0.f);
+  (void)__builtin_rintf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @round_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @round(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @roundf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.round.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.round.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.round.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @round_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @round(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @roundf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.round.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.round.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.round.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void round_test(void) {
+  (void)round(0.);
+  (void)roundf(0.f);
+  (void)__builtin_round(0.);
+  (void)__builtin_roundf(0.f);
+  (void)__builtin_roundf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @roundeven_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @roundeven(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @roundevenf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.roundeven.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.roundeven.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.roundeven.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @roundeven_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @roundeven(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @roundevenf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.roundeven.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.roundeven.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.roundeven.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void roundeven_test(void) {
+  (void)roundeven(0.);
+  (void)roundevenf(0.f);
+  (void)__builtin_roundeven(0.);
+  (void)__builtin_roundevenf(0.f);
+  (void)__builtin_roundevenf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @sin_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @sin(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @sinf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.sin.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.sin.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.sin.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @sin_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @sin(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @sinf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.sin.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.sin.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.sin.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void sin_test(void) {
+  (void)sin(0.);
+  (void)sinf(0.f);
+  (void)__builtin_sin(0.);
+  (void)__builtin_sinf(0.f);
+  (void)__builtin_sinf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @sqrt_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @sqrt(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @sqrtf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.sqrt.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.sqrt.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.sqrt.f16(half 0xH0000)
+// AMDGPU-NEXT:    [[TMP3:%.*]] = call double @llvm.sqrt.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @sqrt_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @sqrt(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @sqrtf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.sqrt.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.sqrt.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.sqrt.f16(half 0xH0000)
+// NVPTX-NEXT:    [[TMP3:%.*]] = call double @llvm.sqrt.f64(double 0.000000e+00)
+// NVPTX-NEXT:    ret void
+//
+void sqrt_test(void) {
+  (void)sqrt(0.);
+  (void)sqrtf(0.f);
+  (void)__builtin_sqrt(0.);
+  (void)__builtin_sqrtf(0.f);
+  (void)__builtin_sqrtf16(0.);
+  (void)__builtin_elementwise_sqrt(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @tan_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @tan(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @tanf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.tan.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.tan.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.tan.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @tan_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @tan(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @tanf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.tan.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.tan.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.tan.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void tan_test(void) {
+  (void)tan(0.);
+  (void)tanf(0.f);
+  (void)__builtin_tan(0.);
+  (void)__builtin_tanf(0.f);
+  (void)__builtin_tanf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @trunc_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @trunc(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @truncf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.trunc.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.trunc.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.trunc.f16(half 0xH0000)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @trunc_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @trunc(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @truncf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.trunc.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.trunc.f32(float 0.000000e+00)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.trunc.f16(half 0xH0000)
+// NVPTX-NEXT:    ret void
+//
+void trunc_test(void) {
+  (void)trunc(0.);
+  (void)truncf(0.f);
+  (void)__builtin_trunc(0.);
+  (void)__builtin_truncf(0.f);
+  (void)__builtin_truncf16(0.);
+}
+
+// AMDGPU-LABEL: define dso_local void @lround_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i64 @lround(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call i64 @lroundf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call i64 @llvm.lround.i64.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call i64 @llvm.lround.i64.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @lround_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call i64 @lround(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call i64 @lroundf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call i64 @llvm.lround.i64.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call i64 @llvm.lround.i64.f32(float 0.000000e+00)
+// NVPTX-NEXT:    ret void
+//
+void lround_test(void) {
+  (void)lround(0.);
+  (void)lroundf(0.f);
+  (void)__builtin_lround(0.);
+  (void)__builtin_lroundf(0.f);
+}
+
+// AMDGPU-LABEL: define dso_local void @llround_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i64 @llround(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call i64 @llroundf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call i64 @llvm.llround.i64.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call i64 @llvm.llround.i64.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @llround_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call i64 @llround(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call i64 @llroundf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call i64 @llvm.llround.i64.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call i64 @llvm.llround.i64.f32(float 0.000000e+00)
+// NVPTX-NEXT:    ret void
+//
+void llround_test(void) {
+  (void)llround(0.);
+  (void)llroundf(0.f);
+  (void)__builtin_llround(0.);
+  (void)__builtin_llroundf(0.f);
+}
+
+// AMDGPU-LABEL: define dso_local void @lrint_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i64 @lrint(double noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call i64 @lrintf(float noundef 0.000000e+00) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call i64 @llvm.lrint.i64.f64(double 0.000000e+00)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call i64 @llvm.lrint.i64.f32(float 0.000000e+00)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @lrint_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call i64 @lrint(double noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call i64 @lrintf(float noundef 0.000000e+00) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call i64 @llvm.lrint.i64.f64(double 0.000000e+00)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call i64 @llvm.lrint.i64.f32(float 0.000000e+00)
+// NVPTX-NEXT:    ret void
+//
+void lrint_test(void) {
+  (void)lrint(0.);
+  (void)lrintf(0.f);
+  (void)__builtin_lrint(0.);
+  (void)__builtin_lrintf(0.f);
+}
+
+// AMDGPU-LABEL: define dso_local void @__builtin_ldexp_test(
+// AMDGPU-SAME: ) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[CALL:%.*]] = call double @ldexp(double noundef 0.000000e+00, i32 noundef 0) #[[ATTR4]]
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call float @ldexpf(float noundef 0.000000e+00, i32 noundef 0) #[[ATTR4]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = call double @llvm.ldexp.f64.i32(double 0.000000e+00, i32 0)
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float 0.000000e+00, i32 0)
+// AMDGPU-NEXT:    [[TMP2:%.*]] = call half @llvm.ldexp.f16.i32(half 0xH0000, i32 0)
+// AMDGPU-NEXT:    ret void
+//
+// NVPTX-LABEL: define dso_local void @__builtin_ldexp_test(
+// NVPTX-SAME: ) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[CALL:%.*]] = call double @ldexp(double noundef 0.000000e+00, i32 noundef 0) #[[ATTR4]]
+// NVPTX-NEXT:    [[CALL1:%.*]] = call float @ldexpf(float noundef 0.000000e+00, i32 noundef 0) #[[ATTR4]]
+// NVPTX-NEXT:    [[TMP0:%.*]] = call double @llvm.ldexp.f64.i32(double 0.000000e+00, i32 0)
+// NVPTX-NEXT:    [[TMP1:%.*]] = call float @llvm.ldexp.f32.i32(float 0.000000e+00, i32 0)
+// NVPTX-NEXT:    [[TMP2:%.*]] = call half @llvm.ldexp.f16.i32(half 0xH0000, i32 0)
+// NVPTX-NEXT:    ret void
+//
+void __builtin_ldexp_test(void) {
+  (void)ldexp(0., 0);
+  (void)ldexpf(0.f, 0);
+  (void)__builtin_ldexp(0., 0);
+  (void)__builtin_ldexpf(0.f, 0);
+  (void)__builtin_ldexpf16(0., 0);
+}


