[llvm] [NVPTX] Use fast-math flags when lowering sin, cos, frem (PR #133121)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 26 16:27:18 PDT 2025
================
@@ -0,0 +1,286 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s --enable-unsafe-fp-math | FileCheck %s --check-prefixes=FAST
+; RUN: llc < %s | FileCheck %s --check-prefixes=NORMAL
+
+
+target triple = "nvptx64-unknown-cuda"
+
+; frem on half. Both runs expect the operands to be widened with cvt.f32.f16,
+; the remainder to be formed in f32 as fma(-cvt.rzi(a / b), b, a), and the
+; result to be narrowed again with cvt.rn.f16.f32.
+; With --enable-unsafe-fp-math the division is expected to be div.approx.f32
+; and the fma result is returned directly. In the default run the division is
+; expected to be div.rn.f32, followed by testp.infinite on the divisor and a
+; selp that picks the dividend over the fma result when that predicate is set.
+define half @frem_f16(half %a, half %b) {
+; FAST-LABEL: frem_f16(
+; FAST: {
+; FAST-NEXT: .reg .b16 %rs<4>;
+; FAST-NEXT: .reg .f32 %f<7>;
+; FAST-EMPTY:
+; FAST-NEXT: // %bb.0:
+; FAST-NEXT: ld.param.b16 %rs1, [frem_f16_param_0];
+; FAST-NEXT: ld.param.b16 %rs2, [frem_f16_param_1];
+; FAST-NEXT: cvt.f32.f16 %f1, %rs2;
+; FAST-NEXT: cvt.f32.f16 %f2, %rs1;
+; FAST-NEXT: div.approx.f32 %f3, %f2, %f1;
+; FAST-NEXT: cvt.rzi.f32.f32 %f4, %f3;
+; FAST-NEXT: neg.f32 %f5, %f4;
+; FAST-NEXT: fma.rn.f32 %f6, %f5, %f1, %f2;
+; FAST-NEXT: cvt.rn.f16.f32 %rs3, %f6;
+; FAST-NEXT: st.param.b16 [func_retval0], %rs3;
+; FAST-NEXT: ret;
+;
+; NORMAL-LABEL: frem_f16(
+; NORMAL: {
+; NORMAL-NEXT: .reg .pred %p<2>;
+; NORMAL-NEXT: .reg .b16 %rs<4>;
+; NORMAL-NEXT: .reg .f32 %f<8>;
+; NORMAL-EMPTY:
+; NORMAL-NEXT: // %bb.0:
+; NORMAL-NEXT: ld.param.b16 %rs1, [frem_f16_param_0];
+; NORMAL-NEXT: ld.param.b16 %rs2, [frem_f16_param_1];
+; NORMAL-NEXT: cvt.f32.f16 %f1, %rs2;
+; NORMAL-NEXT: cvt.f32.f16 %f2, %rs1;
+; NORMAL-NEXT: div.rn.f32 %f3, %f2, %f1;
+; NORMAL-NEXT: cvt.rzi.f32.f32 %f4, %f3;
+; NORMAL-NEXT: neg.f32 %f5, %f4;
+; NORMAL-NEXT: fma.rn.f32 %f6, %f5, %f1, %f2;
+; NORMAL-NEXT: testp.infinite.f32 %p1, %f1;
+; NORMAL-NEXT: selp.f32 %f7, %f2, %f6, %p1;
+; NORMAL-NEXT: cvt.rn.f16.f32 %rs3, %f7;
+; NORMAL-NEXT: st.param.b16 [func_retval0], %rs3;
+; NORMAL-NEXT: ret;
+ %r = frem half %a, %b
+ ret half %r
+}
+
+; frem on float. Both runs expect the remainder to be formed as
+; fma(-cvt.rzi(a / b), b, a) directly in f32.
+; With --enable-unsafe-fp-math the division is expected to be div.approx.f32
+; and the fma result is returned directly. In the default run the division is
+; expected to be div.rn.f32, followed by testp.infinite on the divisor and a
+; selp that picks the dividend over the fma result when that predicate is set.
+define float @frem_f32(float %a, float %b) {
+; FAST-LABEL: frem_f32(
+; FAST: {
+; FAST-NEXT: .reg .f32 %f<7>;
+; FAST-EMPTY:
+; FAST-NEXT: // %bb.0:
+; FAST-NEXT: ld.param.f32 %f1, [frem_f32_param_0];
+; FAST-NEXT: ld.param.f32 %f2, [frem_f32_param_1];
+; FAST-NEXT: div.approx.f32 %f3, %f1, %f2;
+; FAST-NEXT: cvt.rzi.f32.f32 %f4, %f3;
+; FAST-NEXT: neg.f32 %f5, %f4;
+; FAST-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1;
+; FAST-NEXT: st.param.f32 [func_retval0], %f6;
+; FAST-NEXT: ret;
+;
+; NORMAL-LABEL: frem_f32(
+; NORMAL: {
+; NORMAL-NEXT: .reg .pred %p<2>;
+; NORMAL-NEXT: .reg .f32 %f<8>;
+; NORMAL-EMPTY:
+; NORMAL-NEXT: // %bb.0:
+; NORMAL-NEXT: ld.param.f32 %f1, [frem_f32_param_0];
+; NORMAL-NEXT: ld.param.f32 %f2, [frem_f32_param_1];
+; NORMAL-NEXT: div.rn.f32 %f3, %f1, %f2;
+; NORMAL-NEXT: cvt.rzi.f32.f32 %f4, %f3;
+; NORMAL-NEXT: neg.f32 %f5, %f4;
+; NORMAL-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1;
+; NORMAL-NEXT: testp.infinite.f32 %p1, %f2;
+; NORMAL-NEXT: selp.f32 %f7, %f1, %f6, %p1;
+; NORMAL-NEXT: st.param.f32 [func_retval0], %f7;
+; NORMAL-NEXT: ret;
+ %r = frem float %a, %b
+ ret float %r
+}
+
+; frem on double. Both runs expect div.rn.f64 (no approximate division is
+; checked for f64 here) and the remainder formed as
+; fma(-cvt.rzi(a / b), b, a).
+; The two runs differ only in the tail: the default run additionally expects
+; testp.infinite on the divisor and a selp that picks the dividend over the
+; fma result when that predicate is set, while the --enable-unsafe-fp-math run
+; returns the fma result directly.
+define double @frem_f64(double %a, double %b) {
+; FAST-LABEL: frem_f64(
+; FAST: {
+; FAST-NEXT: .reg .f64 %fd<7>;
+; FAST-EMPTY:
+; FAST-NEXT: // %bb.0:
+; FAST-NEXT: ld.param.f64 %fd1, [frem_f64_param_0];
+; FAST-NEXT: ld.param.f64 %fd2, [frem_f64_param_1];
+; FAST-NEXT: div.rn.f64 %fd3, %fd1, %fd2;
+; FAST-NEXT: cvt.rzi.f64.f64 %fd4, %fd3;
+; FAST-NEXT: neg.f64 %fd5, %fd4;
+; FAST-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
+; FAST-NEXT: st.param.f64 [func_retval0], %fd6;
+; FAST-NEXT: ret;
+;
+; NORMAL-LABEL: frem_f64(
+; NORMAL: {
+; NORMAL-NEXT: .reg .pred %p<2>;
+; NORMAL-NEXT: .reg .f64 %fd<8>;
+; NORMAL-EMPTY:
+; NORMAL-NEXT: // %bb.0:
+; NORMAL-NEXT: ld.param.f64 %fd1, [frem_f64_param_0];
+; NORMAL-NEXT: ld.param.f64 %fd2, [frem_f64_param_1];
+; NORMAL-NEXT: div.rn.f64 %fd3, %fd1, %fd2;
+; NORMAL-NEXT: cvt.rzi.f64.f64 %fd4, %fd3;
+; NORMAL-NEXT: neg.f64 %fd5, %fd4;
+; NORMAL-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
+; NORMAL-NEXT: testp.infinite.f64 %p1, %fd2;
+; NORMAL-NEXT: selp.f64 %fd7, %fd1, %fd6, %p1;
+; NORMAL-NEXT: st.param.f64 [func_retval0], %fd7;
+; NORMAL-NEXT: ret;
+ %r = frem double %a, %b
+ ret double %r
+}
+
+define half @frem_f16_ninf(half %a, half %b) {
+; FAST-LABEL: frem_f16_ninf(
+; FAST: {
+; FAST-NEXT: .reg .b16 %rs<4>;
+; FAST-NEXT: .reg .f32 %f<7>;
+; FAST-EMPTY:
+; FAST-NEXT: // %bb.0:
+; FAST-NEXT: ld.param.b16 %rs1, [frem_f16_ninf_param_0];
+; FAST-NEXT: ld.param.b16 %rs2, [frem_f16_ninf_param_1];
+; FAST-NEXT: cvt.f32.f16 %f1, %rs2;
+; FAST-NEXT: cvt.f32.f16 %f2, %rs1;
+; FAST-NEXT: div.approx.f32 %f3, %f2, %f1;
+; FAST-NEXT: cvt.rzi.f32.f32 %f4, %f3;
+; FAST-NEXT: neg.f32 %f5, %f4;
+; FAST-NEXT: fma.rn.f32 %f6, %f5, %f1, %f2;
----------------
AlexMaclean wrote:
Makes sense, I've switched the test to run with sm_60.
https://github.com/llvm/llvm-project/pull/133121
More information about the llvm-commits
mailing list