[llvm] [X86][SelectionDAG] - Add support for llvm.canonicalize intrinsic (PR #106370)
Pawan Nirpal via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 04:14:59 PDT 2024
https://github.com/pawan-nirpal-031 created https://github.com/llvm/llvm-project/pull/106370
Enable support for fcanonicalize intrinsic lowering.
>From a824dede98e9a979dd432d0a72b01ad730474245 Mon Sep 17 00:00:00 2001
From: Pawan Anil Nirpal <pawan.anil.nirpal at intel.com>
Date: Wed, 28 Aug 2024 13:09:30 +0200
Subject: [PATCH] [X86][SelectionDAG] - Add support for llvm.canonicalize
intrinsic
Enable support for fcanonicalize intrinsic lowering.
---
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 50 +++
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 46 +++
.../CodeGen/X86/canonicalize-constants.ll | 210 +++++++++++++
.../CodeGen/X86/canonicalize-subnormals.ll | 287 ++++++++++++++++++
llvm/test/CodeGen/X86/canonicalize-vars.ll | 193 ++++++++++++
5 files changed, 786 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/canonicalize-constants.ll
create mode 100644 llvm/test/CodeGen/X86/canonicalize-subnormals.ll
create mode 100644 llvm/test/CodeGen/X86/canonicalize-vars.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 74e3a898569bea..c1679b1002df5e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1275,6 +1275,56 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
}
}
break;
+ case ISD::FCANONICALIZE: {
+ const Triple &TT = DAG.getTarget().getTargetTriple();
+ if (TT.getArch() == Triple::x86 || TT.getArch() == Triple::x86_64) {
+ SDValue Operand = Node->getOperand(0);
+ SDLoc dl(Node);
+ EVT VT = Operand.getValueType();
+
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Operand)) {
+ const APFloat &C = CFP->getValueAPF();
+ if (C.isDenormal()) {
+ DenormalMode Mode =
+ DAG.getMachineFunction().getDenormalMode(C.getSemantics());
+ assert((Mode != DenormalMode::getPositiveZero()) &&
+ "Positive denormal mode is not valid for X86 target.");
+ if (Mode == DenormalMode::getPreserveSign()) {
+ SDValue SDZero =
+ DAG.getConstantFP((C.isNegative() ? -0.0 : 0.0), dl, VT);
+ ConstantFPSDNode *ZeroConstFP = cast<ConstantFPSDNode>(SDZero);
+ SDValue CanonZeroFPLoad = ExpandConstantFP(ZeroConstFP, true);
+ DAG.ReplaceAllUsesWith(Node, CanonZeroFPLoad.getNode());
+ LLVM_DEBUG(dbgs()
+ << "Legalized Denormal under mode PreserveSign\n");
+ return;
+ } else if (Mode == DenormalMode::getIEEE()) {
+ DAG.ReplaceAllUsesWith(Node, Operand.getNode());
+ LLVM_DEBUG(dbgs() << "Legalized Denormal under mode IEEE\n");
+ return;
+ }
+ } else if (C.isNaN() && C.isSignaling()) {
+ APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
+ SDValue QuitNaN = DAG.getConstantFP(CanonicalQNaN, dl, VT);
+ ConstantFPSDNode *QNaNConstFP = cast<ConstantFPSDNode>(QuitNaN);
+ SDValue QNanLoad = ExpandConstantFP(QNaNConstFP, true);
+ DAG.ReplaceAllUsesWith(Node, QNanLoad.getNode());
+ LLVM_DEBUG(dbgs() << "Legalized Signaling NaN to Quiet NaN\n");
+ return;
+ }
+ } else if (Operand.isUndef()) {
+ APFloat CanonicalQNaN = APFloat::getQNaN(VT.getFltSemantics());
+ SDValue QuitNaN = DAG.getConstantFP(CanonicalQNaN, dl, VT);
+ ConstantFPSDNode *QNaNConstFP = cast<ConstantFPSDNode>(QuitNaN);
+ SDValue QNanLoad = ExpandConstantFP(QNaNConstFP, true);
+ DAG.ReplaceAllUsesWith(Node, QNanLoad.getNode());
+ LLVM_DEBUG(dbgs() << "Legalized Undef to Quiet NaN\n");
+ return;
+ }
+ break;
+ }
+ break;
+ }
case ISD::FSHL:
case ISD::FSHR:
case ISD::SRL_PARTS:
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d0a54ab8993c26..4bb8c9afd23edc 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5271,6 +5271,52 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case ISD::FCANONICALIZE: {
+ SDValue Operand = Node->getOperand(0);
+ EVT VT = Node->getValueType(0);
+
+ // Perform canonicalization for constants. Replace the operand by a load
+ // from the constant pool for this constant. At this point subnormal values
+ // like denormals and sNaNs have already been canonicalized, so there is no
+ // need to deal with those cases here.
+ if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Operand)) {
+ const X86TargetLowering *X86Lowering =
+ static_cast<const X86TargetLowering *>(TLI);
+ if (const Constant *CV = X86Lowering->getTargetConstantFromLoad(Load)) {
+ const ConstantFP *CFP = dyn_cast<ConstantFP>(CV);
+ if (CFP) {
+ ReplaceNode(Node, Load);
+ return;
+ }
+ }
+ }
+
+ // Canonicalize normal non-constant/non-undef FP Nodes.
+ SDValue MulNode;
+ SDValue One;
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ One = CurDAG->getConstantFP(1.0f, dl, VT);
+ } else if (VT == MVT::f80) {
+ APFloat Val = APFloat::getOne(APFloat::x87DoubleExtended());
+ One = CurDAG->getConstantFP(Val, dl, VT);
+ } else if (VT == MVT::f16) {
+ APFloat Val(APFloat::IEEEhalf(), "1.0");
+ One = CurDAG->getConstantFP(Val, dl, VT);
+ } else if (VT == MVT::bf16) {
+ APFloat Val(APFloat::BFloat(), "1.0");
+ One = CurDAG->getConstantFP(Val, dl, VT);
+ } else {
+ // TODO: Would it be better to assert when we encounter an unknown FP type
+ // than to just replace the node with its operand? This might be our last
+ // attempt at legalization.
+ ReplaceNode(Node, Operand.getNode());
+ return;
+ }
+ // TODO : Follow-up with tablegen pattern to generate mul * 1.0.
+ MulNode = CurDAG->getNode(ISD::FMUL, dl, VT, Operand, One);
+ ReplaceNode(Node, MulNode.getNode());
+ return;
+ }
case ISD::BRIND:
case X86ISD::NT_BRIND: {
if (Subtarget->isTargetNaCl())
diff --git a/llvm/test/CodeGen/X86/canonicalize-constants.ll b/llvm/test/CodeGen/X86/canonicalize-constants.ll
new file mode 100644
index 00000000000000..b71c74bcd4472b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/canonicalize-constants.ll
@@ -0,0 +1,210 @@
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 < %s | FileCheck %s
+
+define float @canon_fp32() {
+ ; CHECK-LABEL: .LCPI0_0:
+ ; CHECK: .long 0x40400000 # float 3
+ ; CHECK-LABEL: canon_fp32
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovss .LCPI0_0(%rip), %xmm0 # xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+ ; CHECK-NEXT: retq
+ %canonicalized = call float @llvm.canonicalize.f32(float 3.0)
+ ret float %canonicalized
+}
+
+define half @canon_fp16() {
+ ; CHECK-LABEL: .LCPI1_0:
+ ; CHECK: .short 0x4200 # half 3
+ ; CHECK-LABEL: canon_fp16
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsh .LCPI1_0(%rip), %xmm0
+ ; CHECK-NEXT: retq
+ %canonicalized = call half @llvm.canonicalize.f16(half 0xH4200) ; half 3.0
+ ret half %canonicalized
+}
+
+define double @canon_fp64() {
+ ; CHECK-LABEL: .LCPI2_0:
+ ; CHECK: .quad 0x4008000000000000 # double 3
+ ; CHECK-LABEL: canon_fp64
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd .LCPI2_0(%rip), %xmm0
+ ; CHECK-NEXT: retq
+ %canonicalized = call double @llvm.canonicalize.f64(double 3.0)
+ ret double %canonicalized
+}
+
+define x86_fp80 @canon_fp80() {
+ ; CHECK-LABEL: .LCPI3_0:
+ ; CHECK: .long 0x42b40000 # float 90
+ ; CHECK-LABEL: canon_fp80
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: flds .LCPI3_0(%rip)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000) ; 90.0
+ ret x86_fp80 %canonicalized
+}
+
+
+define x86_fp80 @complex_canonicalize_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
+entry:
+ ; CHECK-LABEL: .LCPI4_0:
+ ; CHECK: .long 0x42b40000 # float 90
+ ; CHECK-LABEL: complex_canonicalize_x86_fp80
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: fldt 24(%rsp)
+ ; CHECK-NEXT: flds .LCPI4_0(%rip)
+ ; CHECK-NEXT: fsubp %st, %st(1)
+ ; CHECK-NEXT: retq
+
+ %mul1 = fsub x86_fp80 %a, %b
+ %add = fadd x86_fp80 %mul1, %b
+ %mul2 = fsub x86_fp80 %add, %mul1
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000)
+ %result = fsub x86_fp80 %canonicalized, %b
+ ret x86_fp80 %result
+}
+
+define double @complex_canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
+start:
+ ; CHECK-LABEL: .LCPI5_0:
+ ; CHECK: .quad 0x4008000000000000 # double 3
+ ; CHECK-LABEL: complex_canonicalize_fp64
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd .LCPI5_0(%rip), %xmm0
+ ; CHECK-NEXT: retq
+
+ %c = fcmp olt double %a, %b
+ %d = fcmp uno double %a, 0.000000e+00
+ %or.cond.i.i = or i1 %d, %c
+ %e = select i1 %or.cond.i.i, double %b, double %a
+ %f = tail call double @llvm.canonicalize.f64(double 3.0) #2
+ ret double %f
+}
+
+define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
+ ; CHECK-LABEL: test_fold_canonicalize_p0_f32
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: vmovss %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+ %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
+ ; CHECK-LABEL: .LCPI7_0:
+ ; CHECK: .long 0x80000000 # float -0
+ ; CHECK-LABEL: test_fold_canonicalize_n0_f32
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovss .LCPI7_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovss %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+ %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+
+define void @v_test_canonicalize_p90_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
+ ; CHECK-LABEL: .LCPI8_0:
+ ; CHECK: .long 0x42b40000 # float 90
+ ; CHECK-LABEL: v_test_canonicalize_p90_x86_fp80
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: flds .LCPI8_0(%rip)
+ ; CHECK-NEXT: fstpt (%rdi)
+ ; CHECK-NEXT: retq
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000)
+ store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize_p3__half(half addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI9_0:
+ ; CHECK: .short 0x4200 # half 3
+ ; CHECK-LABEL: v_test_canonicalize_p3__half:
+ ; CHECK: # %bb.0: # %entry
+ ; CHECK-NEXT: vmovsh .LCPI9_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsh %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+entry:
+ %canonicalized = call half @llvm.canonicalize.f16(half 0xH4200)
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize_p3_f64(double addrspace(1)* %out) #1 {
+ ; CHECK-LABEL: .LCPI10_0:
+ ; CHECK: .quad 0x4008000000000000 # double 3
+ ; CHECK-LABEL: v_test_canonicalize_p3_f64
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd .LCPI10_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+entry:
+ %canonicalized = call double @llvm.canonicalize.f64(double 3.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize_p3__bfloat(bfloat addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI11_0:
+ ; CHECK: .long 0x40400000 # float 3
+ ; CHECK-LABEL: v_test_canonicalize_p3__bfloat:
+ ; CHECK: # %bb.0: # %entry
+ ; CHECK-NEXT: vmovss .LCPI11_0(%rip), %xmm0 # xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+ ; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+ ; CHECK-NEXT: vpextrw $0, %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+entry:
+ %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 3.0)
+ store bfloat %canonicalized, bfloat addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize_n3__bfloat(bfloat addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI12_0:
+ ; CHECK: .long 0xc0400000 # float -3
+ ; CHECK-LABEL: v_test_canonicalize_n3__bfloat:
+ ; CHECK: # %bb.0: # %entry
+ ; CHECK-NEXT: vmovss .LCPI12_0(%rip), %xmm0 # xmm0 = [-3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+ ; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+ ; CHECK-NEXT: vpextrw $0, %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+entry:
+ %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -3.0)
+ store bfloat %canonicalized, bfloat addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize_n90_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
+ ; CHECK-LABEL: .LCPI13_0:
+ ; CHECK: .long 0xc2b40000 # float -90
+ ; CHECK-LABEL: v_test_canonicalize_n90_x86_fp80
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: flds .LCPI13_0(%rip)
+ ; CHECK-NEXT: fstpt (%rdi)
+ ; CHECK-NEXT: retq
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xKC005B400000000000000)
+ store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize_n3__half(half addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI14_0:
+ ; CHECK: .short 0xc200 # half -3
+ ; CHECK-LABEL: v_test_canonicalize_n3__half:
+ ; CHECK: # %bb.0: # %entry
+ ; CHECK-NEXT: vmovsh .LCPI14_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsh %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+entry:
+ %canonicalized = call half @llvm.canonicalize.f16(half 0xHC200)
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/X86/canonicalize-subnormals.ll b/llvm/test/CodeGen/X86/canonicalize-subnormals.ll
new file mode 100644
index 00000000000000..8e7e04c2a67dc8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/canonicalize-subnormals.ll
@@ -0,0 +1,287 @@
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck %s
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL %s
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL %s
+
+define void @canonicalize_denormal1_f32_pre_sign(float addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI0_0:
+ ; CHECK: .long 0x80000000 # float -0
+ ; CHECK-LABEL: canonicalize_denormal1_f32_pre_sign:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovss .LCPI0_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovss %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_f64_pre_sign(double addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI1_0:
+ ; CHECK: .quad 0x8000000000000000 # double -0
+ ; CHECK-LABEL: canonicalize_denormal1_f64_pre_sign:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd .LCPI1_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+
+define void @canonicalize_qnan_f64(double addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI2_0:
+ ; CHECK: .quad 0x7ff8000000000000 # double NaN
+ ; CHECK-LABEL: canonicalize_qnan_f64:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd .LCPI2_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI3_0:
+ ; CHECK: .quad 0xffffffffffffffff # double NaN
+ ; CHECK-LABEL: canonicalize_qnan_value_neg1_f64:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd .LCPI3_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI4_0:
+ ; CHECK: .quad 0xfffffffffffffffe # double NaN
+ ; CHECK-LABEL: canonicalize_qnan_value_neg2_f64:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd .LCPI4_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_snan0_value_f64(double addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI5_0:
+ ; CHECK: .quad 0x7ff8000000000000 # double NaN
+ ; CHECK-LABEL: canonicalize_snan0_value_f64:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd .LCPI5_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_undef(double addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI6_0:
+ ; CHECK: .quad 0x7ff8000000000000 # double NaN
+ ; CHECK-LABEL: canonicalize_undef:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd .LCPI6_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call double @llvm.canonicalize.f64(double undef)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_f32_ieee(float addrspace(1)* %out) {
+ ; IEEE-DENORMAL-LABEL: .LCPI7_0:
+ ; IEEE-DENORMAL: .long 0x807fffff # float -1.17549421E-38
+ ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_f32_ieee:
+ ; IEEE-DENORMAL: # %bb.0:
+ ; IEEE-DENORMAL-NEXT: vmovss .LCPI7_0(%rip), %xmm0
+ ; IEEE-DENORMAL-NEXT: vmovss %xmm0, (%rdi)
+ ; IEEE-DENORMAL-NEXT: retq
+
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_f64_ieee(double addrspace(1)* %out) {
+ ; IEEE-DENORMAL-LABEL: .LCPI8_0:
+ ; IEEE-DENORMAL: .quad 0x800fffffffffffff # double -2.2250738585072009E-308
+ ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_f64_ieee:
+ ; IEEE-DENORMAL: # %bb.0:
+ ; IEEE-DENORMAL-NEXT: vmovsd .LCPI8_0(%rip), %xmm0
+ ; IEEE-DENORMAL-NEXT: vmovsd %xmm0, (%rdi)
+ ; IEEE-DENORMAL-NEXT: retq
+
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_f32_dynamic(float addrspace(1)* %out) {
+ ; DYN-DENORMAL-LABEL: .LCPI9_0:
+ ; DYN-DENORMAL: .long 0x807fffff # float -1.17549421E-38
+ ; DYN-DENORMAL-LABEL: canonicalize_denormal1_f32_dynamic:
+ ; DYN-DENORMAL: # %bb.0:
+ ; DYN-DENORMAL-NEXT: vmovss .LCPI9_0(%rip), %xmm0
+ ; DYN-DENORMAL-NEXT: vmovss %xmm0, (%rdi)
+ ; DYN-DENORMAL-NEXT: retq
+
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_f64_dynamic(double addrspace(1)* %out) {
+ ; DYN-DENORMAL-LABEL: .LCPI10_0:
+ ; DYN-DENORMAL: .quad 0x800fffffffffffff # double -2.2250738585072009E-308
+ ; DYN-DENORMAL-LABEL: canonicalize_denormal1_f64_dynamic:
+ ; DYN-DENORMAL: # %bb.0:
+ ; DYN-DENORMAL-NEXT: vmovsd .LCPI10_0(%rip), %xmm0
+ ; DYN-DENORMAL-NEXT: vmovsd %xmm0, (%rdi)
+ ; DYN-DENORMAL-NEXT: retq
+
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_bfloat_pre_sign(bfloat addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI11_0:
+ ; CHECK: .long 0x80000000 # float -0
+ ; CHECK-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovss .LCPI11_0(%rip), %xmm0
+ ; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+ ; CHECK-NEXT: vpextrw $0, %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
+ store bfloat %canonicalized, bfloat addrspace(1)* %out
+ ret void
+}
+
+
+define void @canonicalize_denormal1_bfloat_ieee(bfloat addrspace(1)* %out) {
+ ; IEEE-DENORMAL-LABEL: .LCPI12_0:
+ ; IEEE-DENORMAL: .long 0x80000000 # float -0
+ ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_bfloat_ieee:
+ ; IEEE-DENORMAL: # %bb.0:
+ ; IEEE-DENORMAL-NEXT: vmovss .LCPI12_0(%rip), %xmm0
+ ; IEEE-DENORMAL-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+ ; IEEE-DENORMAL-NEXT: vpextrw $0, %xmm0, (%rdi)
+ ; IEEE-DENORMAL-NEXT: retq
+
+ %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
+ store bfloat %canonicalized, bfloat addrspace(1)* %out
+ ret void
+}
+
+
+define void @canonicalize_denormal1_bfloat_dynamic(bfloat addrspace(1)* %out) {
+ ; DYN-DENORMAL-LABEL: .LCPI13_0:
+ ; DYN-DENORMAL: .long 0x80000000 # float -0
+ ; DYN-DENORMAL-LABEL: canonicalize_denormal1_bfloat_dynamic:
+ ; DYN-DENORMAL: # %bb.0:
+ ; DYN-DENORMAL-NEXT: vmovss .LCPI13_0(%rip), %xmm0
+ ; DYN-DENORMAL-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+ ; DYN-DENORMAL-NEXT: vpextrw $0, %xmm0, (%rdi)
+ ; DYN-DENORMAL-NEXT: retq
+
+ %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
+ store bfloat %canonicalized, bfloat addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_half_pre_sign(half addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI14_0:
+ ; CHECK: .short 0x8000 # half -0
+ ; CHECK-LABEL: canonicalize_denormal1_half_pre_sign:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsh .LCPI14_0(%rip), %xmm0
+ ; CHECK-NEXT: vmovsh %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+
+ %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
+
+define void @canonicalize_denormal1_half_ieee(half addrspace(1)* %out) {
+ ; IEEE-DENORMAL-LABEL: .LCPI15_0:
+ ; IEEE-DENORMAL: .short 0x8000 # half -0
+ ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_half_ieee:
+ ; IEEE-DENORMAL: # %bb.0:
+ ; IEEE-DENORMAL-NEXT: vmovsh .LCPI15_0(%rip), %xmm0
+ ; IEEE-DENORMAL-NEXT: vmovsh %xmm0, (%rdi)
+ ; IEEE-DENORMAL-NEXT: retq
+
+ %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_half_dynamic(half addrspace(1)* %out) {
+ ; DYN-DENORMAL-LABEL: .LCPI16_0:
+ ; DYN-DENORMAL: .short 0x8000 # half -0
+ ; DYN-DENORMAL-LABEL: canonicalize_denormal1_half_dynamic:
+ ; DYN-DENORMAL: # %bb.0:
+ ; DYN-DENORMAL-NEXT: vmovsh .LCPI16_0(%rip), %xmm0
+ ; DYN-DENORMAL-NEXT: vmovsh %xmm0, (%rdi)
+ ; DYN-DENORMAL-NEXT: retq
+
+ %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_x86_fp80_pre_sign(x86_fp80 addrspace(1)* %out) {
+ ; CHECK-LABEL: .LCPI17_0:
+ ; CHECK: .long 0x00000000 # float 0
+ ; CHECK-LABEL: canonicalize_denormal1_x86_fp80_pre_sign
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: flds .LCPI17_0(%rip)
+ ; CHECK-NEXT: fstpt (%rdi)
+ ; CHECK-NEXT: retq
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
+ store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_x86_fp80_dynamic(x86_fp80 addrspace(1)* %out) {
+ ; DYN-DENORMAL-LABEL: .LCPI17_0:
+ ; DYN-DENORMAL: .quad 0x0000000000000001 # x86_fp80 3.64519953188247460253E-4951
+ ; DYN-DENORMAL-LABEL: canonicalize_denormal1_x86_fp80_dynamic
+ ; DYN-DENORMAL: # %bb.0:
+ ; DYN-DENORMAL-NEXT: fldt .LCPI17_0(%rip)
+ ; DYN-DENORMAL-NEXT: fstpt (%rdi)
+ ; DYN-DENORMAL-NEXT: retq
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
+ store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+ ret void
+}
+
+define void @canonicalize_denormal1_x86_fp80_ieee(x86_fp80 addrspace(1)* %out) {
+ ; IEEE-DENORMAL-LABEL: .LCPI17_0:
+ ; IEEE-DENORMAL: .quad 0x0000000000000001 # x86_fp80 3.64519953188247460253E-4951
+ ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_x86_fp80_ieee
+ ; IEEE-DENORMAL: # %bb.0:
+ ; IEEE-DENORMAL-NEXT: fldt .LCPI17_0(%rip)
+ ; IEEE-DENORMAL-NEXT: fstpt (%rdi)
+ ; IEEE-DENORMAL-NEXT: retq
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
+ store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+ ret void
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll
new file mode 100644
index 00000000000000..c1b5dd0dddcd2b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 < %s | FileCheck %s
+
+define half @complex_canonicalize_fmul_half(half %a, half %b) {
+; CHECK-LABEL: complex_canonicalize_fmul_half:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vsubsh %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+entry:
+
+ %mul1 = fsub half %a, %b
+ %add = fadd half %mul1, %b
+ %mul2 = fsub half %add, %mul1
+ %canonicalized = call half @llvm.canonicalize.f16(half %mul2)
+ %result = fsub half %canonicalized, %b
+ ret half %result
+}
+
+define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
+entry:
+ ; CHECK-LABEL: complex_canonicalize_fmul_x86_fp80
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: fldt 24(%rsp)
+ ; CHECK-NEXT: fldt 8(%rsp)
+ ; CHECK-NEXT: fsub %st(1), %st
+ ; CHECK-NEXT: fld %st(0)
+ ; CHECK-NEXT: fadd %st(2), %st
+ ; CHECK-NEXT: fsubp %st, %st(1)
+ ; CHECK-NEXT: fsubp %st, %st(1)
+ ; CHECK-NEXT: retq
+
+ %mul1 = fsub x86_fp80 %a, %b
+ %add = fadd x86_fp80 %mul1, %b
+ %mul2 = fsub x86_fp80 %add, %mul1
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %mul2)
+ %result = fsub x86_fp80 %canonicalized, %b
+ ret x86_fp80 %result
+}
+
+define bfloat @complex_canonicalize_fmul_bfloat(bfloat %a, bfloat %b) {
+; CHECK-LABEL: complex_canonicalize_fmul_bfloat:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmovw %xmm0, %eax
+; CHECK-NEXT: vmovw %xmm1, %ecx
+; CHECK-NEXT: shll $16, %ecx
+; CHECK-NEXT: vmovd %ecx, %xmm0
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: vmovd %eax, %xmm1
+; CHECK-NEXT: vsubss %xmm0, %xmm1, %xmm1
+; CHECK-NEXT: vcvtneps2bf16 %xmm1, %xmm1
+; CHECK-NEXT: vmovw %xmm1, %eax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: vmovd %eax, %xmm1
+; CHECK-NEXT: vaddss %xmm0, %xmm1, %xmm2
+; CHECK-NEXT: vcvtneps2bf16 %xmm2, %xmm2
+; CHECK-NEXT: vmovw %xmm2, %eax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: vmovd %eax, %xmm2
+; CHECK-NEXT: vsubss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vcvtneps2bf16 %xmm1, %xmm1
+; CHECK-NEXT: vmovw %xmm1, %eax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: vmovd %eax, %xmm1
+; CHECK-NEXT: vcvtneps2bf16 %xmm1, %xmm1
+; CHECK-NEXT: vmovw %xmm1, %eax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: vmovd %eax, %xmm1
+; CHECK-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; CHECK-NEXT: vmovw %xmm0, %eax
+; CHECK-NEXT: vmovw %eax, %xmm0
+; CHECK-NEXT: retq
+
+entry:
+
+ %sub1 = fsub bfloat %a, %b
+ %add = fadd bfloat %sub1, %b
+ %sub2 = fsub bfloat %add, %sub1
+ %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %sub2)
+ %result = fsub bfloat %canonicalized, %b
+ ret bfloat %result
+}
+
+define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
+start:
+ ; CHECK-LABEL: canonicalize_fp64:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
+ ; CHECK-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+ ; CHECK-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+ ; CHECK-NEXT: vmovapd %xmm2, %xmm0
+ ; CHECK-NEXT: retq
+
+ %c = fcmp olt double %a, %b
+ %d = fcmp uno double %a, 0.000000e+00
+ %or.cond.i.i = or i1 %d, %c
+ %e = select i1 %or.cond.i.i, double %b, double %a
+ %f = tail call double @llvm.canonicalize.f64(double %e) #2
+ ret double %f
+}
+
+define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
+start:
+ ; CHECK-LABEL: canonicalize_fp32:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+ ; CHECK-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+ ; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+ ; CHECK-NEXT: vmovaps %xmm2, %xmm0
+ ; CHECK-NEXT: retq
+
+ %cc = fcmp olt float %aa, %bb
+ %dd = fcmp uno float %aa, 0.000000e+00
+ %or.cond.i.i.x = or i1 %dd, %cc
+ %ee = select i1 %or.cond.i.i.x, float %bb, float %aa
+ %ff = tail call float @llvm.canonicalize.f32(float %ee) #2
+ ret float %ff
+}
+
+define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
+ ; CHECK-LABEL: v_test_canonicalize_var_f32
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovss (%rdi), %xmm0
+ ; CHECK-NEXT: vmovss %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+ %val = load float, float addrspace(1)* %out
+ %canonicalized = call float @llvm.canonicalize.f32(float %val)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
+ ; CHECK-LABEL: v_test_canonicalize_x86_fp80
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: fldt (%rdi)
+ ; CHECK-NEXT: fstpt (%rdi)
+ ; CHECK-NEXT: retq
+ %val = load x86_fp80, x86_fp80 addrspace(1)* %out
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %val)
+ store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize__half(half addrspace(1)* %out) {
+; CHECK-LABEL: v_test_canonicalize__half:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmovsh (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vmovsh %xmm0, (%rdi)
+; CHECK-NEXT: retq
+entry:
+ %val = load half, half addrspace(1)* %out
+ %canonicalized = call half @llvm.canonicalize.f16(half %val)
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
+ ; CHECK-LABEL: v_test_canonicalize_var_f64
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+ ; CHECK-NEXT: vmovsd %xmm0, (%rdi)
+ ; CHECK-NEXT: retq
+ %val = load double, double addrspace(1)* %out
+ %canonicalized = call double @llvm.canonicalize.f64(double %val)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+define void @v_test_canonicalize__bfloat(bfloat addrspace(1)* %out) {
+; CHECK-LABEL: v_test_canonicalize__bfloat:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; CHECK-NEXT: vpextrw $0, %xmm0, (%rdi)
+; CHECK-NEXT: retq
+
+entry:
+ %val = load bfloat, bfloat addrspace(1)* %out
+ %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val)
+ store bfloat %canonicalized, bfloat addrspace(1)* %out
+ ret void
+}
+
+declare double @llvm.canonicalize.f64(double)
+declare float @llvm.canonicalize.f32(float)
+declare bfloat @llvm.canonicalize.bf16(bfloat)
+declare x86_fp80 @llvm.canonicalize.f80(x86_fp80)
+declare half @llvm.canonicalize.f16(half)
\ No newline at end of file
More information about the llvm-commits
mailing list