[llvm] [X86][SelectionDAG] - Add support for llvm.canonicalize intrinsic (PR #106370)

Pawan Nirpal via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 10 06:11:37 PDT 2024


https://github.com/pawan-nirpal-031 updated https://github.com/llvm/llvm-project/pull/106370

>From a824dede98e9a979dd432d0a72b01ad730474245 Mon Sep 17 00:00:00 2001
From: Pawan Anil Nirpal <pawan.anil.nirpal at intel.com>
Date: Wed, 28 Aug 2024 13:09:30 +0200
Subject: [PATCH 1/4] [X86][SelectionDAG] - Add support for llvm.canonicalize
 intrinsic

Enable support for fcanonicalize intrinsic lowering.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  50 +++
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp       |  46 +++
 .../CodeGen/X86/canonicalize-constants.ll     | 210 +++++++++++++
 .../CodeGen/X86/canonicalize-subnormals.ll    | 287 ++++++++++++++++++
 llvm/test/CodeGen/X86/canonicalize-vars.ll    | 193 ++++++++++++
 5 files changed, 786 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/canonicalize-constants.ll
 create mode 100644 llvm/test/CodeGen/X86/canonicalize-subnormals.ll
 create mode 100644 llvm/test/CodeGen/X86/canonicalize-vars.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 74e3a898569bea..c1679b1002df5e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1275,6 +1275,56 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
       }
     }
     break;
+    case ISD::FCANONICALIZE: {
+      const Triple &TT = DAG.getTarget().getTargetTriple();
+      if (TT.getArch() == Triple::x86 || TT.getArch() == Triple::x86_64) {
+        SDValue Operand = Node->getOperand(0);
+        SDLoc dl(Node);
+        EVT VT = Operand.getValueType();
+
+        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Operand)) {
+          const APFloat &C = CFP->getValueAPF();
+          if (C.isDenormal()) {
+            DenormalMode Mode =
+                DAG.getMachineFunction().getDenormalMode(C.getSemantics());
+            assert((Mode != DenormalMode::getPositiveZero()) &&
+                   "Positive denormal mode is not valid for X86 target.");
+            if (Mode == DenormalMode::getPreserveSign()) {
+              SDValue SDZero =
+                  DAG.getConstantFP((C.isNegative() ? -0.0 : 0.0), dl, VT);
+              ConstantFPSDNode *ZeroConstFP = cast<ConstantFPSDNode>(SDZero);
+              SDValue CanonZeroFPLoad = ExpandConstantFP(ZeroConstFP, true);
+              DAG.ReplaceAllUsesWith(Node, CanonZeroFPLoad.getNode());
+              LLVM_DEBUG(dbgs()
+                         << "Legalized Denormal under mode PreserveSign\n");
+              return;
+            } else if (Mode == DenormalMode::getIEEE()) {
+              DAG.ReplaceAllUsesWith(Node, Operand.getNode());
+              LLVM_DEBUG(dbgs() << "Legalized Denormal under mode IEEE\n");
+              return;
+            }
+          } else if (C.isNaN() && C.isSignaling()) {
+            APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
+            SDValue QuitNaN = DAG.getConstantFP(CanonicalQNaN, dl, VT);
+            ConstantFPSDNode *QNaNConstFP = cast<ConstantFPSDNode>(QuitNaN);
+            SDValue QNanLoad = ExpandConstantFP(QNaNConstFP, true);
+            DAG.ReplaceAllUsesWith(Node, QNanLoad.getNode());
+            LLVM_DEBUG(dbgs() << "Legalized Signaling NaN to Quiet NaN\n");
+            return;
+          }
+        } else if (Operand.isUndef()) {
+          APFloat CanonicalQNaN = APFloat::getQNaN(VT.getFltSemantics());
+          SDValue QuitNaN = DAG.getConstantFP(CanonicalQNaN, dl, VT);
+          ConstantFPSDNode *QNaNConstFP = cast<ConstantFPSDNode>(QuitNaN);
+          SDValue QNanLoad = ExpandConstantFP(QNaNConstFP, true);
+          DAG.ReplaceAllUsesWith(Node, QNanLoad.getNode());
+          LLVM_DEBUG(dbgs() << "Legalized Undef to Quiet NaN\n");
+          return;
+        }
+        break;
+      }
+      break;
+    }
     case ISD::FSHL:
     case ISD::FSHR:
     case ISD::SRL_PARTS:
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d0a54ab8993c26..4bb8c9afd23edc 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5271,6 +5271,52 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+  case ISD::FCANONICALIZE: {
+    SDValue Operand = Node->getOperand(0);
+    EVT VT = Node->getValueType(0);
+
+    // Perform canonicalization for constants. Replace the operand by a load
+    // from constant pool for this constant. At this point subnormal values like
+    // denormals, snans have been canonicalized so no need to deal with those
+    // cases.
+    if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Operand)) {
+      const X86TargetLowering *X86Lowering =
+          static_cast<const X86TargetLowering *>(TLI);
+      if (const Constant *CV = X86Lowering->getTargetConstantFromLoad(Load)) {
+        const ConstantFP *CFP = dyn_cast<ConstantFP>(CV);
+        if (CFP) {
+          ReplaceNode(Node, Load);
+          return;
+        }
+      }
+    }
+
+    // Canonicalize normal non-constant/non-undef FP Nodes.
+    SDValue MulNode;
+    SDValue One;
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      One = CurDAG->getConstantFP(1.0f, dl, VT);
+    } else if (VT == MVT::f80) {
+      APFloat Val = APFloat::getOne(APFloat::x87DoubleExtended());
+      One = CurDAG->getConstantFP(Val, dl, VT);
+    } else if (VT == MVT::f16) {
+      APFloat Val(APFloat::IEEEhalf(), "1.0");
+      One = CurDAG->getConstantFP(Val, dl, VT);
+    } else if (VT == MVT::bf16) {
+      APFloat Val(APFloat::BFloat(), "1.0");
+      One = CurDAG->getConstantFP(Val, dl, VT);
+    } else {
+      // Is it better to assert when we encounter an unknown FP type than to
+      // just replace the node with its operand? This might be our last
+      // attempt at legalization.
+      ReplaceNode(Node, Operand.getNode());
+      return;
+    }
+    // TODO : Follow-up with tablegen pattern to generate mul * 1.0.
+    MulNode = CurDAG->getNode(ISD::FMUL, dl, VT, Operand, One);
+    ReplaceNode(Node, MulNode.getNode());
+    return;
+  }
   case ISD::BRIND:
   case X86ISD::NT_BRIND: {
     if (Subtarget->isTargetNaCl())
diff --git a/llvm/test/CodeGen/X86/canonicalize-constants.ll b/llvm/test/CodeGen/X86/canonicalize-constants.ll
new file mode 100644
index 00000000000000..b71c74bcd4472b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/canonicalize-constants.ll
@@ -0,0 +1,210 @@
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 < %s | FileCheck %s
+
+define float @canon_fp32() {
+  ; CHECK-LABEL: .LCPI0_0:
+  ; CHECK: .long	0x40400000                      # float 3
+  ; CHECK-LABEL: canon_fp32
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: vmovss	.LCPI0_0(%rip), %xmm0           # xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+  ; CHECK-NEXT: retq
+  %canonicalized = call float @llvm.canonicalize.f32(float 3.0)
+  ret float %canonicalized
+}
+
+define half @canon_fp16() {
+  ; CHECK-LABEL: .LCPI1_0:
+  ; CHECK: .short	0x4200                          # half 3
+  ; CHECK-LABEL: canon_fp16
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: vmovsh	.LCPI1_0(%rip), %xmm0
+  ; CHECK-NEXT: retq
+  %canonicalized = call half @llvm.canonicalize.f16(half 0xH4200) ; half 3.0
+  ret half %canonicalized
+}
+
+define double @canon_fp64() {
+  ; CHECK-LABEL: .LCPI2_0:
+  ; CHECK: .quad	0x4008000000000000              # double 3
+  ; CHECK-LABEL: canon_fp64
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: vmovsd	.LCPI2_0(%rip), %xmm0
+  ; CHECK-NEXT: retq
+  %canonicalized = call double @llvm.canonicalize.f64(double 3.0)
+  ret double %canonicalized
+}
+
+define x86_fp80 @canon_fp80() {
+  ; CHECK-LABEL: .LCPI3_0:
+  ; CHECK: .long	0x42b40000                      # float 90
+  ; CHECK-LABEL: canon_fp80
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: flds	.LCPI3_0(%rip)
+  ; CHECK-NEXT: retq
+
+  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000) ; 90.0
+  ret x86_fp80 %canonicalized
+}
+
+
+define x86_fp80 @complex_canonicalize_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
+entry:
+  ; CHECK-LABEL: .LCPI4_0:
+  ; CHECK: .long	0x42b40000                      # float 90
+  ; CHECK-LABEL: complex_canonicalize_x86_fp80
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	fldt	24(%rsp)
+  ; CHECK-NEXT: 	flds	.LCPI4_0(%rip)
+  ; CHECK-NEXT: 	fsubp	%st, %st(1)
+  ; CHECK-NEXT: 	retq
+
+  %mul1 = fsub x86_fp80 %a, %b
+  %add = fadd x86_fp80 %mul1, %b
+  %mul2 = fsub x86_fp80 %add, %mul1
+  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000)
+  %result = fsub x86_fp80 %canonicalized, %b
+  ret x86_fp80 %result
+}
+
+define double @complex_canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
+start:
+  ; CHECK-LABEL: .LCPI5_0:
+  ; CHECK: .quad	0x4008000000000000              # double 3
+  ; CHECK-LABEL: complex_canonicalize_fp64
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: vmovsd	.LCPI5_0(%rip), %xmm0
+  ; CHECK-NEXT: retq
+
+  %c = fcmp olt double %a, %b
+  %d = fcmp uno double %a, 0.000000e+00
+  %or.cond.i.i = or i1 %d, %c
+  %e = select i1 %or.cond.i.i, double %b, double %a
+  %f = tail call double @llvm.canonicalize.f64(double 3.0) #2
+  ret double %f
+}
+
+define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
+  ; CHECK-LABEL: test_fold_canonicalize_p0_f32
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: vxorps	%xmm0, %xmm0, %xmm0
+  ; CHECK-NEXT: vmovss	%xmm0, (%rdi)
+  ; CHECK-NEXT: retq
+  %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
+  store float %canonicalized, float addrspace(1)* %out
+  ret void
+}
+
+define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
+  ; CHECK-LABEL: .LCPI7_0:
+  ; CHECK: .long	0x80000000                      # float -0
+  ; CHECK-LABEL: test_fold_canonicalize_n0_f32
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: vmovss	.LCPI7_0(%rip), %xmm0
+  ; CHECK-NEXT: vmovss	%xmm0, (%rdi)
+  ; CHECK-NEXT: retq
+  %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
+  store float %canonicalized, float addrspace(1)* %out
+  ret void
+}
+
+
+define void @v_test_canonicalize_p90_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
+  ; CHECK-LABEL: .LCPI8_0:
+  ; CHECK: .long	0x42b40000                      # float 90
+  ; CHECK-LABEL: v_test_canonicalize_p90_x86_fp80
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: flds	.LCPI8_0(%rip)
+  ; CHECK-NEXT: fstpt	(%rdi)
+  ; CHECK-NEXT: retq
+  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000)
+  store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize_p3__half(half addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI9_0:
+  ; CHECK: .short	0x4200                          # half 3
+  ; CHECK-LABEL: v_test_canonicalize_p3__half:
+  ; CHECK:       # %bb.0: # %entry
+  ; CHECK-NEXT:   vmovsh	.LCPI9_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsh	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+entry:
+  %canonicalized = call half @llvm.canonicalize.f16(half 0xH4200)
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize_p3_f64(double addrspace(1)* %out) #1 {
+  ; CHECK-LABEL: .LCPI10_0:
+  ; CHECK: .quad	0x4008000000000000              # double 3
+  ; CHECK-LABEL: v_test_canonicalize_p3_f64
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT:       vmovsd	.LCPI10_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+entry:
+  %canonicalized = call double @llvm.canonicalize.f64(double 3.0)
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize_p3__bfloat(bfloat addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI11_0:
+  ; CHECK: .long	0x40400000                      # float 3
+  ; CHECK-LABEL: v_test_canonicalize_p3__bfloat:
+  ; CHECK:       # %bb.0: # %entry
+  ; CHECK-NEXT: 	vmovss	.LCPI11_0(%rip), %xmm0          # xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+  ; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
+  ; CHECK-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+entry:
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 3.0)
+  store bfloat %canonicalized, bfloat addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize_n3__bfloat(bfloat addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI12_0:
+  ; CHECK: .long	0xc0400000                      # float -3
+  ; CHECK-LABEL: v_test_canonicalize_n3__bfloat:
+  ; CHECK:       # %bb.0: # %entry
+  ; CHECK-NEXT: 	vmovss	.LCPI12_0(%rip), %xmm0          # xmm0 = [-3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+  ; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
+  ; CHECK-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+entry:
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -3.0)
+  store bfloat %canonicalized, bfloat addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize_n90_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
+  ; CHECK-LABEL: .LCPI13_0:
+  ; CHECK: .long	0xc2b40000                      # float -90
+  ; CHECK-LABEL: v_test_canonicalize_n90_x86_fp80
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: flds	.LCPI13_0(%rip)
+  ; CHECK-NEXT: fstpt	(%rdi)
+  ; CHECK-NEXT: retq
+  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xKC005B400000000000000)
+  store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize_n3__half(half addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI14_0:
+  ; CHECK: .short	0xc200                          # half -3
+  ; CHECK-LABEL: v_test_canonicalize_n3__half:
+  ; CHECK:       # %bb.0: # %entry
+  ; CHECK-NEXT:   vmovsh	.LCPI14_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsh	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+entry:
+  %canonicalized = call half @llvm.canonicalize.f16(half 0xHC200)
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/X86/canonicalize-subnormals.ll b/llvm/test/CodeGen/X86/canonicalize-subnormals.ll
new file mode 100644
index 00000000000000..8e7e04c2a67dc8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/canonicalize-subnormals.ll
@@ -0,0 +1,287 @@
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck %s
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL %s 
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL %s
+
+define void @canonicalize_denormal1_f32_pre_sign(float addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI0_0:
+  ; CHECK: 	.long	0x80000000                      # float -0
+  ; CHECK-LABEL: canonicalize_denormal1_f32_pre_sign:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmovss	.LCPI0_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovss	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+  store float %canonicalized, float addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_f64_pre_sign(double addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI1_0:
+  ; CHECK: 	.quad	0x8000000000000000              # double -0
+  ; CHECK-LABEL: canonicalize_denormal1_f64_pre_sign:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmovsd	.LCPI1_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+
+define void @canonicalize_qnan_f64(double addrspace(1)* %out) {
+  ;cCHECK-LABEL: .LCPI2_0:
+  ;cCHECK: 	.quad	0x7ff8000000000000              # double NaN
+  ; CHECK-LABEL: canonicalize_qnan_f64:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmovsd	.LCPI2_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+  %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) {
+  ;cCHECK-LABEL: .LCPI3_0:
+  ;cCHECK: 	.quad	0xffffffffffffffff              # double NaN
+  ; CHECK-LABEL: canonicalize_qnan_value_neg1_f64:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmovsd	.LCPI3_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI4_0:
+  ; CHECK: 	.quad	0xfffffffffffffffe              # double NaN
+  ; CHECK-LABEL: canonicalize_qnan_value_neg2_f64:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmovsd	.LCPI4_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_snan0_value_f64(double addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI5_0:
+  ; CHECK: 	.quad	0x7ff8000000000000              # double NaN
+  ; CHECK-LABEL: canonicalize_snan0_value_f64:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmovsd	.LCPI5_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_undef(double addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI6_0:
+  ; CHECK: 	.quad	0x7ff8000000000000              # double NaN
+  ; CHECK-LABEL: canonicalize_undef:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmovsd	.LCPI6_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+  %canonicalized = call double @llvm.canonicalize.f64(double undef)
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_f32_ieee(float addrspace(1)* %out) {
+  ; IEEE-DENORMAL-LABEL: .LCPI7_0:
+  ; IEEE-DENORMAL: 	.long	0x807fffff                      # float -1.17549421E-38
+  ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_f32_ieee:
+  ; IEEE-DENORMAL: # %bb.0:
+  ; IEEE-DENORMAL-NEXT: 	vmovss	.LCPI7_0(%rip), %xmm0
+  ; IEEE-DENORMAL-NEXT: 	vmovss	%xmm0, (%rdi)
+  ; IEEE-DENORMAL-NEXT: 	retq
+
+  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+  store float %canonicalized, float addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_f64_ieee(double addrspace(1)* %out) {
+  ; IEEE-DENORMAL-LABEL: .LCPI8_0:
+  ; IEEE-DENORMAL: 	.quad	0x800fffffffffffff              # double -2.2250738585072009E-308
+  ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_f64_ieee:
+  ; IEEE-DENORMAL: # %bb.0:
+  ; IEEE-DENORMAL-NEXT: 	vmovsd	.LCPI8_0(%rip), %xmm0
+  ; IEEE-DENORMAL-NEXT: 	vmovsd	%xmm0, (%rdi)
+  ; IEEE-DENORMAL-NEXT: 	retq
+
+  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_f32_dynamic(float addrspace(1)* %out) {
+  ; DYN-DENORMAL-LABEL: .LCPI9_0:
+  ; DYN-DENORMAL: 	.long	0x807fffff                      # float -1.17549421E-38
+  ; DYN-DENORMAL-LABEL: canonicalize_denormal1_f32_dynamic:
+  ; DYN-DENORMAL: # %bb.0:
+  ; DYN-DENORMAL-NEXT: 	vmovss	.LCPI9_0(%rip), %xmm0
+  ; DYN-DENORMAL-NEXT: 	vmovss	%xmm0, (%rdi)
+  ; DYN-DENORMAL-NEXT: 	retq
+
+  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+  store float %canonicalized, float addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_f64_dynamic(double addrspace(1)* %out) {
+  ; DYN-DENORMAL-LABEL: .LCPI10_0:
+  ; DYN-DENORMAL: 	.quad	0x800fffffffffffff              # double -2.2250738585072009E-308
+  ; DYN-DENORMAL-LABEL: canonicalize_denormal1_f64_dynamic:
+  ; DYN-DENORMAL: # %bb.0:
+  ; DYN-DENORMAL-NEXT: 	vmovsd	.LCPI10_0(%rip), %xmm0
+  ; DYN-DENORMAL-NEXT: 	vmovsd	%xmm0, (%rdi)
+  ; DYN-DENORMAL-NEXT: 	retq
+
+  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_bfloat_pre_sign(bfloat addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI11_0:
+  ; CHECK: 	.long	0x80000000                      # float -0
+  ; CHECK-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmovss	.LCPI11_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
+  ; CHECK-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
+  store bfloat %canonicalized, bfloat addrspace(1)* %out
+  ret void
+}
+
+
+define void @canonicalize_denormal1_bfloat_ieee(bfloat addrspace(1)* %out) {
+  ; IEEE-DENORMAL-LABEL: .LCPI12_0:
+  ; IEEE-DENORMAL: 	.long	0x80000000                      # float -0
+  ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_bfloat_ieee:
+  ; IEEE-DENORMAL: # %bb.0:
+  ; IEEE-DENORMAL-NEXT: 	vmovss	.LCPI12_0(%rip), %xmm0
+  ; IEEE-DENORMAL-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
+  ; IEEE-DENORMAL-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
+  ; IEEE-DENORMAL-NEXT: 	retq
+
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
+  store bfloat %canonicalized, bfloat addrspace(1)* %out
+  ret void
+}
+
+
+define void @canonicalize_denormal1_bfloat_dynamic(bfloat addrspace(1)* %out) {
+  ; DYN-DENORMAL-LABEL: .LCPI13_0:
+  ; DYN-DENORMAL: 	.long	0x80000000                      # float -0
+  ; DYN-DENORMAL-LABEL: canonicalize_denormal1_bfloat_dynamic:
+  ; DYN-DENORMAL: # %bb.0:
+  ; DYN-DENORMAL-NEXT: 	vmovss	.LCPI13_0(%rip), %xmm0
+  ; DYN-DENORMAL-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
+  ; DYN-DENORMAL-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
+  ; DYN-DENORMAL-NEXT: 	retq
+
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
+  store bfloat %canonicalized, bfloat addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_half_pre_sign(half addrspace(1)* %out) {
+  ; CHECK-LABEL: .LCPI14_0:
+  ; CHECK: 	.short	0x8000                      # half -0
+  ; CHECK-LABEL: canonicalize_denormal1_half_pre_sign:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmovsh	.LCPI14_0(%rip), %xmm0
+  ; CHECK-NEXT: 	vmovsh	%xmm0, (%rdi)
+  ; CHECK-NEXT: 	retq
+
+  %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+
+define void @canonicalize_denormal1_half_ieee(half addrspace(1)* %out) {
+  ; IEEE-DENORMAL-LABEL: .LCPI15_0:
+  ; IEEE-DENORMAL: 	.short	0x8000                          # half -0
+  ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_half_ieee:
+  ; IEEE-DENORMAL: # %bb.0:
+  ; IEEE-DENORMAL-NEXT: 	vmovsh	.LCPI15_0(%rip), %xmm0
+  ; IEEE-DENORMAL-NEXT: 	vmovsh	%xmm0, (%rdi)
+  ; IEEE-DENORMAL-NEXT: 	retq
+
+  %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_half_dynamic(half addrspace(1)* %out) {
+  ; DYN-DENORMAL-LABEL: .LCPI16_0:
+  ; DYN-DENORMAL: 	.short	0x8000                          # half -0
+  ; DYN-DENORMAL-LABEL: canonicalize_denormal1_half_dynamic:
+  ; DYN-DENORMAL: # %bb.0:
+  ; DYN-DENORMAL-NEXT: 	vmovsh	.LCPI16_0(%rip), %xmm0
+  ; DYN-DENORMAL-NEXT: 	vmovsh	%xmm0, (%rdi)
+  ; DYN-DENORMAL-NEXT: 	retq
+
+  %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_x86_fp80_pre_sign(x86_fp80 addrspace(1)* %out) {
+    ; CHECK-LABEL: .LCPI17_0:
+    ; CHECK: .long	0x00000000                      # float 0
+    ; CHECK-LABEL: canonicalize_denormal1_x86_fp80_pre_sign
+    ; CHECK: # %bb.0:
+    ; CHECK-NEXT: flds	.LCPI17_0(%rip)
+    ; CHECK-NEXT: fstpt	(%rdi)
+    ; CHECK-NEXT: retq
+    %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
+    store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+    ret void
+}
+
+define void @canonicalize_denormal1_x86_fp80_dynamic(x86_fp80 addrspace(1)* %out) {
+  ; DYN-DENORMAL-LABEL: .LCPI18_0:
+  ; DYN-DENORMAL: .quad	0x0000000000000001              # x86_fp80 3.64519953188247460253E-4951
+  ; DYN-DENORMAL-LABEL: canonicalize_denormal1_x86_fp80_dynamic
+  ; DYN-DENORMAL: # %bb.0:
+  ; DYN-DENORMAL-NEXT: fldt	.LCPI17_0(%rip)
+  ; DYN-DENORMAL-NEXT: fstpt	(%rdi)
+  ; DYN-DENORMAL-NEXT: retq
+  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
+  store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+  ret void
+}
+
+define void @canonicalize_denormal1_x86_fp80_ieee(x86_fp80 addrspace(1)* %out) {
+  ; IEEE-DENORMAL-LABEL: .LCPI19_0:
+  ; IEEE-DENORMAL: .quad	0x0000000000000001              # x86_fp80 3.64519953188247460253E-4951
+  ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_x86_fp80_ieee
+  ; IEEE-DENORMAL: # %bb.0:
+  ; IEEE-DENORMAL-NEXT: fldt	.LCPI17_0(%rip)
+  ; IEEE-DENORMAL-NEXT: fstpt	(%rdi)
+  ; IEEE-DENORMAL-NEXT: retq
+  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
+  store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll
new file mode 100644
index 00000000000000..c1b5dd0dddcd2b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
+; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 < %s | FileCheck %s
+
+define half @complex_canonicalize_fmul_half(half %a, half %b) {
+; CHECK-LABEL: complex_canonicalize_fmul_half:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT: 	vsubsh	%xmm1, %xmm0, %xmm0
+; CHECK-NEXT: 	vaddsh	%xmm1, %xmm0, %xmm2
+; CHECK-NEXT: 	vsubsh	%xmm0, %xmm2, %xmm0
+; CHECK-NEXT: 	vsubsh	%xmm1, %xmm0, %xmm0
+; CHECK-NEXT: 	retq
+entry:
+
+  %mul1 = fsub half %a, %b
+  %add = fadd half %mul1, %b
+  %mul2 = fsub half %add, %mul1
+  %canonicalized = call half @llvm.canonicalize.f16(half %mul2)
+  %result = fsub half %canonicalized, %b
+  ret half %result
+}
+
+define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
+entry:
+  ; CHECK-LABEL: complex_canonicalize_fmul_x86_fp80
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	fldt	24(%rsp)
+  ; CHECK-NEXT: 	fldt	8(%rsp)
+  ; CHECK-NEXT: 	fsub	%st(1), %st
+  ; CHECK-NEXT: 	fld	%st(0)
+  ; CHECK-NEXT: 	fadd	%st(2), %st
+  ; CHECK-NEXT: 	fsubp	%st, %st(1)
+  ; CHECK-NEXT: 	fsubp	%st, %st(1)
+  ; CHECK-NEXT: 	retq
+
+  %mul1 = fsub x86_fp80 %a, %b
+  %add = fadd x86_fp80 %mul1, %b
+  %mul2 = fsub x86_fp80 %add, %mul1
+  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %mul2)
+  %result = fsub x86_fp80 %canonicalized, %b
+  ret x86_fp80 %result
+}
+
+define bfloat @complex_canonicalize_fmul_bfloat(bfloat %a, bfloat %b) {
+; CHECK-LABEL: complex_canonicalize_fmul_bfloat:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT: 	vmovw	%xmm0, %eax
+; CHECK-NEXT: 	vmovw	%xmm1, %ecx
+; CHECK-NEXT: 	shll	$16, %ecx
+; CHECK-NEXT: 	vmovd	%ecx, %xmm0
+; CHECK-NEXT: 	shll	$16, %eax
+; CHECK-NEXT: 	vmovd	%eax, %xmm1
+; CHECK-NEXT: 	vsubss	%xmm0, %xmm1, %xmm1
+; CHECK-NEXT: 	vcvtneps2bf16	%xmm1, %xmm1
+; CHECK-NEXT: 	vmovw	%xmm1, %eax
+; CHECK-NEXT: 	shll	$16, %eax
+; CHECK-NEXT: 	vmovd	%eax, %xmm1
+; CHECK-NEXT: 	vaddss	%xmm0, %xmm1, %xmm2
+; CHECK-NEXT: 	vcvtneps2bf16	%xmm2, %xmm2
+; CHECK-NEXT: 	vmovw	%xmm2, %eax
+; CHECK-NEXT: 	shll	$16, %eax
+; CHECK-NEXT: 	vmovd	%eax, %xmm2
+; CHECK-NEXT: 	vsubss	%xmm1, %xmm2, %xmm1
+; CHECK-NEXT: 	vcvtneps2bf16	%xmm1, %xmm1
+; CHECK-NEXT: 	vmovw	%xmm1, %eax
+; CHECK-NEXT: 	shll	$16, %eax
+; CHECK-NEXT: 	vmovd	%eax, %xmm1
+; CHECK-NEXT: 	vcvtneps2bf16	%xmm1, %xmm1
+; CHECK-NEXT: 	vmovw	%xmm1, %eax
+; CHECK-NEXT: 	shll	$16, %eax
+; CHECK-NEXT: 	vmovd	%eax, %xmm1
+; CHECK-NEXT: 	vsubss	%xmm0, %xmm1, %xmm0
+; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
+; CHECK-NEXT: 	vmovw	%xmm0, %eax
+; CHECK-NEXT: 	vmovw	%eax, %xmm0
+; CHECK-NEXT: 	retq
+
+entry:
+
+  %sub1 = fsub bfloat %a, %b
+  %add = fadd bfloat %sub1, %b
+  %sub2 = fsub bfloat %add, %sub1
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %sub2)
+  %result = fsub bfloat %canonicalized, %b
+  ret bfloat %result
+}
+
+define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
+start:
+  ; CHECK-LABEL: canonicalize_fp64:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: 	vmaxsd	%xmm0, %xmm1, %xmm2
+  ; CHECK-NEXT: 	vcmpunordsd	%xmm0, %xmm0, %k1
+  ; CHECK-NEXT: 	vmovsd	%xmm1, %xmm2, %xmm2 {%k1}
+  ; CHECK-NEXT: 	vmovapd	%xmm2, %xmm0
+  ; CHECK-NEXT: 	retq
+
+  %c = fcmp olt double %a, %b
+  %d = fcmp uno double %a, 0.000000e+00
+  %or.cond.i.i = or i1 %d, %c
+  %e = select i1 %or.cond.i.i, double %b, double %a
+  %f = tail call double @llvm.canonicalize.f64(double %e) #2
+  ret double %f
+}
+
+define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
+start:
+  ; CHECK-LABEL: canonicalize_fp32:
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+  ; CHECK-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+  ; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+  ; CHECK-NEXT: vmovaps %xmm2, %xmm0
+  ; CHECK-NEXT: retq
+
+  %cc = fcmp olt float %aa, %bb
+  %dd = fcmp uno float %aa, 0.000000e+00
+  %or.cond.i.i.x = or i1 %dd, %cc
+  %ee = select i1 %or.cond.i.i.x, float %bb, float %aa
+  %ff = tail call float @llvm.canonicalize.f32(float %ee) #2
+  ret float %ff
+}
+
+define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
+  ; CHECK-LABEL: v_test_canonicalize_var_f32
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: vmovss	(%rdi), %xmm0                   
+	; CHECK-NEXT: vmovss	%xmm0, (%rdi)
+	; CHECK-NEXT: retq
+  %val = load float, float addrspace(1)* %out
+  %canonicalized = call float @llvm.canonicalize.f32(float %val)
+  store float %canonicalized, float addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
+  ; CHECK-LABEL: v_test_canonicalize_x86_fp80
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: fldt	(%rdi)
+	; CHECK-NEXT: fstpt	(%rdi)
+	; CHECK-NEXT: retq
+  %val = load x86_fp80, x86_fp80 addrspace(1)* %out
+  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %val)
+  store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize__half(half addrspace(1)* %out) {
+; CHECK-LABEL: v_test_canonicalize__half:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT: 	vmovsh	(%rdi), %xmm0                   # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: 	vmovsh	%xmm0, (%rdi)
+; CHECK-NEXT: 	retq
+entry:
+  %val = load half, half addrspace(1)* %out
+  %canonicalized = call half @llvm.canonicalize.f16(half %val)
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
+  ; CHECK-LABEL: v_test_canonicalize_var_f64
+  ; CHECK: # %bb.0:
+  ; CHECK-NEXT: vmovsd	(%rdi), %xmm0                   # xmm0 = mem[0],zero                   
+	; CHECK-NEXT: vmovsd	%xmm0, (%rdi)
+	; CHECK-NEXT: retq
+  %val = load double, double addrspace(1)* %out
+  %canonicalized = call double @llvm.canonicalize.f64(double %val)
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+define void @v_test_canonicalize__bfloat(bfloat addrspace(1)* %out) {
+; CHECK-LABEL: v_test_canonicalize__bfloat:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT: 	movzwl	(%rdi), %eax
+; CHECK-NEXT: 	shll	$16, %eax
+; CHECK-NEXT: 	vmovd	%eax, %xmm0
+; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
+; CHECK-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
+; CHECK-NEXT: 	retq
+
+entry:
+  %val = load bfloat, bfloat addrspace(1)* %out
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val)
+  store bfloat %canonicalized, bfloat addrspace(1)* %out
+  ret void
+}
+
+declare double @llvm.canonicalize.f64(double)
+declare float @llvm.canonicalize.f32(float)
+declare bfloat @llvm.canonicalize.bf16(bfloat)
+declare x86_fp80 @llvm.canonicalize.f80(x86_fp80)
+declare half @llvm.canonicalize.f16(half)
\ No newline at end of file

>From 34d5244817bcd98c50bffea2a551b5b94722d855 Mon Sep 17 00:00:00 2001
From: Pawan Anil Nirpal <pawan.anil.nirpal at intel.com>
Date: Fri, 6 Sep 2024 09:24:44 +0200
Subject: [PATCH 2/4] Move combine operations to DAG combiner over from
 legalizer, address comments

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |   50 -
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp       |   46 -
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  120 +
 .../CodeGen/X86/canonicalize-constants.ll     |  593 ++++-
 .../CodeGen/X86/canonicalize-subnormals.ll    | 1929 +++++++++++++++--
 llvm/test/CodeGen/X86/canonicalize-vars.ll    |  997 ++++++++-
 6 files changed, 3270 insertions(+), 465 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c1679b1002df5e..74e3a898569bea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1275,56 +1275,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
       }
     }
     break;
-    case ISD::FCANONICALIZE: {
-      const Triple &TT = DAG.getTarget().getTargetTriple();
-      if (TT.getArch() == Triple::x86 || TT.getArch() == Triple::x86_64) {
-        SDValue Operand = Node->getOperand(0);
-        SDLoc dl(Node);
-        EVT VT = Operand.getValueType();
-
-        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Operand)) {
-          const APFloat &C = CFP->getValueAPF();
-          if (C.isDenormal()) {
-            DenormalMode Mode =
-                DAG.getMachineFunction().getDenormalMode(C.getSemantics());
-            assert((Mode != DenormalMode::getPositiveZero()) &&
-                   "Positive denormal mode is not valid for X86 target.");
-            if (Mode == DenormalMode::getPreserveSign()) {
-              SDValue SDZero =
-                  DAG.getConstantFP((C.isNegative() ? -0.0 : 0.0), dl, VT);
-              ConstantFPSDNode *ZeroConstFP = cast<ConstantFPSDNode>(SDZero);
-              SDValue CanonZeroFPLoad = ExpandConstantFP(ZeroConstFP, true);
-              DAG.ReplaceAllUsesWith(Node, CanonZeroFPLoad.getNode());
-              LLVM_DEBUG(dbgs()
-                         << "Legalized Denormal under mode PreserveSign\n");
-              return;
-            } else if (Mode == DenormalMode::getIEEE()) {
-              DAG.ReplaceAllUsesWith(Node, Operand.getNode());
-              LLVM_DEBUG(dbgs() << "Legalized Denormal under mode IEEE\n");
-              return;
-            }
-          } else if (C.isNaN() && C.isSignaling()) {
-            APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
-            SDValue QuitNaN = DAG.getConstantFP(CanonicalQNaN, dl, VT);
-            ConstantFPSDNode *QNaNConstFP = cast<ConstantFPSDNode>(QuitNaN);
-            SDValue QNanLoad = ExpandConstantFP(QNaNConstFP, true);
-            DAG.ReplaceAllUsesWith(Node, QNanLoad.getNode());
-            LLVM_DEBUG(dbgs() << "Legalized Signaling NaN to Quiet NaN\n");
-            return;
-          }
-        } else if (Operand.isUndef()) {
-          APFloat CanonicalQNaN = APFloat::getQNaN(VT.getFltSemantics());
-          SDValue QuitNaN = DAG.getConstantFP(CanonicalQNaN, dl, VT);
-          ConstantFPSDNode *QNaNConstFP = cast<ConstantFPSDNode>(QuitNaN);
-          SDValue QNanLoad = ExpandConstantFP(QNaNConstFP, true);
-          DAG.ReplaceAllUsesWith(Node, QNanLoad.getNode());
-          LLVM_DEBUG(dbgs() << "Legalized Undef to Quiet NaN\n");
-          return;
-        }
-        break;
-      }
-      break;
-    }
     case ISD::FSHL:
     case ISD::FSHR:
     case ISD::SRL_PARTS:
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 4bb8c9afd23edc..d0a54ab8993c26 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5271,52 +5271,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
-  case ISD::FCANONICALIZE: {
-    SDValue Operand = Node->getOperand(0);
-    EVT VT = Node->getValueType(0);
-
-    // Perform canonicalization for constants. Replace the operand by a load
-    // from constant pool for this constant. At this point subnoraml values like
-    // denormals, snans have been canonicalized so no need to deal with those
-    // cases.
-    if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Operand)) {
-      const X86TargetLowering *X86Lowering =
-          static_cast<const X86TargetLowering *>(TLI);
-      if (const Constant *CV = X86Lowering->getTargetConstantFromLoad(Load)) {
-        const ConstantFP *CFP = dyn_cast<ConstantFP>(CV);
-        if (CFP) {
-          ReplaceNode(Node, Load);
-          return;
-        }
-      }
-    }
-
-    // Canonicalize normal non-constant/non-undef FP Nodes.
-    SDValue MulNode;
-    SDValue One;
-    if (VT == MVT::f32 || VT == MVT::f64) {
-      One = CurDAG->getConstantFP(1.0f, dl, VT);
-    } else if (VT == MVT::f80) {
-      APFloat Val = APFloat::getOne(APFloat::x87DoubleExtended());
-      One = CurDAG->getConstantFP(Val, dl, VT);
-    } else if (VT == MVT::f16) {
-      APFloat Val(APFloat::IEEEhalf(), "1.0");
-      One = CurDAG->getConstantFP(Val, dl, VT);
-    } else if (VT == MVT::bf16) {
-      APFloat Val(APFloat::BFloat(), "1.0");
-      One = CurDAG->getConstantFP(Val, dl, VT);
-    } else {
-      // Is it better to assert? when we encounter an unknown FP type,Than to
-      // just replace with the operand! As this might be our last attempt at
-      // legalization.
-      ReplaceNode(Node, Operand.getNode());
-      return;
-    }
-    // TODO : Follow-up with tablegen pattern to generate mul * 1.0.
-    MulNode = CurDAG->getNode(ISD::FMUL, dl, VT, Operand, One);
-    ReplaceNode(Node, MulNode.getNode());
-    return;
-  }
   case ISD::BRIND:
   case X86ISD::NT_BRIND: {
     if (Subtarget->isTargetNaCl())
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1a6be4eb5af1ef..4fc7c70764f565 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2533,6 +2533,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::STRICT_FMA,
                        ISD::FMINNUM,
                        ISD::FMAXNUM,
+                       ISD::FCANONICALIZE,
                        ISD::SUB,
                        ISD::LOAD,
                        ISD::LRINT,
@@ -57976,6 +57977,124 @@ static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Fold FCANONICALIZE of a constant: a signaling NaN becomes the default quiet
+/// NaN, a denormal becomes a same-signed zero under the preserve-sign denormal
+/// mode, and every other operand of \p Node is returned unchanged.
+static SDValue combineConstantCanonicalize(SDNode *Node, SelectionDAG &DAG) {
+  SDValue Operand = Node->getOperand(0);
+  auto *CFP = dyn_cast<ConstantFPSDNode>(Operand);
+  if (!CFP)
+    return Operand;
+
+  SDLoc dl(Node);
+  EVT VT = Operand.getValueType();
+  const APFloat &C = CFP->getValueAPF();
+  if (C.isNaN() && C.isSignaling())
+    return DAG.getConstantFP(APFloat::getQNaN(C.getSemantics()), dl, VT);
+  if (!C.isDenormal())
+    return Operand;
+
+  DenormalMode Mode =
+      DAG.getMachineFunction().getDenormalMode(C.getSemantics());
+  assert((Mode != DenormalMode::getPositiveZero()) &&
+         "Positive denormal mode is not valid for X86 target.");
+  // IEEE and any remaining mode keep the denormal constant as it is.
+  if (Mode == DenormalMode::getPreserveSign())
+    return DAG.getConstantFP(C.isNegative() ? -0.0 : 0.0, dl, VT);
+  return Operand;
+}
+
+/// Return the first chain (MVT::Other) operand of \p N that is produced by a
+/// strict FP node, or the DAG entry node when \p N has no such operand.
+static SDValue findLastStrictOpChain(SDNode *N, SelectionDAG &DAG) {
+  assert(N && "Trying to find last chain for a NULL Node");
+  for (SDValue Op : N->op_values())
+    if (Op.getValueType() == MVT::Other && Op->isStrictFPOpcode())
+      return Op;
+  return DAG.getEntryNode();
+}
+
+/// Return true if the opcode of \p N is in the non-canonicalizing set below.
+static bool isNonCanonicalizingOperation(SDNode *N) {
+  assert(N && "Trying to check canonical opcode for a NULL Node");
+  switch (N->getOpcode()) {
+  // Keep this list exhaustive. X86::RET is a MachineInstr opcode, so the DAG
+  // return node is matched instead. NOTE(review): confirm vs. CopyToReg users.
+  case X86ISD::RET_GLUE:
+  case ISD::STORE:
+  case ISD::SETCC:
+  case X86ISD::FCMP:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true if at least one user of \p N is an operation for which
+/// isNonCanonicalizingOperation() holds. Only direct users are inspected.
+static bool isUsedByNonCanonicalizingOp(SDNode *N) {
+  assert(N && "Trying to inspect the users of a NULL Node");
+  for (SDNode *User : N->uses())
+    if (isNonCanonicalizingOperation(User))
+      return true;
+  return false;
+}
+
+/// Target DAG combine for ISD::FCANONICALIZE.
+///  - A constant operand is folded by combineConstantCanonicalize().
+///  - An undef operand is replaced by the default quiet NaN of its type.
+///  - A scalar f16/f32/f64/f80 operand with a non-canonicalizing user becomes
+///    STRICT_FMUL(1.0, operand).
+///  - Anything else (bf16, other types, no such user) forwards the operand.
+///
+/// \param Node the FCANONICALIZE node being combined.
+/// \param DAG  the DAG used to create replacement nodes.
+/// \returns the value that replaces \p Node.
+static SDValue combineCanonicalize(SDNode *Node, SelectionDAG &DAG) {
+  SDValue Operand = Node->getOperand(0);
+  EVT VT = Operand.getValueType();
+  SDLoc dl(Node);
+
+  // isa<> instead of dyn_cast<>: the cast result itself is not needed here.
+  if (isa<ConstantFPSDNode>(Operand))
+    return combineConstantCanonicalize(Node, DAG);
+
+  // Undef operand: use the default quiet NaN of VT.
+  if (Operand.isUndef())
+    return DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()), dl, VT);
+
+  // Canonicalize scalar variable FP nodes only. TODO : Handle vectors.
+  // Is it better to assert when we encounter an unknown FP type, rather than
+  // to just replace with the operand? This might be our last attempt at
+  // legalization.
+  const bool IsHandledScalarFP = VT == MVT::f16 || VT == MVT::bf16 ||
+                                 VT == MVT::f32 || VT == MVT::f64 ||
+                                 VT == MVT::f80;
+  if (!IsHandledScalarFP)
+    return Operand;
+
+  // Store, return, and compare are non-canonicalizing operations. If a
+  // non-canonicalizing operation uses the result then mul * 1.0 must be
+  // generated in those cases.
+  // TODO: For now preventing bf16 from generating strict_fmul as it
+  // leads to a crash SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
+  // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
+  // promote this operator's result!
+  if (VT == MVT::bf16 || !isUsedByNonCanonicalizingOp(Node))
+    return Operand;
+
+  // getConstantFP(double, ...) converts 1.0 to the FP semantics of VT, so one
+  // call covers every type accepted above. The constant is only built on the
+  // path that uses it.
+  SDValue One = DAG.getConstantFP(1.0, dl, VT);
+  SDValue Chain = findLastStrictOpChain(Node, DAG);
+  // TODO : Follow-up with tablegen pattern to generate mul * 1.0.
+  // Only the value result is returned; the output chain is not wired to
+  // anything here.
+  return DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
+                     {Chain, One, Operand});
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -58015,6 +58134,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
+  case ISD::FCANONICALIZE:  return combineCanonicalize(N,DAG);
   case ISD::BITREVERSE:     return combineBITREVERSE(N, DAG, DCI, Subtarget);
   case ISD::AVGCEILS:
   case ISD::AVGCEILU:
diff --git a/llvm/test/CodeGen/X86/canonicalize-constants.ll b/llvm/test/CodeGen/X86/canonicalize-constants.ll
index b71c74bcd4472b..b1a9733806d40e 100644
--- a/llvm/test/CodeGen/X86/canonicalize-constants.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-constants.ll
@@ -1,62 +1,185 @@
-; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
+; RUN: llc -mattr=sse -mtriple=x86_64 < %s | FileCheck %s -check-prefix=SSE
+; RUN: llc -mattr=sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefix=SSE2
+; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX
+; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX2
+; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX512F
+; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX512BW
 
 define float @canon_fp32() {
-  ; CHECK-LABEL: .LCPI0_0:
-  ; CHECK: .long	0x40400000                      # float 3
-  ; CHECK-LABEL: canon_fp32
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: vmovss	.LCPI0_0(%rip), %xmm0           # xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
-  ; CHECK-NEXT: retq
+; SSE-LABEL: canon_fp32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: canon_fp32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: canon_fp32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: canon_fp32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: canon_fp32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: canon_fp32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; AVX512BW-NEXT:    retq
   %canonicalized = call float @llvm.canonicalize.f32(float 3.0)
   ret float %canonicalized
 }
 
 define half @canon_fp16() {
-  ; CHECK-LABEL: .LCPI1_0:
-  ; CHECK: .short	0x4200                          # half 3
-  ; CHECK-LABEL: canon_fp16
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: vmovsh	.LCPI1_0(%rip), %xmm0
-  ; CHECK-NEXT: retq
+; SSE-LABEL: canon_fp16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: canon_fp16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: canon_fp16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: canon_fp16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: canon_fp16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: canon_fp16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    retq
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH4200) ; half 3.0
   ret half %canonicalized
 }
 
 define double @canon_fp64() {
-  ; CHECK-LABEL: .LCPI2_0:
-  ; CHECK: .quad	0x4008000000000000              # double 3
-  ; CHECK-LABEL: canon_fp64
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: vmovsd	.LCPI2_0(%rip), %xmm0
-  ; CHECK-NEXT: retq
+; SSE-LABEL: canon_fp64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: canon_fp64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: canon_fp64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: canon_fp64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: canon_fp64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: canon_fp64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; AVX512BW-NEXT:    retq
   %canonicalized = call double @llvm.canonicalize.f64(double 3.0)
   ret double %canonicalized
 }
 
 define x86_fp80 @canon_fp80() {
-  ; CHECK-LABEL: .LCPI3_0:
-  ; CHECK: .long	0x42b40000                      # float 90
-  ; CHECK-LABEL: canon_fp80
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: flds	.LCPI3_0(%rip)
-  ; CHECK-NEXT: retq
-
+; SSE-LABEL: canon_fp80:
+; SSE:       # %bb.0:
+; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: canon_fp80:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: canon_fp80:
+; AVX:       # %bb.0:
+; AVX-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: canon_fp80:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: canon_fp80:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: canon_fp80:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX512BW-NEXT:    retq
   %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000) ; 90.0
   ret x86_fp80 %canonicalized
 }
 
 
 define x86_fp80 @complex_canonicalize_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
+; SSE-LABEL: complex_canonicalize_x86_fp80:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: complex_canonicalize_x86_fp80:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: complex_canonicalize_x86_fp80:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: complex_canonicalize_x86_fp80:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: complex_canonicalize_x86_fp80:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: complex_canonicalize_x86_fp80:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX512BW-NEXT:    retq
 entry:
-  ; CHECK-LABEL: .LCPI4_0:
-  ; CHECK: .long	0x42b40000                      # float 90
-  ; CHECK-LABEL: complex_canonicalize_x86_fp80
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	fldt	24(%rsp)
-  ; CHECK-NEXT: 	flds	.LCPI4_0(%rip)
-  ; CHECK-NEXT: 	fsubp	%st, %st(1)
-  ; CHECK-NEXT: 	retq
-
   %mul1 = fsub x86_fp80 %a, %b
   %add = fadd x86_fp80 %mul1, %b
   %mul2 = fsub x86_fp80 %add, %mul1
@@ -66,14 +189,36 @@ entry:
 }
 
 define double @complex_canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
+; SSE-LABEL: complex_canonicalize_fp64:
+; SSE:       # %bb.0: # %start
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: complex_canonicalize_fp64:
+; SSE2:       # %bb.0: # %start
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: complex_canonicalize_fp64:
+; AVX:       # %bb.0: # %start
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: complex_canonicalize_fp64:
+; AVX2:       # %bb.0: # %start
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: complex_canonicalize_fp64:
+; AVX512F:       # %bb.0: # %start
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: complex_canonicalize_fp64:
+; AVX512BW:       # %bb.0: # %start
+; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
+; AVX512BW-NEXT:    retq
 start:
-  ; CHECK-LABEL: .LCPI5_0:
-  ; CHECK: .quad	0x4008000000000000              # double 3
-  ; CHECK-LABEL: complex_canonicalize_fp64
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: vmovsd	.LCPI5_0(%rip), %xmm0
-  ; CHECK-NEXT: retq
-
   %c = fcmp olt double %a, %b
   %d = fcmp uno double %a, 0.000000e+00
   %or.cond.i.i = or i1 %d, %c
@@ -83,24 +228,70 @@ start:
 }
 
 define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
-  ; CHECK-LAEBL: test_fold_canonicalize_p0_f32
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: vxorps	%xmm0, %xmm0, %xmm0
-  ; CHECK-NEXT: vmovss	%xmm0, (%rdi)
-  ; CHECK-NEXT: retq
+; SSE-LABEL: test_fold_canonicalize_p0_f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movl $0, (%rdi)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: test_fold_canonicalize_p0_f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movl $0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fold_canonicalize_p0_f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl $0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_fold_canonicalize_p0_f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movl $0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: test_fold_canonicalize_p0_f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movl $0, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_fold_canonicalize_p0_f32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    movl $0, (%rdi)
+; AVX512BW-NEXT:    retq
   %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
 }
 
 define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
-  ; CHECK-LAEBL: .LCPI7_0:
-  ; CHECK: .long	0x80000000                      # float -0
-  ; CHECK-LAEBL: test_fold_canonicalize_n0_f32
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: vmovss	.LCPI7_0(%rip), %xmm0
-  ; CHECK-NEXT: vmovss	%xmm0, (%rdi)
-  ; CHECK-NEXT: retq
+; SSE-LABEL: test_fold_canonicalize_n0_f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: test_fold_canonicalize_n0_f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fold_canonicalize_n0_f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_fold_canonicalize_n0_f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: test_fold_canonicalize_n0_f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: test_fold_canonicalize_n0_f32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; AVX512BW-NEXT:    retq
   %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -108,27 +299,84 @@ define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
 
 
 define void @v_test_canonicalize_p90_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
-  ; CHECK-LAEBL: .LCPI8_0:
-  ; CHECK: .long	0x42b40000                      # float 90
-  ; CHECK-LAEBL: v_test_canonicalize_p90_x86_fp80
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: flds	.LCPI8_0(%rip)
-  ; CHECK-NEXT: fstpt	(%rdi)
-  ; CHECK-NEXT: retq
+; SSE-LABEL: v_test_canonicalize_p90_x86_fp80:
+; SSE:       # %bb.0:
+; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; SSE-NEXT:    fstpt (%rdi)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_p90_x86_fp80:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; SSE2-NEXT:    fstpt (%rdi)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_p90_x86_fp80:
+; AVX:       # %bb.0:
+; AVX-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX-NEXT:    fstpt (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_p90_x86_fp80:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX2-NEXT:    fstpt (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_p90_x86_fp80:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX512F-NEXT:    fstpt (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_p90_x86_fp80:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX512BW-NEXT:    fstpt (%rdi)
+; AVX512BW-NEXT:    retq
   %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000)
   store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
   ret void
 }
 
 define void @v_test_canonicalize_p3__half(half addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI9_0:
-  ; CHECK: .short	0x4200                          # half 3
-  ; CHECK-LABEL: v_test_canonicalize_p3__half:
-  ; CHECK:       # %bb.0: # %entry
-  ; CHECK-NEXT:   vmovsh	.LCPI9_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsh	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; SSE-LABEL: v_test_canonicalize_p3__half:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    pextrw $0, %xmm0, %eax
+; SSE-NEXT:    movw %ax, (%rdi)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_p3__half:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rdi)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_p3__half:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_p3__half:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_p3__half:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_p3__half:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX512BW-NEXT:    retq
 entry:
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH4200)
   store half %canonicalized, half addrspace(1)* %out
@@ -136,13 +384,41 @@ entry:
 }
 
 define void @v_test_canonicalize_p3_f64(double addrspace(1)* %out) #1 {
-  ; CHECK-LABEL: .LCPI10_0:
-  ; CHECK: .quad	0x4008000000000000              # double 3
-  ; CHECK-LAEBL: v_test_canonicalize_p3_f64
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT:       vmovsd	.LCPI10_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
+; SSE-LABEL: v_test_canonicalize_p3_f64:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
+; SSE-NEXT:    movq %rax, (%rdi)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_p3_f64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
+; SSE2-NEXT:    movq %rax, (%rdi)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_p3_f64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
+; AVX-NEXT:    movq %rax, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_p3_f64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
+; AVX2-NEXT:    movq %rax, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_p3_f64:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
+; AVX512F-NEXT:    movq %rax, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_p3_f64:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
+; AVX512BW-NEXT:    movq %rax, (%rdi)
+; AVX512BW-NEXT:    retq
 entry:
   %canonicalized = call double @llvm.canonicalize.f64(double 3.0)
   store double %canonicalized, double addrspace(1)* %out
@@ -150,15 +426,35 @@ entry:
 }
 
 define void @v_test_canonicalize_p3__bfloat(bfloat addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI11_0:
-  ; CHECK: .long	0x40400000                      # float 3
-  ; CHECK-LABEL: v_test_canonicalize_p3__bfloat:
-  ; CHECK:       # %bb.0: # %entry
-  ; CHECK-NEXT: 	vmovss	.LCPI11_0(%rip), %xmm0          # xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
-  ; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
-  ; CHECK-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; SSE-LABEL: v_test_canonicalize_p3__bfloat:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movw $16448, (%rdi) # imm = 0x4040
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_p3__bfloat:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movw $16448, (%rdi) # imm = 0x4040
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_p3__bfloat:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    movw $16448, (%rdi) # imm = 0x4040
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_p3__bfloat:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    movw $16448, (%rdi) # imm = 0x4040
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_p3__bfloat:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    movw $16448, (%rdi) # imm = 0x4040
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_p3__bfloat:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    movw $16448, (%rdi) # imm = 0x4040
+; AVX512BW-NEXT:    retq
 entry:
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 3.0)
   store bfloat %canonicalized, bfloat addrspace(1)* %out
@@ -166,15 +462,35 @@ entry:
 }
 
 define void @v_test_canonicalize_n3__bfloat(bfloat addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI12_0:
-  ; CHECK: .long	0xc0400000                      # float -3
-  ; CHECK-LABEL: v_test_canonicalize_n3__bfloat:
-  ; CHECK:       # %bb.0: # %entry
-  ; CHECK-NEXT: 	vmovss	.LCPI12_0(%rip), %xmm0          # xmm0 = [-3.0E+0,0.0E+0,0.0E+0,0.0E+0]
-  ; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
-  ; CHECK-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; SSE-LABEL: v_test_canonicalize_n3__bfloat:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_n3__bfloat:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_n3__bfloat:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_n3__bfloat:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_n3__bfloat:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_n3__bfloat:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
+; AVX512BW-NEXT:    retq
 entry:
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -3.0)
   store bfloat %canonicalized, bfloat addrspace(1)* %out
@@ -182,29 +498,86 @@ entry:
 }
 
 define void @v_test_canonicalize_n90_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
-  ; CHECK-LAEBL: .LCPI13_0:
-  ; CHECK: .long	0xc2b40000                      # float -90
-  ; CHECK-LAEBL: v_test_canonicalize_n90_x86_fp80
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: flds	.LCPI13_0(%rip)
-  ; CHECK-NEXT: fstpt	(%rdi)
-  ; CHECK-NEXT: retq
+; SSE-LABEL: v_test_canonicalize_n90_x86_fp80:
+; SSE:       # %bb.0:
+; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; SSE-NEXT:    fstpt (%rdi)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_n90_x86_fp80:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; SSE2-NEXT:    fstpt (%rdi)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_n90_x86_fp80:
+; AVX:       # %bb.0:
+; AVX-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX-NEXT:    fstpt (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_n90_x86_fp80:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX2-NEXT:    fstpt (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_n90_x86_fp80:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX512F-NEXT:    fstpt (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_n90_x86_fp80:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; AVX512BW-NEXT:    fstpt (%rdi)
+; AVX512BW-NEXT:    retq
   %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xKC005B400000000000000)
   store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
   ret void
 }
 
 define void @v_test_canonicalize_n3__half(half addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI14_0:
-  ; CHECK: .short	0xc200                          # half -3
-  ; CHECK-LABEL: v_test_canonicalize_n3__half:
-  ; CHECK:       # %bb.0: # %entry
-  ; CHECK-NEXT:   vmovsh	.LCPI14_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsh	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; SSE-LABEL: v_test_canonicalize_n3__half:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    pextrw $0, %xmm0, %eax
+; SSE-NEXT:    movw %ax, (%rdi)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_n3__half:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rdi)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_n3__half:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_n3__half:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_n3__half:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_n3__half:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX512BW-NEXT:    retq
 entry:
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHC200)
   store half %canonicalized, half addrspace(1)* %out
   ret void
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/X86/canonicalize-subnormals.ll b/llvm/test/CodeGen/X86/canonicalize-subnormals.ll
index 8e7e04c2a67dc8..034da96271eb85 100644
--- a/llvm/test/CodeGen/X86/canonicalize-subnormals.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-subnormals.ll
@@ -1,30 +1,269 @@
-; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck %s
-; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL %s 
-; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
+; RUN: llc -mattr=sse2 -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-SSE2 %s
+; RUN: llc -mattr=sse2 -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-SSE2 %s
+; RUN: llc -mattr=sse2 -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-SSE2 %s
+; RUN: llc -mattr=+avx -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-AVX %s
+; RUN: llc -mattr=+avx -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-AVX %s
+; RUN: llc -mattr=+avx -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-AVX %s
+; RUN: llc -mattr=+avx2 -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-AVX2 %s
+; RUN: llc -mattr=+avx2 -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-AVX2 %s
+; RUN: llc -mattr=+avx2 -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-AVX2 %s
+; RUN: llc -mattr=+avx512f -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-AVX512F %s
+; RUN: llc -mattr=+avx512f -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-AVX512F %s
+; RUN: llc -mattr=+avx512f -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-AVX512F %s
+; RUN: llc -mattr=+avx512bw -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-AVX512BW %s
+; RUN: llc -mattr=+avx512bw -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-AVX512BW %s
+; RUN: llc -mattr=+avx512bw -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-AVX512BW %s
 
-define void @canonicalize_denormal1_f32_pre_sign(float addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI0_0:
-  ; CHECK: 	.long	0x80000000                      # float -0
-  ; CHECK-LABEL: canonicalize_denormal1_f32_pre_sign:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmovss	.LCPI0_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovss	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
+define double @test_bad_subnormal() {
+; PRE-SIGN-SSE2-LABEL: test_bad_subnormal:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: test_bad_subnormal:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: test_bad_subnormal:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: test_bad_subnormal:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: test_bad_subnormal:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: test_bad_subnormal:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: test_bad_subnormal:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: test_bad_subnormal:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: test_bad_subnormal:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: test_bad_subnormal:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: test_bad_subnormal:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: test_bad_subnormal:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: test_bad_subnormal:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: test_bad_subnormal:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: test_bad_subnormal:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
+  %canon = call double @llvm.canonicalize(double 0x7ff8000000000001) ; quiet NaN (payload 1), not a subnormal
+  ret double %canon
+}
 
+define void @canonicalize_denormal1_f32_pre_sign(float addrspace(1)* %out) {
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f32_pre_sign:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_pre_sign:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_pre_sign:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f32_pre_sign:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_pre_sign:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_pre_sign:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f32_pre_sign:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_pre_sign:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_pre_sign:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f32_pre_sign:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_pre_sign:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_pre_sign:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f32_pre_sign:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_pre_sign:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_pre_sign:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_f64_pre_sign(double addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI1_0:
-  ; CHECK: 	.quad	0x8000000000000000              # double -0
-  ; CHECK-LABEL: canonicalize_denormal1_f64_pre_sign:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmovsd	.LCPI1_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f64_pre_sign:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_pre_sign:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_pre_sign:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f64_pre_sign:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_pre_sign:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_pre_sign:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f64_pre_sign:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_pre_sign:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_pre_sign:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f64_pre_sign:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_pre_sign:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_pre_sign:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f64_pre_sign:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_pre_sign:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_pre_sign:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -32,141 +271,875 @@ define void @canonicalize_denormal1_f64_pre_sign(double addrspace(1)* %out) {
 
 
 define void @canonicalize_qnan_f64(double addrspace(1)* %out) {
-  ;cCHECK-LABEL: .LCPI2_0:
-  ;cCHECK: 	.quad	0x7ff8000000000000              # double NaN
-  ; CHECK-LABEL: canonicalize_qnan_f64:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmovsd	.LCPI2_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_qnan_f64:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_qnan_f64:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_qnan_f64:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_qnan_f64:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_qnan_f64:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_qnan_f64:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_qnan_f64:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_qnan_f64:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_qnan_f64:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_qnan_f64:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_qnan_f64:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_qnan_f64:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_qnan_f64:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_f64:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_f64:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
   store double %canonicalized, double addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) {
-  ;cCHECK-LABEL: .LCPI3_0:
-  ;cCHECK: 	.quad	0xffffffffffffffff              # double NaN
-  ; CHECK-LABEL: canonicalize_qnan_value_neg1_f64:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmovsd	.LCPI3_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_qnan_value_neg1_f64:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movq $-1, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_qnan_value_neg1_f64:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movq $-1, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_qnan_value_neg1_f64:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movq $-1, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_qnan_value_neg1_f64:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movq $-1, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_qnan_value_neg1_f64:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movq $-1, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_qnan_value_neg1_f64:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movq $-1, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_qnan_value_neg1_f64:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movq $-1, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_qnan_value_neg1_f64:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movq $-1, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_qnan_value_neg1_f64:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movq $-1, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_qnan_value_neg1_f64:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movq $-1, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_qnan_value_neg1_f64:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movq $-1, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_qnan_value_neg1_f64:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movq $-1, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_qnan_value_neg1_f64:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movq $-1, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_value_neg1_f64:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movq $-1, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_value_neg1_f64:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movq $-1, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI4_0:
-  ; CHECK: 	.quad	0xfffffffffffffffe              # double NaN
-  ; CHECK-LABEL: canonicalize_qnan_value_neg2_f64:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmovsd	.LCPI4_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_qnan_value_neg2_f64:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movq $-2, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_qnan_value_neg2_f64:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movq $-2, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_qnan_value_neg2_f64:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movq $-2, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_qnan_value_neg2_f64:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movq $-2, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_qnan_value_neg2_f64:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movq $-2, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_qnan_value_neg2_f64:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movq $-2, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_qnan_value_neg2_f64:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movq $-2, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_qnan_value_neg2_f64:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movq $-2, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_qnan_value_neg2_f64:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movq $-2, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_qnan_value_neg2_f64:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movq $-2, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_qnan_value_neg2_f64:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movq $-2, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_qnan_value_neg2_f64:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movq $-2, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_qnan_value_neg2_f64:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movq $-2, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_value_neg2_f64:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movq $-2, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_value_neg2_f64:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movq $-2, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_snan0_value_f64(double addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI5_0:
-  ; CHECK: 	.quad	0x7ff8000000000000              # double NaN
-  ; CHECK-LABEL: canonicalize_snan0_value_f64:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmovsd	.LCPI5_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_snan0_value_f64:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_snan0_value_f64:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_snan0_value_f64:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_snan0_value_f64:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_snan0_value_f64:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_snan0_value_f64:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_snan0_value_f64:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_snan0_value_f64:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_snan0_value_f64:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_snan0_value_f64:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_snan0_value_f64:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_snan0_value_f64:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_snan0_value_f64:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_snan0_value_f64:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_snan0_value_f64:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_undef(double addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI6_0:
-  ; CHECK: 	.quad	0x7ff8000000000000              # double NaN
-  ; CHECK-LABEL: canonicalize_undef:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmovsd	.LCPI6_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsd	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_undef:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_undef:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_undef:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_undef:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_undef:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_undef:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_undef:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_undef:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_undef:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_undef:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_undef:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_undef:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_undef:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_undef:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_undef:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call double @llvm.canonicalize.f64(double undef)
   store double %canonicalized, double addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_f32_ieee(float addrspace(1)* %out) {
-  ; IEEE-DENORMAL-LABEL: .LCPI7_0:
-  ; IEEE-DENORMAL: 	.long	0x807fffff                      # float -1.17549421E-38
-  ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_f32_ieee:
-  ; IEEE-DENORMAL: # %bb.0:
-  ; IEEE-DENORMAL-NEXT: 	vmovss	.LCPI7_0(%rip), %xmm0
-  ; IEEE-DENORMAL-NEXT: 	vmovss	%xmm0, (%rdi)
-  ; IEEE-DENORMAL-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f32_ieee:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_ieee:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_ieee:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f32_ieee:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_ieee:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_ieee:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f32_ieee:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_ieee:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_ieee:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f32_ieee:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_ieee:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_ieee:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f32_ieee:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_ieee:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_ieee:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_f64_ieee(double addrspace(1)* %out) {
-  ; IEEE-DENORMAL-LABEL: .LCPI8_0:
-  ; IEEE-DENORMAL: 	.quad	0x800fffffffffffff              # double -2.2250738585072009E-308
-  ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_f64_ieee:
-  ; IEEE-DENORMAL: # %bb.0:
-  ; IEEE-DENORMAL-NEXT: 	vmovsd	.LCPI8_0(%rip), %xmm0
-  ; IEEE-DENORMAL-NEXT: 	vmovsd	%xmm0, (%rdi)
-  ; IEEE-DENORMAL-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f64_ieee:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_ieee:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_ieee:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f64_ieee:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_ieee:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_ieee:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f64_ieee:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_ieee:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_ieee:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f64_ieee:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_ieee:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_ieee:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f64_ieee:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_ieee:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_ieee:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_f32_dynamic(float addrspace(1)* %out) {
-  ; DYN-DENORMAL-LABEL: .LCPI9_0:
-  ; DYN-DENORMAL: 	.long	0x807fffff                      # float -1.17549421E-38
-  ; DYN-DENORMAL-LABEL: canonicalize_denormal1_f32_dynamic:
-  ; DYN-DENORMAL: # %bb.0:
-  ; DYN-DENORMAL-NEXT: 	vmovss	.LCPI9_0(%rip), %xmm0
-  ; DYN-DENORMAL-NEXT: 	vmovss	%xmm0, (%rdi)
-  ; DYN-DENORMAL-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f32_dynamic:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_dynamic:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_dynamic:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f32_dynamic:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_dynamic:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_dynamic:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f32_dynamic:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_dynamic:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_dynamic:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f32_dynamic:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_dynamic:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_dynamic:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f32_dynamic:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_dynamic:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_dynamic:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_f64_dynamic(double addrspace(1)* %out) {
-  ; DYN-DENORMAL-LABEL: .LCPI10_0:
-  ; DYN-DENORMAL: 	.quad	0x800fffffffffffff              # double -2.2250738585072009E-308
-  ; DYN-DENORMAL-LABEL: canonicalize_denormal1_f64_dynamic:
-  ; DYN-DENORMAL: # %bb.0:
-  ; DYN-DENORMAL-NEXT: 	vmovsd	.LCPI10_0(%rip), %xmm0
-  ; DYN-DENORMAL-NEXT: 	vmovsd	%xmm0, (%rdi)
-  ; DYN-DENORMAL-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f64_dynamic:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_dynamic:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_dynamic:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f64_dynamic:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_dynamic:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_dynamic:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f64_dynamic:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_dynamic:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_dynamic:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f64_dynamic:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_dynamic:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_dynamic:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f64_dynamic:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_dynamic:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_dynamic:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
+; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_bfloat_pre_sign(bfloat addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI11_0:
-  ; CHECK: 	.long	0x80000000                      # float -0
-  ; CHECK-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmovss	.LCPI11_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
-  ; CHECK-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_pre_sign:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
   store bfloat %canonicalized, bfloat addrspace(1)* %out
   ret void
@@ -174,15 +1147,80 @@ define void @canonicalize_denormal1_bfloat_pre_sign(bfloat addrspace(1)* %out) {
 
 
 define void @canonicalize_denormal1_bfloat_ieee(bfloat addrspace(1)* %out) {
-  ; IEEE-DENORMAL-LABEL: .LCPI12_0:
-  ; IEEE-DENORMAL: 	.long	0x80000000                      # float -0
-  ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_bfloat_ieee:
-  ; IEEE-DENORMAL: # %bb.0:
-  ; IEEE-DENORMAL-NEXT: 	vmovss	.LCPI12_0(%rip), %xmm0
-  ; IEEE-DENORMAL-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
-  ; IEEE-DENORMAL-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
-  ; IEEE-DENORMAL-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_bfloat_ieee:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_ieee:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_ieee:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_bfloat_ieee:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_ieee:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_ieee:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_bfloat_ieee:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_ieee:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_ieee:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_bfloat_ieee:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_ieee:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_ieee:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_bfloat_ieee:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_ieee:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_ieee:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
   store bfloat %canonicalized, bfloat addrspace(1)* %out
   ret void
@@ -190,29 +1228,178 @@ define void @canonicalize_denormal1_bfloat_ieee(bfloat addrspace(1)* %out) {
 
 
 define void @canonicalize_denormal1_bfloat_dynamic(bfloat addrspace(1)* %out) {
-  ; DYN-DENORMAL-LABEL: .LCPI13_0:
-  ; DYN-DENORMAL: 	.long	0x80000000                      # float -0
-  ; DYN-DENORMAL-LABEL: canonicalize_denormal1_bfloat_dynamic:
-  ; DYN-DENORMAL: # %bb.0:
-  ; DYN-DENORMAL-NEXT: 	vmovss	.LCPI13_0(%rip), %xmm0
-  ; DYN-DENORMAL-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
-  ; DYN-DENORMAL-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
-  ; DYN-DENORMAL-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_dynamic:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
   store bfloat %canonicalized, bfloat addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_half_pre_sign(half addrspace(1)* %out) {
-  ; CHECK-LABEL: .LCPI14_0:
-  ; CHECK: 	.short	0x8000                      # half -0
-  ; CHECK-LABEL: canonicalize_denormal1_half_pre_sign:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmovsh	.LCPI14_0(%rip), %xmm0
-  ; CHECK-NEXT: 	vmovsh	%xmm0, (%rdi)
-  ; CHECK-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_half_pre_sign:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; PRE-SIGN-SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; PRE-SIGN-SSE2-NEXT:    movw %ax, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_pre_sign:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; IEEE-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; IEEE-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_pre_sign:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; DYN-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; DYN-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_half_pre_sign:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_pre_sign:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_pre_sign:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_half_pre_sign:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_pre_sign:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_pre_sign:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_half_pre_sign:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_pre_sign:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_pre_sign:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_half_pre_sign:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_pre_sign:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_pre_sign:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -220,68 +1407,482 @@ define void @canonicalize_denormal1_half_pre_sign(half addrspace(1)* %out) {
 
 
 define void @canonicalize_denormal1_half_ieee(half addrspace(1)* %out) {
-  ; IEEE-DENORMAL-LABEL: .LCPI15_0:
-  ; IEEE-DENORMAL: 	.short	0x8000                          # half -0
-  ; IEEE-DENORMAL-LABEL: canonicalize_denormal1_half_ieee:
-  ; IEEE-DENORMAL: # %bb.0:
-  ; IEEE-DENORMAL-NEXT: 	vmovsh	.LCPI15_0(%rip), %xmm0
-  ; IEEE-DENORMAL-NEXT: 	vmovsh	%xmm0, (%rdi)
-  ; IEEE-DENORMAL-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_half_ieee:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; PRE-SIGN-SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; PRE-SIGN-SSE2-NEXT:    movw %ax, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_ieee:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; IEEE-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; IEEE-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_ieee:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; DYN-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; DYN-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_half_ieee:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_ieee:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_ieee:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_half_ieee:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_ieee:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_ieee:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_half_ieee:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_ieee:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_ieee:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_half_ieee:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_ieee:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_ieee:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_half_dynamic(half addrspace(1)* %out) {
-  ; DYN-DENORMAL-LABEL: .LCPI16_0:
-  ; DYN-DENORMAL: 	.short	0x8000                          # half -0
-  ; DYN-DENORMAL-LABEL: canonicalize_denormal1_half_dynamic:
-  ; DYN-DENORMAL: # %bb.0:
-  ; DYN-DENORMAL-NEXT: 	vmovsh	.LCPI16_0(%rip), %xmm0
-  ; DYN-DENORMAL-NEXT: 	vmovsh	%xmm0, (%rdi)
-  ; DYN-DENORMAL-NEXT: 	retq
-
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_half_dynamic:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; PRE-SIGN-SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; PRE-SIGN-SSE2-NEXT:    movw %ax, (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_dynamic:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; IEEE-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; IEEE-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_dynamic:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; DYN-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; DYN-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_half_dynamic:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_dynamic:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_dynamic:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_half_dynamic:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_dynamic:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_dynamic:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_half_dynamic:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_dynamic:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_dynamic:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_half_dynamic:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; PRE-SIGN-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_dynamic:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; IEEE-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_dynamic:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; DYN-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_x86_fp80_pre_sign(x86_fp80 addrspace(1)* %out) {
-    ; CHECK-LAEBL: .LCPI17_0:
-    ; CHECK: .long	0x00000000                      # float 0
-    ; CHECK-LAEBL: canonicalize_denormal1_x86_fp80_pre_sign
-    ; CHECK: # %bb.0:
-    ; CHECK-NEXT: flds	.LCPI17_0(%rip)
-    ; CHECK-NEXT: fstpt	(%rdi)
-    ; CHECK-NEXT: retq
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    fldz
+; PRE-SIGN-SSE2-NEXT:    fstpt (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    fldz
+; PRE-SIGN-AVX-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    fldz
+; PRE-SIGN-AVX2-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    fldz
+; PRE-SIGN-AVX512F-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    fldz
+; PRE-SIGN-AVX512BW-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
     %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
     store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
     ret void
 }
 
 define void @canonicalize_denormal1_x86_fp80_dynamic(x86_fp80 addrspace(1)* %out) {
-  ; DYN-DENORMAL-LAEBL: .LCPI17_0:
-  ; DYN-DENORMAL: .quad	0x0000000000000001              # x86_fp80 3.64519953188247460253E-4951
-  ; DYN-DENORMAL-LAEBL: canonicalize_denormal1_x86_fp80_dynamic
-  ; DYN-DENORMAL: # %bb.0:
-  ; DYN-DENORMAL-NEXT: fldt	.LCPI17_0(%rip)
-  ; DYN-DENORMAL-NEXT: fstpt	(%rdi)
-  ; DYN-DENORMAL-NEXT: retq
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    fldz
+; PRE-SIGN-SSE2-NEXT:    fstpt (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    fldz
+; PRE-SIGN-AVX-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    fldz
+; PRE-SIGN-AVX2-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    fldz
+; PRE-SIGN-AVX512F-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    fldz
+; PRE-SIGN-AVX512BW-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
   store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
   ret void
 }
 
 define void @canonicalize_denormal1_x86_fp80_ieee(x86_fp80 addrspace(1)* %out) {
-  ; IEEE-DENORMAL-LAEBL: .LCPI17_0:
-  ; IEEE-DENORMAL: .quad	0x0000000000000001              # x86_fp80 3.64519953188247460253E-4951
-  ; IEEE-DENORMAL-LAEBL: canonicalize_denormal1_x86_fp80_ieee
-  ; IEEE-DENORMAL: # %bb.0:
-  ; IEEE-DENORMAL-NEXT: fldt	.LCPI17_0(%rip)
-  ; IEEE-DENORMAL-NEXT: fstpt	(%rdi)
-  ; IEEE-DENORMAL-NEXT: retq
+; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; PRE-SIGN-SSE2:       # %bb.0:
+; PRE-SIGN-SSE2-NEXT:    fldz
+; PRE-SIGN-SSE2-NEXT:    fstpt (%rdi)
+; PRE-SIGN-SSE2-NEXT:    retq
+;
+; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; IEEE-DENORMAL-SSE2:       # %bb.0:
+; IEEE-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-SSE2-NEXT:    retq
+;
+; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; DYN-DENORMAL-SSE2:       # %bb.0:
+; DYN-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-SSE2-NEXT:    retq
+;
+; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; PRE-SIGN-AVX:       # %bb.0:
+; PRE-SIGN-AVX-NEXT:    fldz
+; PRE-SIGN-AVX-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; IEEE-DENORMAL-AVX:       # %bb.0:
+; IEEE-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX-NEXT:    retq
+;
+; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; DYN-DENORMAL-AVX:       # %bb.0:
+; DYN-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX-NEXT:    retq
+;
+; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; PRE-SIGN-AVX2:       # %bb.0:
+; PRE-SIGN-AVX2-NEXT:    fldz
+; PRE-SIGN-AVX2-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX2-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; IEEE-DENORMAL-AVX2:       # %bb.0:
+; IEEE-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX2-NEXT:    retq
+;
+; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; DYN-DENORMAL-AVX2:       # %bb.0:
+; DYN-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX2-NEXT:    retq
+;
+; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; PRE-SIGN-AVX512F:       # %bb.0:
+; PRE-SIGN-AVX512F-NEXT:    fldz
+; PRE-SIGN-AVX512F-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX512F-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; IEEE-DENORMAL-AVX512F:       # %bb.0:
+; IEEE-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX512F-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; DYN-DENORMAL-AVX512F:       # %bb.0:
+; DYN-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX512F-NEXT:    retq
+;
+; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; PRE-SIGN-AVX512BW:       # %bb.0:
+; PRE-SIGN-AVX512BW-NEXT:    fldz
+; PRE-SIGN-AVX512BW-NEXT:    fstpt (%rdi)
+; PRE-SIGN-AVX512BW-NEXT:    retq
+;
+; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; IEEE-DENORMAL-AVX512BW:       # %bb.0:
+; IEEE-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; IEEE-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
+; IEEE-DENORMAL-AVX512BW-NEXT:    retq
+;
+; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_ieee:
+; DYN-DENORMAL-AVX512BW:       # %bb.0:
+; DYN-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; DYN-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
+; DYN-DENORMAL-AVX512BW-NEXT:    retq
   %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
   store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
   ret void
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll
index c1b5dd0dddcd2b..0075386c023618 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll
@@ -1,14 +1,266 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
-; RUN: llc --mcpu=sapphirerapids -mtriple=x86_64 < %s | FileCheck %s
+; RUN: llc -mattr=sse -mtriple=x86_64 < %s | FileCheck %s -check-prefix=SSE
+; RUN: llc -mattr=sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefix=SSE2
+; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX
+; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX2
+; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX512F
+; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX512BW
+
+
+define float @canon_fp32_varargsf32(float %a) {
+; SSE-LABEL: canon_fp32_varargsf32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: canon_fp32_varargsf32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: canon_fp32_varargsf32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: canon_fp32_varargsf32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: canon_fp32_varargsf32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: canon_fp32_varargsf32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    retq
+  %canonicalized = call float @llvm.canonicalize.f32(float %a)
+  ret float %canonicalized
+}
+
+define x86_fp80 @canon_fp32_varargsf80(x86_fp80 %a) {
+; SSE-LABEL: canon_fp32_varargsf80:
+; SSE:       # %bb.0:
+; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: canon_fp32_varargsf80:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: canon_fp32_varargsf80:
+; AVX:       # %bb.0:
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: canon_fp32_varargsf80:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: canon_fp32_varargsf80:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: canon_fp32_varargsf80:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    retq
+  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %a)
+  ret x86_fp80 %canonicalized
+}
+
+define bfloat @canon_fp32_varargsbf16(bfloat %a) {
+; SSE-LABEL: canon_fp32_varargsbf16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: canon_fp32_varargsbf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: canon_fp32_varargsbf16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: canon_fp32_varargsbf16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: canon_fp32_varargsbf16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: canon_fp32_varargsbf16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    retq
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %a)
+  ret bfloat %canonicalized
+}
 
 define half @complex_canonicalize_fmul_half(half %a, half %b) {
-; CHECK-LABEL: complex_canonicalize_fmul_half:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT: 	vsubsh	%xmm1, %xmm0, %xmm0
-; CHECK-NEXT: 	vaddsh	%xmm1, %xmm0, %xmm2
-; CHECK-NEXT: 	vsubsh	%xmm0, %xmm2, %xmm0
-; CHECK-NEXT: 	vsubsh	%xmm1, %xmm0, %xmm0
-; CHECK-NEXT: 	retq
+; SSE-LABEL: complex_canonicalize_fmul_half:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    movss %xmm1, (%rsp) # 4-byte Spill
+; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movss (%rsp), %xmm0 # 4-byte Reload
+; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    subss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    callq __truncsfhf2@PLT
+; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    callq __truncsfhf2@PLT
+; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    callq __truncsfhf2@PLT
+; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    callq __truncsfhf2@PLT
+; SSE-NEXT:    popq %rax
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: complex_canonicalize_fmul_half:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    movss %xmm1, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movss (%rsp), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    subss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: complex_canonicalize_fmul_half:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    vmovss %xmm1, (%rsp) # 4-byte Spill
+; AVX-NEXT:    callq __extendhfsf2@PLT
+; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    vmovss (%rsp), %xmm0 # 4-byte Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __extendhfsf2@PLT
+; AVX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; AVX-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    callq __extendhfsf2@PLT
+; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    callq __extendhfsf2@PLT
+; AVX-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    callq __extendhfsf2@PLT
+; AVX-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: complex_canonicalize_fmul_half:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    vmovss %xmm1, (%rsp) # 4-byte Spill
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT:    vmovss (%rsp), %xmm0 # 4-byte Reload
+; AVX2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; AVX2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    popq %rax
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: complex_canonicalize_fmul_half:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    vpextrw $0, %xmm1, %eax
+; AVX512F-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX512F-NEXT:    vmovd %ecx, %xmm0
+; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT:    vmovd %eax, %xmm1
+; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: complex_canonicalize_fmul_half:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    vpextrw $0, %xmm1, %eax
+; AVX512BW-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX512BW-NEXT:    vmovd %ecx, %xmm0
+; AVX512BW-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %eax, %xmm1
+; AVX512BW-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512BW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; AVX512BW-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512BW-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT:    retq
 entry:
 
   %mul1 = fsub half %a, %b
@@ -20,17 +272,72 @@ entry:
 }
 
 define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
+; SSE-LABEL: complex_canonicalize_fmul_x86_fp80:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE-NEXT:    fsub %st(1), %st
+; SSE-NEXT:    fld %st(0)
+; SSE-NEXT:    fadd %st(2), %st
+; SSE-NEXT:    fsubp %st, %st(1)
+; SSE-NEXT:    fsubp %st, %st(1)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: complex_canonicalize_fmul_x86_fp80:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT:    fsub %st(1), %st
+; SSE2-NEXT:    fld %st(0)
+; SSE2-NEXT:    fadd %st(2), %st
+; SSE2-NEXT:    fsubp %st, %st(1)
+; SSE2-NEXT:    fsubp %st, %st(1)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: complex_canonicalize_fmul_x86_fp80:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fsub %st(1), %st
+; AVX-NEXT:    fld %st(0)
+; AVX-NEXT:    fadd %st(2), %st
+; AVX-NEXT:    fsubp %st, %st(1)
+; AVX-NEXT:    fsubp %st, %st(1)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: complex_canonicalize_fmul_x86_fp80:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    fsub %st(1), %st
+; AVX2-NEXT:    fld %st(0)
+; AVX2-NEXT:    fadd %st(2), %st
+; AVX2-NEXT:    fsubp %st, %st(1)
+; AVX2-NEXT:    fsubp %st, %st(1)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: complex_canonicalize_fmul_x86_fp80:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    fsub %st(1), %st
+; AVX512F-NEXT:    fld %st(0)
+; AVX512F-NEXT:    fadd %st(2), %st
+; AVX512F-NEXT:    fsubp %st, %st(1)
+; AVX512F-NEXT:    fsubp %st, %st(1)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: complex_canonicalize_fmul_x86_fp80:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    fsub %st(1), %st
+; AVX512BW-NEXT:    fld %st(0)
+; AVX512BW-NEXT:    fadd %st(2), %st
+; AVX512BW-NEXT:    fsubp %st, %st(1)
+; AVX512BW-NEXT:    fsubp %st, %st(1)
+; AVX512BW-NEXT:    retq
 entry:
-  ; CHECK-LABEL: complex_canonicalize_fmul_x86_fp80
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	fldt	24(%rsp)
-  ; CHECK-NEXT: 	fldt	8(%rsp)
-  ; CHECK-NEXT: 	fsub	%st(1), %st
-  ; CHECK-NEXT: 	fld	%st(0)
-  ; CHECK-NEXT: 	fadd	%st(2), %st
-  ; CHECK-NEXT: 	fsubp	%st, %st(1)
-  ; CHECK-NEXT: 	fsubp	%st, %st(1)
-  ; CHECK-NEXT: 	retq
 
   %mul1 = fsub x86_fp80 %a, %b
   %add = fadd x86_fp80 %mul1, %b
@@ -41,39 +348,203 @@ entry:
 }
 
 define bfloat @complex_canonicalize_fmul_bfloat(bfloat %a, bfloat %b) {
-; CHECK-LABEL: complex_canonicalize_fmul_bfloat:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT: 	vmovw	%xmm0, %eax
-; CHECK-NEXT: 	vmovw	%xmm1, %ecx
-; CHECK-NEXT: 	shll	$16, %ecx
-; CHECK-NEXT: 	vmovd	%ecx, %xmm0
-; CHECK-NEXT: 	shll	$16, %eax
-; CHECK-NEXT: 	vmovd	%eax, %xmm1
-; CHECK-NEXT: 	vsubss	%xmm0, %xmm1, %xmm1
-; CHECK-NEXT: 	vcvtneps2bf16	%xmm1, %xmm1
-; CHECK-NEXT: 	vmovw	%xmm1, %eax
-; CHECK-NEXT: 	shll	$16, %eax
-; CHECK-NEXT: 	vmovd	%eax, %xmm1
-; CHECK-NEXT: 	vaddss	%xmm0, %xmm1, %xmm2
-; CHECK-NEXT: 	vcvtneps2bf16	%xmm2, %xmm2
-; CHECK-NEXT: 	vmovw	%xmm2, %eax
-; CHECK-NEXT: 	shll	$16, %eax
-; CHECK-NEXT: 	vmovd	%eax, %xmm2
-; CHECK-NEXT: 	vsubss	%xmm1, %xmm2, %xmm1
-; CHECK-NEXT: 	vcvtneps2bf16	%xmm1, %xmm1
-; CHECK-NEXT: 	vmovw	%xmm1, %eax
-; CHECK-NEXT: 	shll	$16, %eax
-; CHECK-NEXT: 	vmovd	%eax, %xmm1
-; CHECK-NEXT: 	vcvtneps2bf16	%xmm1, %xmm1
-; CHECK-NEXT: 	vmovw	%xmm1, %eax
-; CHECK-NEXT: 	shll	$16, %eax
-; CHECK-NEXT: 	vmovd	%eax, %xmm1
-; CHECK-NEXT: 	vsubss	%xmm0, %xmm1, %xmm0
-; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
-; CHECK-NEXT: 	vmovw	%xmm0, %eax
-; CHECK-NEXT: 	vmovw	%eax, %xmm0
-; CHECK-NEXT: 	retq
-
+; SSE-LABEL: complex_canonicalize_fmul_bfloat:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    pextrw $0, %xmm0, %eax
+; SSE-NEXT:    pextrw $0, %xmm1, %ecx
+; SSE-NEXT:    shll $16, %ecx
+; SSE-NEXT:    movd %ecx, %xmm1
+; SSE-NEXT:    movd %xmm1, (%rsp) # 4-byte Folded Spill
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    callq __truncsfbf2@PLT
+; SSE-NEXT:    pextrw $0, %xmm0, %eax
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    callq __truncsfbf2@PLT
+; SSE-NEXT:    pextrw $0, %xmm0, %eax
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    callq __truncsfbf2@PLT
+; SSE-NEXT:    pextrw $0, %xmm0, %eax
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    callq __truncsfbf2@PLT
+; SSE-NEXT:    popq %rax
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: complex_canonicalize_fmul_bfloat:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    pextrw $0, %xmm1, %ecx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    movd %xmm1, (%rsp) # 4-byte Folded Spill
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    subss %xmm1, %xmm0
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: complex_canonicalize_fmul_bfloat:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    shll $16, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vmovd %xmm1, (%rsp) # 4-byte Folded Spill
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    shll $16, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    callq __truncsfbf2@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: complex_canonicalize_fmul_bfloat:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX2-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX2-NEXT:    shll $16, %ecx
+; AVX2-NEXT:    vmovd %ecx, %xmm1
+; AVX2-NEXT:    vmovd %xmm1, (%rsp) # 4-byte Folded Spill
+; AVX2-NEXT:    shll $16, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    callq __truncsfbf2@PLT
+; AVX2-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX2-NEXT:    shll $16, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX2-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    callq __truncsfbf2@PLT
+; AVX2-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX2-NEXT:    shll $16, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    callq __truncsfbf2@PLT
+; AVX2-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX2-NEXT:    shll $16, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    callq __truncsfbf2@PLT
+; AVX2-NEXT:    popq %rax
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: complex_canonicalize_fmul_bfloat:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    .cfi_def_cfa_offset 16
+; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512F-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX512F-NEXT:    shll $16, %ecx
+; AVX512F-NEXT:    vmovd %ecx, %xmm1
+; AVX512F-NEXT:    vmovd %xmm1, (%rsp) # 4-byte Folded Spill
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    vmovd %eax, %xmm0
+; AVX512F-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    callq __truncsfbf2@PLT
+; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    vmovd %eax, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX512F-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX512F-NEXT:    callq __truncsfbf2@PLT
+; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    vmovd %eax, %xmm0
+; AVX512F-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX512F-NEXT:    callq __truncsfbf2@PLT
+; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    vmovd %eax, %xmm0
+; AVX512F-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX512F-NEXT:    callq __truncsfbf2@PLT
+; AVX512F-NEXT:    popq %rax
+; AVX512F-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: complex_canonicalize_fmul_bfloat:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    pushq %rax
+; AVX512BW-NEXT:    .cfi_def_cfa_offset 16
+; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512BW-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX512BW-NEXT:    shll $16, %ecx
+; AVX512BW-NEXT:    vmovd %ecx, %xmm1
+; AVX512BW-NEXT:    vmovd %xmm1, (%rsp) # 4-byte Folded Spill
+; AVX512BW-NEXT:    shll $16, %eax
+; AVX512BW-NEXT:    vmovd %eax, %xmm0
+; AVX512BW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    callq __truncsfbf2@PLT
+; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512BW-NEXT:    shll $16, %eax
+; AVX512BW-NEXT:    vmovd %eax, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX512BW-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX512BW-NEXT:    callq __truncsfbf2@PLT
+; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512BW-NEXT:    shll $16, %eax
+; AVX512BW-NEXT:    vmovd %eax, %xmm0
+; AVX512BW-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX512BW-NEXT:    callq __truncsfbf2@PLT
+; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512BW-NEXT:    shll $16, %eax
+; AVX512BW-NEXT:    vmovd %eax, %xmm0
+; AVX512BW-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX512BW-NEXT:    callq __truncsfbf2@PLT
+; AVX512BW-NEXT:    popq %rax
+; AVX512BW-NEXT:    .cfi_def_cfa_offset 8
+; AVX512BW-NEXT:    retq
 entry:
 
   %sub1 = fsub bfloat %a, %b
@@ -85,14 +556,60 @@ entry:
 }
 
 define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
+; SSE-LABEL: canonicalize_fp64:
+; SSE:       # %bb.0: # %start
+; SSE-NEXT:    movapd %xmm0, %xmm2
+; SSE-NEXT:    cmpunordsd %xmm0, %xmm2
+; SSE-NEXT:    movapd %xmm2, %xmm3
+; SSE-NEXT:    andpd %xmm1, %xmm3
+; SSE-NEXT:    maxsd %xmm0, %xmm1
+; SSE-NEXT:    andnpd %xmm1, %xmm2
+; SSE-NEXT:    orpd %xmm3, %xmm2
+; SSE-NEXT:    movapd %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: canonicalize_fp64:
+; SSE2:       # %bb.0: # %start
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    cmpunordsd %xmm0, %xmm2
+; SSE2-NEXT:    movapd %xmm2, %xmm3
+; SSE2-NEXT:    andpd %xmm1, %xmm3
+; SSE2-NEXT:    maxsd %xmm0, %xmm1
+; SSE2-NEXT:    andnpd %xmm1, %xmm2
+; SSE2-NEXT:    orpd %xmm3, %xmm2
+; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: canonicalize_fp64:
+; AVX:       # %bb.0: # %start
+; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: canonicalize_fp64:
+; AVX2:       # %bb.0: # %start
+; AVX2-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: canonicalize_fp64:
+; AVX512F:       # %bb.0: # %start
+; AVX512F-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
+; AVX512F-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512F-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT:    vmovapd %xmm2, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: canonicalize_fp64:
+; AVX512BW:       # %bb.0: # %start
+; AVX512BW-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
+; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT:    vmovapd %xmm2, %xmm0
+; AVX512BW-NEXT:    retq
 start:
-  ; CHECK-LABEL: canonicalize_fp64:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: 	vmaxsd	%xmm0, %xmm1, %xmm2
-  ; CHECK-NEXT: 	vcmpunordsd	%xmm0, %xmm0, %k1
-  ; CHECK-NEXT: 	vmovsd	%xmm1, %xmm2, %xmm2 {%k1}
-  ; CHECK-NEXT: 	vmovapd	%xmm2, %xmm0
-  ; CHECK-NEXT: 	retq
 
   %c = fcmp olt double %a, %b
   %d = fcmp uno double %a, 0.000000e+00
@@ -103,14 +620,60 @@ start:
 }
 
 define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
+; SSE-LABEL: canonicalize_fp32:
+; SSE:       # %bb.0: # %start
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    cmpunordss %xmm0, %xmm2
+; SSE-NEXT:    movaps %xmm2, %xmm3
+; SSE-NEXT:    andps %xmm1, %xmm3
+; SSE-NEXT:    maxss %xmm0, %xmm1
+; SSE-NEXT:    andnps %xmm1, %xmm2
+; SSE-NEXT:    orps %xmm3, %xmm2
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: canonicalize_fp32:
+; SSE2:       # %bb.0: # %start
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    cmpunordss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm1, %xmm3
+; SSE2-NEXT:    maxss %xmm0, %xmm1
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: canonicalize_fp32:
+; AVX:       # %bb.0: # %start
+; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: canonicalize_fp32:
+; AVX2:       # %bb.0: # %start
+; AVX2-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: canonicalize_fp32:
+; AVX512F:       # %bb.0: # %start
+; AVX512F-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
+; AVX512F-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512F-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: canonicalize_fp32:
+; AVX512BW:       # %bb.0: # %start
+; AVX512BW-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
+; AVX512BW-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512BW-NEXT:    retq
 start:
-  ; CHECK-LABEL: canonicalize_fp32:
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: vmaxss %xmm0, %xmm1, %xmm2
-  ; CHECK-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-  ; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-  ; CHECK-NEXT: vmovaps %xmm2, %xmm0
-  ; CHECK-NEXT: retq
 
   %cc = fcmp olt float %aa, %bb
   %dd = fcmp uno float %aa, 0.000000e+00
@@ -121,11 +684,47 @@ start:
 }
 
 define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
-  ; CHECK-LAEBL: v_test_canonicalize_var_f32
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: vmovss	(%rdi), %xmm0                   
-	; CHECK-NEXT: vmovss	%xmm0, (%rdi)
-	; CHECK-NEXT: retq
+; SSE-LABEL: v_test_canonicalize_var_f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT:    mulss (%rdi), %xmm0
+; SSE-NEXT:    movss %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_var_f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT:    mulss (%rdi), %xmm0
+; SSE2-NEXT:    movss %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_var_f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT:    vmulss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovss %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_var_f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; AVX2-NEXT:    vmulss (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vmovss %xmm0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_var_f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; AVX512F-NEXT:    vmulss (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vmovss %xmm0, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_var_f32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; AVX512BW-NEXT:    vmulss (%rdi), %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovss %xmm0, (%rdi)
+; AVX512BW-NEXT:    retq
   %val = load float, float addrspace(1)* %out
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, float addrspace(1)* %out
@@ -133,11 +732,53 @@ define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
 }
 
 define void @v_test_canonicalize_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
-  ; CHECK-LAEBL: v_test_canonicalize_x86_fp80
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: fldt	(%rdi)
-	; CHECK-NEXT: fstpt	(%rdi)
-	; CHECK-NEXT: retq
+; SSE-LABEL: v_test_canonicalize_x86_fp80:
+; SSE:       # %bb.0:
+; SSE-NEXT:    fldt (%rdi)
+; SSE-NEXT:    fld1
+; SSE-NEXT:    fmulp %st, %st(1)
+; SSE-NEXT:    fstpt (%rdi)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_x86_fp80:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    fldt (%rdi)
+; SSE2-NEXT:    fld1
+; SSE2-NEXT:    fmulp %st, %st(1)
+; SSE2-NEXT:    fstpt (%rdi)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_x86_fp80:
+; AVX:       # %bb.0:
+; AVX-NEXT:    fldt (%rdi)
+; AVX-NEXT:    fld1
+; AVX-NEXT:    fmulp %st, %st(1)
+; AVX-NEXT:    fstpt (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_x86_fp80:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    fldt (%rdi)
+; AVX2-NEXT:    fld1
+; AVX2-NEXT:    fmulp %st, %st(1)
+; AVX2-NEXT:    fstpt (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_x86_fp80:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    fldt (%rdi)
+; AVX512F-NEXT:    fld1
+; AVX512F-NEXT:    fmulp %st, %st(1)
+; AVX512F-NEXT:    fstpt (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_x86_fp80:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    fldt (%rdi)
+; AVX512BW-NEXT:    fld1
+; AVX512BW-NEXT:    fmulp %st, %st(1)
+; AVX512BW-NEXT:    fstpt (%rdi)
+; AVX512BW-NEXT:    retq
   %val = load x86_fp80, x86_fp80 addrspace(1)* %out
   %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %val)
   store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
@@ -145,11 +786,127 @@ define void @v_test_canonicalize_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
 }
 
 define void @v_test_canonicalize__half(half addrspace(1)* %out) {
-; CHECK-LABEL: v_test_canonicalize__half:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT: 	vmovsh	(%rdi), %xmm0                   # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: 	vmovsh	%xmm0, (%rdi)
-; CHECK-NEXT: 	retq
+; SSE-LABEL: v_test_canonicalize__half:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    subq $16, %rsp
+; SSE-NEXT:    .cfi_def_cfa_offset 32
+; SSE-NEXT:    .cfi_offset %rbx, -16
+; SSE-NEXT:    movq %rdi, %rbx
+; SSE-NEXT:    pinsrw $0, (%rdi), %xmm0
+; SSE-NEXT:    callq __extendhfsf2 at PLT
+; SSE-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    callq __extendhfsf2 at PLT
+; SSE-NEXT:    mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    callq __truncsfhf2 at PLT
+; SSE-NEXT:    pextrw $0, %xmm0, %eax
+; SSE-NEXT:    movw %ax, (%rbx)
+; SSE-NEXT:    addq $16, %rsp
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize__half:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    subq $16, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    .cfi_offset %rbx, -16
+; SSE2-NEXT:    movq %rdi, %rbx
+; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
+; SSE2-NEXT:    callq __extendhfsf2 at PLT
+; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    callq __extendhfsf2 at PLT
+; SSE2-NEXT:    mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfhf2 at PLT
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rbx)
+; SSE2-NEXT:    addq $16, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize__half:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    subq $16, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    .cfi_offset %rbx, -16
+; AVX-NEXT:    movq %rdi, %rbx
+; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    callq __extendhfsf2 at PLT
+; AVX-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    callq __extendhfsf2 at PLT
+; AVX-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    callq __truncsfhf2 at PLT
+; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
+; AVX-NEXT:    addq $16, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize__half:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    subq $16, %rsp
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    .cfi_offset %rbx, -16
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    callq __extendhfsf2 at PLT
+; AVX2-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    callq __extendhfsf2 at PLT
+; AVX2-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    callq __truncsfhf2 at PLT
+; AVX2-NEXT:    vpextrw $0, %xmm0, (%rbx)
+; AVX2-NEXT:    addq $16, %rsp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize__half:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    movzwl (%rdi), %eax
+; AVX512F-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512F-NEXT:    vmovd %ecx, %xmm0
+; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT:    vmovd %eax, %xmm1
+; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    movw %ax, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize__half:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    movzwl (%rdi), %eax
+; AVX512BW-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512BW-NEXT:    vmovd %ecx, %xmm0
+; AVX512BW-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %eax, %xmm1
+; AVX512BW-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512BW-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512BW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    movw %ax, (%rdi)
+; AVX512BW-NEXT:    retq
 entry:
   %val = load half, half addrspace(1)* %out
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
@@ -158,11 +915,47 @@ entry:
 }
 
 define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
-  ; CHECK-LAEBL: v_test_canonicalize_var_f64
-  ; CHECK: # %bb.0:
-  ; CHECK-NEXT: vmovsd	(%rdi), %xmm0                   # xmm0 = mem[0],zero                   
-	; CHECK-NEXT: vmovsd	%xmm0, (%rdi)
-	; CHECK-NEXT: retq
+; SSE-LABEL: v_test_canonicalize_var_f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; SSE-NEXT:    mulsd (%rdi), %xmm0
+; SSE-NEXT:    movsd %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize_var_f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; SSE2-NEXT:    mulsd (%rdi), %xmm0
+; SSE2-NEXT:    movsd %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize_var_f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; AVX-NEXT:    vmulsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovsd %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize_var_f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; AVX2-NEXT:    vmulsd (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vmovsd %xmm0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize_var_f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; AVX512F-NEXT:    vmulsd (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vmovsd %xmm0, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_var_f64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; AVX512BW-NEXT:    vmulsd (%rdi), %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovsd %xmm0, (%rdi)
+; AVX512BW-NEXT:    retq
   %val = load double, double addrspace(1)* %out
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
   store double %canonicalized, double addrspace(1)* %out
@@ -170,15 +963,29 @@ define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
 }
 
 define void @v_test_canonicalize__bfloat(bfloat addrspace(1)* %out) {
-; CHECK-LABEL: v_test_canonicalize__bfloat:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT: 	movzwl	(%rdi), %eax
-; CHECK-NEXT: 	shll	$16, %eax
-; CHECK-NEXT: 	vmovd	%eax, %xmm0
-; CHECK-NEXT: 	vcvtneps2bf16	%xmm0, %xmm0
-; CHECK-NEXT: 	vpextrw	$0, %xmm0, (%rdi)
-; CHECK-NEXT: 	retq
-
+; SSE-LABEL: v_test_canonicalize__bfloat:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    retq
+;
+; SSE2-LABEL: v_test_canonicalize__bfloat:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: v_test_canonicalize__bfloat:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: v_test_canonicalize__bfloat:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: v_test_canonicalize__bfloat:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: v_test_canonicalize__bfloat:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    retq
 entry:
   %val = load bfloat, bfloat addrspace(1)* %out
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val)
@@ -190,4 +997,4 @@ declare double @llvm.canonicalize.f64(double)
 declare float @llvm.canonicalize.f32(float)
 declare bfloat @llvm.canonicalize.bf16(bfloat)
 declare x86_fp80 @llvm.canonicalize.f80(x86_fp80)
-declare half @llvm.canonicalize.f16(half)
\ No newline at end of file
+declare half @llvm.canonicalize.f16(half)

>From d40523083c236995400c5d44444fdfdd20560d71 Mon Sep 17 00:00:00 2001
From: Pawan Anil Nirpal <pawan.anil.nirpal at intel.com>
Date: Fri, 6 Sep 2024 10:49:32 +0200
Subject: [PATCH 3/4] addressing review comments, simplify conditions

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4b9f0326e4d46d..c9227be5d6c297 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58077,14 +58077,12 @@ SDValue combineConstantCanonicalize(SDNode *Node, SelectionDAG &DAG) {
           DAG.getMachineFunction().getDenormalMode(C.getSemantics());
       assert((Mode != DenormalMode::getPositiveZero()) &&
              "Positive denormal mode is not valid for X86 target.");
-      if (Mode == DenormalMode::getPreserveSign()) {
-        SDValue SDZero =
-            DAG.getConstantFP((C.isNegative() ? -0.0 : 0.0), dl, VT);
-        return SDZero;
-      } else if (Mode == DenormalMode::getIEEE()) {
+      if (Mode == DenormalMode::getPreserveSign())
+        return DAG.getConstantFP((C.isNegative() ? -0.0 : 0.0), dl, VT);
+      if (Mode == DenormalMode::getIEEE() || Mode == DenormalMode::getDynamic())
         return Operand;
-      }
-    } else if (C.isNaN() && C.isSignaling()) {
+    }
+    if (C.isNaN() && C.isSignaling()) {
       APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
       SDValue QuitNaN = DAG.getConstantFP(CanonicalQNaN, dl, VT);
       return QuitNaN;
@@ -58094,7 +58092,7 @@ SDValue combineConstantCanonicalize(SDNode *Node, SelectionDAG &DAG) {
 }
 
 SDValue findLastStrictOpChain(SDNode *N, SelectionDAG &DAG) {
-  assert(N!=nullptr && "Trying to find last chain for a NULL Node");
+  assert(N != nullptr && "Trying to find last chain for a NULL Node");
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     SDValue Op = N->getOperand(i);
     if (Op.getValueType() == MVT::Other && Op.getNode()->isStrictFPOpcode())
@@ -58104,10 +58102,10 @@ SDValue findLastStrictOpChain(SDNode *N, SelectionDAG &DAG) {
 }
 
 bool isNonCanonicalizingOperation(SDNode *N) {
-  assert(N!=nullptr && "Trying to check canonical opcode for a NULL Node");
+  assert(N != nullptr && "Trying to check canonical opcode for a NULL Node");
   unsigned Opc = N->getOpcode();
   switch (Opc) {
-  // Ensure these are the exasustive set of non canonicalizing opcodes. Add more
+  // Ensure these are the exhaustive set of non canonicalizing opcodes. Add more
   // if not.
   case X86::RET:
   case ISD::STORE:

>From 317dd6f68da03de47fe525fbc5453494ec16c059 Mon Sep 17 00:00:00 2001
From: Pawan Anil Nirpal <pawan.anil.nirpal at intel.com>
Date: Tue, 10 Sep 2024 15:10:50 +0200
Subject: [PATCH 4/4] Removed constant folding for another patch, moving undef
 canonicalize to generic dag combiner

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   10 +
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   94 +-
 .../CodeGen/X86/canonicalize-constants.ll     |  583 -----
 .../CodeGen/X86/canonicalize-subnormals.ll    | 1888 -----------------
 llvm/test/CodeGen/X86/canonicalize-vars.ll    |  441 ++--
 5 files changed, 164 insertions(+), 2852 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/canonicalize-constants.ll
 delete mode 100644 llvm/test/CodeGen/X86/canonicalize-subnormals.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 37272a09b336ab..ef989ea3190274 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1980,6 +1980,16 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::FREEZE:             return visitFREEZE(N);
   case ISD::GET_FPENV_MEM:      return visitGET_FPENV_MEM(N);
   case ISD::SET_FPENV_MEM:      return visitSET_FPENV_MEM(N);
+  case ISD::FCANONICALIZE:{
+    SDValue Operand = N->getOperand(0);
+    EVT VT = Operand.getValueType();
+    SDLoc dl(N);
+    if(Operand.isUndef()){
+      APFloat CanonicalQNaN = APFloat::getQNaN(VT.getFltSemantics());
+      return DAG.getConstantFP(CanonicalQNaN, dl, VT);
+    }
+    break;
+  }
   case ISD::VECREDUCE_FADD:
   case ISD::VECREDUCE_FMUL:
   case ISD::VECREDUCE_ADD:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c9227be5d6c297..8dcb52ac611d64 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58066,81 +58066,11 @@ static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-SDValue combineConstantCanonicalize(SDNode *Node, SelectionDAG &DAG) {
-  SDValue Operand = Node->getOperand(0);
-  SDLoc dl(Node);
-  EVT VT = Operand.getValueType();
-  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Operand)) {
-    const APFloat &C = CFP->getValueAPF();
-    if (C.isDenormal()) {
-      DenormalMode Mode =
-          DAG.getMachineFunction().getDenormalMode(C.getSemantics());
-      assert((Mode != DenormalMode::getPositiveZero()) &&
-             "Positive denormal mode is not valid for X86 target.");
-      if (Mode == DenormalMode::getPreserveSign())
-        return DAG.getConstantFP((C.isNegative() ? -0.0 : 0.0), dl, VT);
-      if (Mode == DenormalMode::getIEEE() || Mode == DenormalMode::getDynamic())
-        return Operand;
-    }
-    if (C.isNaN() && C.isSignaling()) {
-      APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
-      SDValue QuitNaN = DAG.getConstantFP(CanonicalQNaN, dl, VT);
-      return QuitNaN;
-    }
-  }
-  return Operand;
-}
-
-SDValue findLastStrictOpChain(SDNode *N, SelectionDAG &DAG) {
-  assert(N != nullptr && "Trying to find last chain for a NULL Node");
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    SDValue Op = N->getOperand(i);
-    if (Op.getValueType() == MVT::Other && Op.getNode()->isStrictFPOpcode())
-      return Op;
-  }
-  return DAG.getEntryNode();
-}
-
-bool isNonCanonicalizingOperation(SDNode *N) {
-  assert(N != nullptr && "Trying to check canonical opcode for a NULL Node");
-  unsigned Opc = N->getOpcode();
-  switch (Opc) {
-  // Ensure these are the exhaustive set of non canonicalizing opcodes. Add more
-  // if not.
-  case X86::RET:
-  case ISD::STORE:
-  case ISD::SETCC:
-  case X86ISD::FCMP:
-    return true;
-  default:
-    return false;
-  }
-}
-
-bool isUsedByNonCanonicalizingOp(SDNode *N) {
-  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
-       ++UI) {
-    SDNode *User = *UI;
-    if (isNonCanonicalizingOperation(User))
-      return true;
-  }
-  return false;
-}
-
 SDValue combineCanonicalize(SDNode *Node, SelectionDAG &DAG) {
   SDValue Operand = Node->getOperand(0);
   EVT VT = Operand.getValueType();
   SDLoc dl(Node);
 
-  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Operand))
-    return combineConstantCanonicalize(Node, DAG);
-
-  if (Operand.isUndef()) {
-    APFloat CanonicalQNaN = APFloat::getQNaN(VT.getFltSemantics());
-    SDValue QuitNaN = DAG.getConstantFP(CanonicalQNaN, dl, VT);
-    return QuitNaN;
-  }
-
   // Canonicalize scalar variable FP Nodes.
   SDValue MulNode;
   SDValue One;
@@ -58157,28 +58087,18 @@ SDValue combineCanonicalize(SDNode *Node, SelectionDAG &DAG) {
     One = DAG.getConstantFP(Val, dl, VT);
   } else {
     // Is it better to assert? when we encounter an unknown FP type,Than to
-    // just replace with the operand! As this might be our last attempt at
-    // legalization.
+    // just replace with the operand!
     return Operand;
   }
 
-  // Store, return, and compare are non-canonicalizing operations. If a
-  // non-canonicalizing operation uses the rest then mul * 1.0 must be generated
-  // int those cases.
-  // TODO: For now Preventing bf16 from generating strict_fmul as it
-  // leads to a crash SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
+  // TODO: Fix crash for bf16 when generating strict_fmul as it
+  // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
   // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
   // promote this operator's result!
-  if (isUsedByNonCanonicalizingOp(Node) && VT != MVT::bf16) {
-    SDValue Chain = findLastStrictOpChain(Node, DAG);
-    // TODO : Follow-up with tablegen pattern to generate mul * 1.0.
-    SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
-                                     {Chain, One, Operand});
-
-    return StrictFmul;
-  }
-
-  return Operand;
+  SDValue Chain = DAG.getEntryNode();
+  SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
+                                   {Chain, One, Operand});
+  return StrictFmul;
   // TODO : Hanlde vectors.
 }
 
diff --git a/llvm/test/CodeGen/X86/canonicalize-constants.ll b/llvm/test/CodeGen/X86/canonicalize-constants.ll
deleted file mode 100644
index b1a9733806d40e..00000000000000
--- a/llvm/test/CodeGen/X86/canonicalize-constants.ll
+++ /dev/null
@@ -1,583 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
-; RUN: llc -mattr=sse -mtriple=x86_64 < %s | FileCheck %s -check-prefix=SSE
-; RUN: llc -mattr=sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefix=SSE2
-; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX
-; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX2
-; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX512F
-; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefix=AVX512BW
-
-define float @canon_fp32() {
-; SSE-LABEL: canon_fp32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: canon_fp32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: canon_fp32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: canon_fp32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: canon_fp32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: canon_fp32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; AVX512BW-NEXT:    retq
-  %canonicalized = call float @llvm.canonicalize.f32(float 3.0)
-  ret float %canonicalized
-}
-
-define half @canon_fp16() {
-; SSE-LABEL: canon_fp16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: canon_fp16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: canon_fp16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: canon_fp16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: canon_fp16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: canon_fp16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT:    retq
-  %canonicalized = call half @llvm.canonicalize.f16(half 0xH4200) ; half 3.0
-  ret half %canonicalized
-}
-
-define double @canon_fp64() {
-; SSE-LABEL: canon_fp64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: canon_fp64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: canon_fp64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: canon_fp64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: canon_fp64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: canon_fp64:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; AVX512BW-NEXT:    retq
-  %canonicalized = call double @llvm.canonicalize.f64(double 3.0)
-  ret double %canonicalized
-}
-
-define x86_fp80 @canon_fp80() {
-; SSE-LABEL: canon_fp80:
-; SSE:       # %bb.0:
-; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: canon_fp80:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: canon_fp80:
-; AVX:       # %bb.0:
-; AVX-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: canon_fp80:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: canon_fp80:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: canon_fp80:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX512BW-NEXT:    retq
-  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000) ; 90.0
-  ret x86_fp80 %canonicalized
-}
-
-
-define x86_fp80 @complex_canonicalize_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
-; SSE-LABEL: complex_canonicalize_x86_fp80:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
-; SSE-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: complex_canonicalize_x86_fp80:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    fldt {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: complex_canonicalize_x86_fp80:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
-; AVX-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: complex_canonicalize_x86_fp80:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    fldt {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: complex_canonicalize_x86_fp80:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    fldt {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: complex_canonicalize_x86_fp80:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    fldt {{[0-9]+}}(%rsp)
-; AVX512BW-NEXT:    fsubrs {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX512BW-NEXT:    retq
-entry:
-  %mul1 = fsub x86_fp80 %a, %b
-  %add = fadd x86_fp80 %mul1, %b
-  %mul2 = fsub x86_fp80 %add, %mul1
-  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000)
-  %result = fsub x86_fp80 %canonicalized, %b
-  ret x86_fp80 %result
-}
-
-define double @complex_canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
-; SSE-LABEL: complex_canonicalize_fp64:
-; SSE:       # %bb.0: # %start
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: complex_canonicalize_fp64:
-; SSE2:       # %bb.0: # %start
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: complex_canonicalize_fp64:
-; AVX:       # %bb.0: # %start
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: complex_canonicalize_fp64:
-; AVX2:       # %bb.0: # %start
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: complex_canonicalize_fp64:
-; AVX512F:       # %bb.0: # %start
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: complex_canonicalize_fp64:
-; AVX512BW:       # %bb.0: # %start
-; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
-; AVX512BW-NEXT:    retq
-start:
-  %c = fcmp olt double %a, %b
-  %d = fcmp uno double %a, 0.000000e+00
-  %or.cond.i.i = or i1 %d, %c
-  %e = select i1 %or.cond.i.i, double %b, double %a
-  %f = tail call double @llvm.canonicalize.f64(double 3.0) #2
-  ret double %f
-}
-
-define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
-; SSE-LABEL: test_fold_canonicalize_p0_f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movl $0, (%rdi)
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: test_fold_canonicalize_p0_f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movl $0, (%rdi)
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: test_fold_canonicalize_p0_f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl $0, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: test_fold_canonicalize_p0_f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl $0, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test_fold_canonicalize_p0_f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movl $0, (%rdi)
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: test_fold_canonicalize_p0_f32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    movl $0, (%rdi)
-; AVX512BW-NEXT:    retq
-  %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
-  store float %canonicalized, float addrspace(1)* %out
-  ret void
-}
-
-define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
-; SSE-LABEL: test_fold_canonicalize_n0_f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: test_fold_canonicalize_n0_f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: test_fold_canonicalize_n0_f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: test_fold_canonicalize_n0_f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test_fold_canonicalize_n0_f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: test_fold_canonicalize_n0_f32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; AVX512BW-NEXT:    retq
-  %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
-  store float %canonicalized, float addrspace(1)* %out
-  ret void
-}
-
-
-define void @v_test_canonicalize_p90_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
-; SSE-LABEL: v_test_canonicalize_p90_x86_fp80:
-; SSE:       # %bb.0:
-; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; SSE-NEXT:    fstpt (%rdi)
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: v_test_canonicalize_p90_x86_fp80:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; SSE2-NEXT:    fstpt (%rdi)
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: v_test_canonicalize_p90_x86_fp80:
-; AVX:       # %bb.0:
-; AVX-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX-NEXT:    fstpt (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: v_test_canonicalize_p90_x86_fp80:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX2-NEXT:    fstpt (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: v_test_canonicalize_p90_x86_fp80:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX512F-NEXT:    fstpt (%rdi)
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: v_test_canonicalize_p90_x86_fp80:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX512BW-NEXT:    fstpt (%rdi)
-; AVX512BW-NEXT:    retq
-  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK4005B400000000000000)
-  store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
-  ret void
-}
-
-define void @v_test_canonicalize_p3__half(half addrspace(1)* %out) {
-; SSE-LABEL: v_test_canonicalize_p3__half:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    movw %ax, (%rdi)
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: v_test_canonicalize_p3__half:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    movw %ax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: v_test_canonicalize_p3__half:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: v_test_canonicalize_p3__half:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: v_test_canonicalize_p3__half:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: v_test_canonicalize_p3__half:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX512BW-NEXT:    retq
-entry:
-  %canonicalized = call half @llvm.canonicalize.f16(half 0xH4200)
-  store half %canonicalized, half addrspace(1)* %out
-  ret void
-}
-
-define void @v_test_canonicalize_p3_f64(double addrspace(1)* %out) #1 {
-; SSE-LABEL: v_test_canonicalize_p3_f64:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
-; SSE-NEXT:    movq %rax, (%rdi)
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: v_test_canonicalize_p3_f64:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
-; SSE2-NEXT:    movq %rax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: v_test_canonicalize_p3_f64:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
-; AVX-NEXT:    movq %rax, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: v_test_canonicalize_p3_f64:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
-; AVX2-NEXT:    movq %rax, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: v_test_canonicalize_p3_f64:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
-; AVX512F-NEXT:    movq %rax, (%rdi)
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: v_test_canonicalize_p3_f64:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
-; AVX512BW-NEXT:    movq %rax, (%rdi)
-; AVX512BW-NEXT:    retq
-entry:
-  %canonicalized = call double @llvm.canonicalize.f64(double 3.0)
-  store double %canonicalized, double addrspace(1)* %out
-  ret void
-}
-
-define void @v_test_canonicalize_p3__bfloat(bfloat addrspace(1)* %out) {
-; SSE-LABEL: v_test_canonicalize_p3__bfloat:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movw $16448, (%rdi) # imm = 0x4040
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: v_test_canonicalize_p3__bfloat:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movw $16448, (%rdi) # imm = 0x4040
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: v_test_canonicalize_p3__bfloat:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    movw $16448, (%rdi) # imm = 0x4040
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: v_test_canonicalize_p3__bfloat:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movw $16448, (%rdi) # imm = 0x4040
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: v_test_canonicalize_p3__bfloat:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    movw $16448, (%rdi) # imm = 0x4040
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: v_test_canonicalize_p3__bfloat:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    movw $16448, (%rdi) # imm = 0x4040
-; AVX512BW-NEXT:    retq
-entry:
-  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 3.0)
-  store bfloat %canonicalized, bfloat addrspace(1)* %out
-  ret void
-}
-
-define void @v_test_canonicalize_n3__bfloat(bfloat addrspace(1)* %out) {
-; SSE-LABEL: v_test_canonicalize_n3__bfloat:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: v_test_canonicalize_n3__bfloat:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: v_test_canonicalize_n3__bfloat:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: v_test_canonicalize_n3__bfloat:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: v_test_canonicalize_n3__bfloat:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: v_test_canonicalize_n3__bfloat:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    movw $-16320, (%rdi) # imm = 0xC040
-; AVX512BW-NEXT:    retq
-entry:
-  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -3.0)
-  store bfloat %canonicalized, bfloat addrspace(1)* %out
-  ret void
-}
-
-define void @v_test_canonicalize_n90_x86_fp80(x86_fp80 addrspace(1)* %out) #1 {
-; SSE-LABEL: v_test_canonicalize_n90_x86_fp80:
-; SSE:       # %bb.0:
-; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; SSE-NEXT:    fstpt (%rdi)
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: v_test_canonicalize_n90_x86_fp80:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; SSE2-NEXT:    fstpt (%rdi)
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: v_test_canonicalize_n90_x86_fp80:
-; AVX:       # %bb.0:
-; AVX-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX-NEXT:    fstpt (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: v_test_canonicalize_n90_x86_fp80:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX2-NEXT:    fstpt (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: v_test_canonicalize_n90_x86_fp80:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX512F-NEXT:    fstpt (%rdi)
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: v_test_canonicalize_n90_x86_fp80:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; AVX512BW-NEXT:    fstpt (%rdi)
-; AVX512BW-NEXT:    retq
-  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xKC005B400000000000000)
-  store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
-  ret void
-}
-
-define void @v_test_canonicalize_n3__half(half addrspace(1)* %out) {
-; SSE-LABEL: v_test_canonicalize_n3__half:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    movw %ax, (%rdi)
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: v_test_canonicalize_n3__half:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    movw %ax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: v_test_canonicalize_n3__half:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: v_test_canonicalize_n3__half:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: v_test_canonicalize_n3__half:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: v_test_canonicalize_n3__half:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX512BW-NEXT:    retq
-entry:
-  %canonicalized = call half @llvm.canonicalize.f16(half 0xHC200)
-  store half %canonicalized, half addrspace(1)* %out
-  ret void
-}
diff --git a/llvm/test/CodeGen/X86/canonicalize-subnormals.ll b/llvm/test/CodeGen/X86/canonicalize-subnormals.ll
deleted file mode 100644
index 034da96271eb85..00000000000000
--- a/llvm/test/CodeGen/X86/canonicalize-subnormals.ll
+++ /dev/null
@@ -1,1888 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
-; RUN: llc -mattr=sse2 -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-SSE2 %s
-; RUN: llc -mattr=sse2 -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-SSE2 %s
-; RUN: llc -mattr=sse2 -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-SSE2 %s
-; RUN: llc -mattr=+avx -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-AVX %s
-; RUN: llc -mattr=+avx -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-AVX %s
-; RUN: llc -mattr=+avx -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-AVX %s
-; RUN: llc -mattr=+avx2 -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-AVX2 %s
-; RUN: llc -mattr=+avx2 -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-AVX2 %s
-; RUN: llc -mattr=+avx2 -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-AVX2 %s
-; RUN: llc -mattr=+avx512f -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-AVX512F %s
-; RUN: llc -mattr=+avx512f -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-AVX512F %s
-; RUN: llc -mattr=+avx512f -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-AVX512F %s
-; RUN: llc -mattr=+avx512bw -mtriple=x86_64 -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=PRE-SIGN-AVX512BW %s
-; RUN: llc -mattr=+avx512bw -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=IEEE-DENORMAL-AVX512BW %s
-; RUN: llc -mattr=+avx512bw -mtriple=x86_64 -denormal-fp-math=ieee < %s | FileCheck -check-prefix=DYN-DENORMAL-AVX512BW %s
-
-define double @test_bad_subnormal() {
-; PRE-SIGN-SSE2-LABEL: test_bad_subnormal:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: test_bad_subnormal:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: test_bad_subnormal:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: test_bad_subnormal:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: test_bad_subnormal:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: test_bad_subnormal:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: test_bad_subnormal:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: test_bad_subnormal:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: test_bad_subnormal:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: test_bad_subnormal:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: test_bad_subnormal:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: test_bad_subnormal:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: test_bad_subnormal:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: test_bad_subnormal:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: test_bad_subnormal:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canon = call double @llvm.canonicalize(double 0x7ff8000000000001) ; Nan
-  ret double %canon
-}
-
-define void @canonicalize_denormal1_f32_pre_sign(float addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f32_pre_sign:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_pre_sign:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_pre_sign:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f32_pre_sign:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_pre_sign:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_pre_sign:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f32_pre_sign:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_pre_sign:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_pre_sign:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f32_pre_sign:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_pre_sign:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_pre_sign:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f32_pre_sign:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_pre_sign:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_pre_sign:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
-  store float %canonicalized, float addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_f64_pre_sign(double addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f64_pre_sign:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_pre_sign:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_pre_sign:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f64_pre_sign:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_pre_sign:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_pre_sign:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f64_pre_sign:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_pre_sign:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_pre_sign:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f64_pre_sign:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_pre_sign:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_pre_sign:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f64_pre_sign:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_pre_sign:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_pre_sign:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
-  store double %canonicalized, double addrspace(1)* %out
-  ret void
-}
-
-
-define void @canonicalize_qnan_f64(double addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_qnan_f64:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_qnan_f64:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_qnan_f64:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_qnan_f64:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_qnan_f64:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_qnan_f64:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_qnan_f64:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_qnan_f64:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_qnan_f64:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_qnan_f64:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_qnan_f64:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_qnan_f64:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_qnan_f64:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_f64:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_f64:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
-  store double %canonicalized, double addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_qnan_value_neg1_f64:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movq $-1, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_qnan_value_neg1_f64:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movq $-1, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_qnan_value_neg1_f64:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movq $-1, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_qnan_value_neg1_f64:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movq $-1, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_qnan_value_neg1_f64:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movq $-1, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_qnan_value_neg1_f64:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movq $-1, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_qnan_value_neg1_f64:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movq $-1, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_qnan_value_neg1_f64:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movq $-1, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_qnan_value_neg1_f64:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movq $-1, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_qnan_value_neg1_f64:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movq $-1, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_qnan_value_neg1_f64:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movq $-1, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_qnan_value_neg1_f64:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movq $-1, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_qnan_value_neg1_f64:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movq $-1, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_value_neg1_f64:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movq $-1, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_value_neg1_f64:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movq $-1, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
-  store double %canonicalized, double addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_qnan_value_neg2_f64:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movq $-2, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_qnan_value_neg2_f64:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movq $-2, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_qnan_value_neg2_f64:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movq $-2, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_qnan_value_neg2_f64:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movq $-2, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_qnan_value_neg2_f64:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movq $-2, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_qnan_value_neg2_f64:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movq $-2, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_qnan_value_neg2_f64:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movq $-2, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_qnan_value_neg2_f64:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movq $-2, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_qnan_value_neg2_f64:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movq $-2, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_qnan_value_neg2_f64:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movq $-2, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_qnan_value_neg2_f64:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movq $-2, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_qnan_value_neg2_f64:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movq $-2, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_qnan_value_neg2_f64:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movq $-2, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_value_neg2_f64:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movq $-2, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_qnan_value_neg2_f64:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movq $-2, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
-  store double %canonicalized, double addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_snan0_value_f64(double addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_snan0_value_f64:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_snan0_value_f64:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_snan0_value_f64:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_snan0_value_f64:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_snan0_value_f64:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_snan0_value_f64:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_snan0_value_f64:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_snan0_value_f64:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_snan0_value_f64:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_snan0_value_f64:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_snan0_value_f64:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_snan0_value_f64:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_snan0_value_f64:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_snan0_value_f64:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_snan0_value_f64:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
-  store double %canonicalized, double addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_undef(double addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_undef:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_undef:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_undef:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_undef:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_undef:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_undef:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_undef:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_undef:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_undef:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_undef:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_undef:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_undef:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_undef:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_undef:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_undef:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call double @llvm.canonicalize.f64(double undef)
-  store double %canonicalized, double addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_f32_ieee(float addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f32_ieee:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_ieee:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_ieee:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f32_ieee:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_ieee:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_ieee:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f32_ieee:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_ieee:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_ieee:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f32_ieee:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_ieee:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_ieee:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f32_ieee:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_ieee:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_ieee:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
-  store float %canonicalized, float addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_f64_ieee(double addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f64_ieee:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_ieee:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_ieee:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f64_ieee:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_ieee:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_ieee:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f64_ieee:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_ieee:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_ieee:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f64_ieee:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_ieee:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_ieee:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f64_ieee:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_ieee:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_ieee:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
-  store double %canonicalized, double addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_f32_dynamic(float addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f32_dynamic:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_dynamic:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f32_dynamic:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f32_dynamic:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_dynamic:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f32_dynamic:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f32_dynamic:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_dynamic:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f32_dynamic:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f32_dynamic:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_dynamic:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f32_dynamic:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f32_dynamic:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movl $-2147483648, (%rdi) # imm = 0x80000000
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_dynamic:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f32_dynamic:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movl $-2139095041, (%rdi) # imm = 0x807FFFFF
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
-  store float %canonicalized, float addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_f64_dynamic(double addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_f64_dynamic:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-SSE2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_dynamic:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_f64_dynamic:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-SSE2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_f64_dynamic:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_dynamic:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_f64_dynamic:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_f64_dynamic:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX2-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_dynamic:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_f64_dynamic:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX2-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_f64_dynamic:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX512F-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_dynamic:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_f64_dynamic:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX512F-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_f64_dynamic:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; PRE-SIGN-AVX512BW-NEXT:    movq %rax, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_dynamic:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; IEEE-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_f64_dynamic:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movabsq $-9218868437227405313, %rax # imm = 0x800FFFFFFFFFFFFF
-; DYN-DENORMAL-AVX512BW-NEXT:    movq %rax, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
-  store double %canonicalized, double addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_bfloat_pre_sign(bfloat addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_pre_sign:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
-  store bfloat %canonicalized, bfloat addrspace(1)* %out
-  ret void
-}
-
-
-define void @canonicalize_denormal1_bfloat_ieee(bfloat addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_bfloat_ieee:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_ieee:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_ieee:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_bfloat_ieee:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_ieee:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_ieee:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_bfloat_ieee:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_ieee:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_ieee:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_bfloat_ieee:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_ieee:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_ieee:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_bfloat_ieee:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_ieee:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_ieee:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
-  store bfloat %canonicalized, bfloat addrspace(1)* %out
-  ret void
-}
-
-
-define void @canonicalize_denormal1_bfloat_dynamic(bfloat addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_bfloat_dynamic:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    movw $-32768, (%rdi) # imm = 0x8000
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 32768 to bfloat))
-  store bfloat %canonicalized, bfloat addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_half_pre_sign(half addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_half_pre_sign:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; PRE-SIGN-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; PRE-SIGN-SSE2-NEXT:    movw %ax, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_pre_sign:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; IEEE-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; IEEE-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_pre_sign:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; DYN-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; DYN-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_half_pre_sign:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_pre_sign:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_pre_sign:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_half_pre_sign:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_pre_sign:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_pre_sign:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_half_pre_sign:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_pre_sign:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_pre_sign:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_half_pre_sign:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_pre_sign:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_pre_sign:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
-  store half %canonicalized, half addrspace(1)* %out
-  ret void
-}
-
-
-define void @canonicalize_denormal1_half_ieee(half addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_half_ieee:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; PRE-SIGN-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; PRE-SIGN-SSE2-NEXT:    movw %ax, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_ieee:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; IEEE-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; IEEE-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_ieee:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; DYN-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; DYN-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_half_ieee:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_ieee:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_ieee:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_half_ieee:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_ieee:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_ieee:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_half_ieee:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_ieee:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_ieee:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_half_ieee:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_ieee:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_ieee:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
-  store half %canonicalized, half addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_half_dynamic(half addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_half_dynamic:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; PRE-SIGN-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; PRE-SIGN-SSE2-NEXT:    movw %ax, (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_dynamic:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; IEEE-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; IEEE-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_half_dynamic:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; DYN-DENORMAL-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; DYN-DENORMAL-SSE2-NEXT:    movw %ax, (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_half_dynamic:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_dynamic:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_half_dynamic:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_half_dynamic:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_dynamic:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_half_dynamic:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX2-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_half_dynamic:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_dynamic:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_half_dynamic:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_half_dynamic:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; PRE-SIGN-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_dynamic:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; IEEE-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_half_dynamic:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DYN-DENORMAL-AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 32768 to half))
-  store half %canonicalized, half addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_x86_fp80_pre_sign(x86_fp80 addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    fldz
-; PRE-SIGN-SSE2-NEXT:    fstpt (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    fldz
-; PRE-SIGN-AVX-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    fldz
-; PRE-SIGN-AVX2-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    fldz
-; PRE-SIGN-AVX512F-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    fldz
-; PRE-SIGN-AVX512BW-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_pre_sign:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-    %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
-    store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
-    ret void
-}
-
-define void @canonicalize_denormal1_x86_fp80_dynamic(x86_fp80 addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    fldz
-; PRE-SIGN-SSE2-NEXT:    fstpt (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    fldz
-; PRE-SIGN-AVX-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    fldz
-; PRE-SIGN-AVX2-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    fldz
-; PRE-SIGN-AVX512F-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    fldz
-; PRE-SIGN-AVX512BW-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_dynamic:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
-  store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
-  ret void
-}
-
-define void @canonicalize_denormal1_x86_fp80_ieee(x86_fp80 addrspace(1)* %out) {
-; PRE-SIGN-SSE2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; PRE-SIGN-SSE2:       # %bb.0:
-; PRE-SIGN-SSE2-NEXT:    fldz
-; PRE-SIGN-SSE2-NEXT:    fstpt (%rdi)
-; PRE-SIGN-SSE2-NEXT:    retq
-;
-; IEEE-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; IEEE-DENORMAL-SSE2:       # %bb.0:
-; IEEE-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-SSE2-NEXT:    retq
-;
-; DYN-DENORMAL-SSE2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; DYN-DENORMAL-SSE2:       # %bb.0:
-; DYN-DENORMAL-SSE2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-SSE2-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-SSE2-NEXT:    retq
-;
-; PRE-SIGN-AVX-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; PRE-SIGN-AVX:       # %bb.0:
-; PRE-SIGN-AVX-NEXT:    fldz
-; PRE-SIGN-AVX-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; IEEE-DENORMAL-AVX:       # %bb.0:
-; IEEE-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX-NEXT:    retq
-;
-; DYN-DENORMAL-AVX-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; DYN-DENORMAL-AVX:       # %bb.0:
-; DYN-DENORMAL-AVX-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX-NEXT:    retq
-;
-; PRE-SIGN-AVX2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; PRE-SIGN-AVX2:       # %bb.0:
-; PRE-SIGN-AVX2-NEXT:    fldz
-; PRE-SIGN-AVX2-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX2-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; IEEE-DENORMAL-AVX2:       # %bb.0:
-; IEEE-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX2-NEXT:    retq
-;
-; DYN-DENORMAL-AVX2-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; DYN-DENORMAL-AVX2:       # %bb.0:
-; DYN-DENORMAL-AVX2-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX2-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX2-NEXT:    retq
-;
-; PRE-SIGN-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; PRE-SIGN-AVX512F:       # %bb.0:
-; PRE-SIGN-AVX512F-NEXT:    fldz
-; PRE-SIGN-AVX512F-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX512F-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; IEEE-DENORMAL-AVX512F:       # %bb.0:
-; IEEE-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX512F-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512F-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; DYN-DENORMAL-AVX512F:       # %bb.0:
-; DYN-DENORMAL-AVX512F-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX512F-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX512F-NEXT:    retq
-;
-; PRE-SIGN-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; PRE-SIGN-AVX512BW:       # %bb.0:
-; PRE-SIGN-AVX512BW-NEXT:    fldz
-; PRE-SIGN-AVX512BW-NEXT:    fstpt (%rdi)
-; PRE-SIGN-AVX512BW-NEXT:    retq
-;
-; IEEE-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; IEEE-DENORMAL-AVX512BW:       # %bb.0:
-; IEEE-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; IEEE-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
-; IEEE-DENORMAL-AVX512BW-NEXT:    retq
-;
-; DYN-DENORMAL-AVX512BW-LABEL: canonicalize_denormal1_x86_fp80_ieee:
-; DYN-DENORMAL-AVX512BW:       # %bb.0:
-; DYN-DENORMAL-AVX512BW-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
-; DYN-DENORMAL-AVX512BW-NEXT:    fstpt (%rdi)
-; DYN-DENORMAL-AVX512BW-NEXT:    retq
-  %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 0xK00000000000000000001)
-  store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out
-  ret void
-}
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll
index 0075386c023618..a9564496324703 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll
@@ -10,26 +10,32 @@
 define float @canon_fp32_varargsf32(float %a) {
 ; SSE-LABEL: canon_fp32_varargsf32:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; SSE2-LABEL: canon_fp32_varargsf32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: canon_fp32_varargsf32:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: canon_fp32_varargsf32:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: canon_fp32_varargsf32:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: canon_fp32_varargsf32:
 ; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512BW-NEXT:    retq
   %canonicalized = call float @llvm.canonicalize.f32(float %a)
   ret float %canonicalized
@@ -39,90 +45,80 @@ define x86_fp80 @canon_fp32_varargsf80(x86_fp80 %a) {
 ; SSE-LABEL: canon_fp32_varargsf80:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE-NEXT:    fld1
+; SSE-NEXT:    fmulp %st, %st(1)
 ; SSE-NEXT:    retq
 ;
 ; SSE2-LABEL: canon_fp32_varargsf80:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT:    fld1
+; SSE2-NEXT:    fmulp %st, %st(1)
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: canon_fp32_varargsf80:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fld1
+; AVX-NEXT:    fmulp %st, %st(1)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: canon_fp32_varargsf80:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX2-NEXT:    fld1
+; AVX2-NEXT:    fmulp %st, %st(1)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: canon_fp32_varargsf80:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    fld1
+; AVX512F-NEXT:    fmulp %st, %st(1)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: canon_fp32_varargsf80:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    fld1
+; AVX512BW-NEXT:    fmulp %st, %st(1)
 ; AVX512BW-NEXT:    retq
   %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %a)
   ret x86_fp80 %canonicalized
 }
 
-define bfloat @canon_fp32_varargsbf16(bfloat %a) {
-; SSE-LABEL: canon_fp32_varargsbf16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: canon_fp32_varargsbf16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: canon_fp32_varargsbf16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: canon_fp32_varargsbf16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: canon_fp32_varargsbf16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: canon_fp32_varargsbf16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    retq
-  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %a)
-  ret bfloat %canonicalized
-}
-
 define half @complex_canonicalize_fmul_half(half %a, half %b) {
 ; SSE-LABEL: complex_canonicalize_fmul_half:
 ; SSE:       # %bb.0: # %entry
 ; SSE-NEXT:    pushq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 16
-; SSE-NEXT:    movss %xmm1, (%rsp) # 4-byte Spill
+; SSE-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE-NEXT:    movss (%rsp), %xmm0 # 4-byte Reload
+; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
-; SSE-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
 ; SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    subss %xmm0, %xmm1
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    callq __truncsfhf2@PLT
 ; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
 ; SSE-NEXT:    callq __truncsfhf2@PLT
 ; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
 ; SSE-NEXT:    callq __truncsfhf2@PLT
 ; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    mulss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    callq __truncsfhf2@PLT
+; SSE-NEXT:    callq __extendhfsf2@PLT
+; SSE-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
 ; SSE-NEXT:    callq __truncsfhf2@PLT
 ; SSE-NEXT:    popq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 8
@@ -132,27 +128,33 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    pushq %rax
 ; SSE2-NEXT:    .cfi_def_cfa_offset 16
-; SSE2-NEXT:    movss %xmm1, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT:    movss (%rsp), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
-; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
 ; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    subss %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    callq __truncsfhf2@PLT
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
 ; SSE2-NEXT:    callq __truncsfhf2@PLT
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
 ; SSE2-NEXT:    callq __truncsfhf2@PLT
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    mulss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
 ; SSE2-NEXT:    callq __truncsfhf2@PLT
 ; SSE2-NEXT:    popq %rax
 ; SSE2-NEXT:    .cfi_def_cfa_offset 8
@@ -162,26 +164,32 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) {
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    pushq %rax
 ; AVX-NEXT:    .cfi_def_cfa_offset 16
-; AVX-NEXT:    vmovss %xmm1, (%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX-NEXT:    callq __extendhfsf2 at PLT
-; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX-NEXT:    vmovss (%rsp), %xmm0 # 4-byte Reload
+; AVX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    callq __extendhfsf2 at PLT
-; AVX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
-; AVX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
 ; AVX-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    callq __truncsfhf2 at PLT
 ; AVX-NEXT:    callq __extendhfsf2 at PLT
-; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX-NEXT:    vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
 ; AVX-NEXT:    callq __truncsfhf2 at PLT
 ; AVX-NEXT:    callq __extendhfsf2 at PLT
-; AVX-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
 ; AVX-NEXT:    callq __truncsfhf2 at PLT
 ; AVX-NEXT:    callq __extendhfsf2 at PLT
-; AVX-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    callq __extendhfsf2 at PLT
+; AVX-NEXT:    vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    callq __truncsfhf2 at PLT
+; AVX-NEXT:    callq __extendhfsf2 at PLT
+; AVX-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
 ; AVX-NEXT:    callq __truncsfhf2 at PLT
 ; AVX-NEXT:    popq %rax
 ; AVX-NEXT:    .cfi_def_cfa_offset 8
@@ -191,26 +199,32 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) {
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    pushq %rax
 ; AVX2-NEXT:    .cfi_def_cfa_offset 16
-; AVX2-NEXT:    vmovss %xmm1, (%rsp) # 4-byte Spill
+; AVX2-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX2-NEXT:    callq __extendhfsf2 at PLT
-; AVX2-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT:    vmovss (%rsp), %xmm0 # 4-byte Reload
+; AVX2-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; AVX2-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; AVX2-NEXT:    callq __extendhfsf2 at PLT
-; AVX2-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
-; AVX2-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; AVX2-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
 ; AVX2-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; AVX2-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    callq __truncsfhf2 at PLT
 ; AVX2-NEXT:    callq __extendhfsf2 at PLT
-; AVX2-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT:    vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
 ; AVX2-NEXT:    callq __truncsfhf2 at PLT
 ; AVX2-NEXT:    callq __extendhfsf2 at PLT
-; AVX2-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
 ; AVX2-NEXT:    callq __truncsfhf2 at PLT
 ; AVX2-NEXT:    callq __extendhfsf2 at PLT
-; AVX2-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    callq __extendhfsf2 at PLT
+; AVX2-NEXT:    vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT:    callq __truncsfhf2 at PLT
+; AVX2-NEXT:    callq __extendhfsf2 at PLT
+; AVX2-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
 ; AVX2-NEXT:    callq __truncsfhf2 at PLT
 ; AVX2-NEXT:    popq %rax
 ; AVX2-NEXT:    .cfi_def_cfa_offset 8
@@ -232,6 +246,15 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) {
 ; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX512F-NEXT:    vsubss %xmm0, %xmm2, %xmm0
 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512F-NEXT:    vmovd %eax, %xmm2
+; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT:    vmulss %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -255,6 +278,15 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) {
 ; AVX512BW-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX512BW-NEXT:    vsubss %xmm0, %xmm2, %xmm0
 ; AVX512BW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512BW-NEXT:    vmovd %eax, %xmm2
+; AVX512BW-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512BW-NEXT:    vmulss %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512BW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512BW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -280,6 +312,8 @@ define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
 ; SSE-NEXT:    fld %st(0)
 ; SSE-NEXT:    fadd %st(2), %st
 ; SSE-NEXT:    fsubp %st, %st(1)
+; SSE-NEXT:    fld1
+; SSE-NEXT:    fmulp %st, %st(1)
 ; SSE-NEXT:    fsubp %st, %st(1)
 ; SSE-NEXT:    retq
 ;
@@ -291,6 +325,8 @@ define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
 ; SSE2-NEXT:    fld %st(0)
 ; SSE2-NEXT:    fadd %st(2), %st
 ; SSE2-NEXT:    fsubp %st, %st(1)
+; SSE2-NEXT:    fld1
+; SSE2-NEXT:    fmulp %st, %st(1)
 ; SSE2-NEXT:    fsubp %st, %st(1)
 ; SSE2-NEXT:    retq
 ;
@@ -302,6 +338,8 @@ define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
 ; AVX-NEXT:    fld %st(0)
 ; AVX-NEXT:    fadd %st(2), %st
 ; AVX-NEXT:    fsubp %st, %st(1)
+; AVX-NEXT:    fld1
+; AVX-NEXT:    fmulp %st, %st(1)
 ; AVX-NEXT:    fsubp %st, %st(1)
 ; AVX-NEXT:    retq
 ;
@@ -313,6 +351,8 @@ define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
 ; AVX2-NEXT:    fld %st(0)
 ; AVX2-NEXT:    fadd %st(2), %st
 ; AVX2-NEXT:    fsubp %st, %st(1)
+; AVX2-NEXT:    fld1
+; AVX2-NEXT:    fmulp %st, %st(1)
 ; AVX2-NEXT:    fsubp %st, %st(1)
 ; AVX2-NEXT:    retq
 ;
@@ -324,6 +364,8 @@ define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
 ; AVX512F-NEXT:    fld %st(0)
 ; AVX512F-NEXT:    fadd %st(2), %st
 ; AVX512F-NEXT:    fsubp %st, %st(1)
+; AVX512F-NEXT:    fld1
+; AVX512F-NEXT:    fmulp %st, %st(1)
 ; AVX512F-NEXT:    fsubp %st, %st(1)
 ; AVX512F-NEXT:    retq
 ;
@@ -335,6 +377,8 @@ define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
 ; AVX512BW-NEXT:    fld %st(0)
 ; AVX512BW-NEXT:    fadd %st(2), %st
 ; AVX512BW-NEXT:    fsubp %st, %st(1)
+; AVX512BW-NEXT:    fld1
+; AVX512BW-NEXT:    fmulp %st, %st(1)
 ; AVX512BW-NEXT:    fsubp %st, %st(1)
 ; AVX512BW-NEXT:    retq
 entry:
@@ -347,214 +391,6 @@ entry:
   ret x86_fp80 %result
 }
 
-define bfloat @complex_canonicalize_fmul_bfloat(bfloat %a, bfloat %b) {
-; SSE-LABEL: complex_canonicalize_fmul_bfloat:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    .cfi_def_cfa_offset 16
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    pextrw $0, %xmm1, %ecx
-; SSE-NEXT:    shll $16, %ecx
-; SSE-NEXT:    movd %ecx, %xmm1
-; SSE-NEXT:    movd %xmm1, (%rsp) # 4-byte Folded Spill
-; SSE-NEXT:    shll $16, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    subss %xmm1, %xmm0
-; SSE-NEXT:    callq __truncsfbf2 at PLT
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    shll $16, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
-; SSE-NEXT:    callq __truncsfbf2 at PLT
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    shll $16, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; SSE-NEXT:    callq __truncsfbf2 at PLT
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    shll $16, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
-; SSE-NEXT:    callq __truncsfbf2 at PLT
-; SSE-NEXT:    popq %rax
-; SSE-NEXT:    .cfi_def_cfa_offset 8
-; SSE-NEXT:    retq
-;
-; SSE2-LABEL: complex_canonicalize_fmul_bfloat:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    pushq %rax
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    pextrw $0, %xmm1, %ecx
-; SSE2-NEXT:    shll $16, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm1
-; SSE2-NEXT:    movd %xmm1, (%rsp) # 4-byte Folded Spill
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    subss %xmm1, %xmm0
-; SSE2-NEXT:    callq __truncsfbf2 at PLT
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
-; SSE2-NEXT:    callq __truncsfbf2 at PLT
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; SSE2-NEXT:    callq __truncsfbf2 at PLT
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
-; SSE2-NEXT:    callq __truncsfbf2 at PLT
-; SSE2-NEXT:    popq %rax
-; SSE2-NEXT:    .cfi_def_cfa_offset 8
-; SSE2-NEXT:    retq
-;
-; AVX-LABEL: complex_canonicalize_fmul_bfloat:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    pushq %rax
-; AVX-NEXT:    .cfi_def_cfa_offset 16
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
-; AVX-NEXT:    shll $16, %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm1
-; AVX-NEXT:    vmovd %xmm1, (%rsp) # 4-byte Folded Spill
-; AVX-NEXT:    shll $16, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    callq __truncsfbf2 at PLT
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    shll $16, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; AVX-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX-NEXT:    callq __truncsfbf2 at PLT
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    shll $16, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX-NEXT:    callq __truncsfbf2 at PLT
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    shll $16, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX-NEXT:    callq __truncsfbf2 at PLT
-; AVX-NEXT:    popq %rax
-; AVX-NEXT:    .cfi_def_cfa_offset 8
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: complex_canonicalize_fmul_bfloat:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
-; AVX2-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX2-NEXT:    vpextrw $0, %xmm1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm1
-; AVX2-NEXT:    vmovd %xmm1, (%rsp) # 4-byte Folded Spill
-; AVX2-NEXT:    shll $16, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    callq __truncsfbf2 at PLT
-; AVX2-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX2-NEXT:    shll $16, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; AVX2-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX2-NEXT:    callq __truncsfbf2 at PLT
-; AVX2-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX2-NEXT:    shll $16, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX2-NEXT:    callq __truncsfbf2 at PLT
-; AVX2-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX2-NEXT:    shll $16, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX2-NEXT:    callq __truncsfbf2 at PLT
-; AVX2-NEXT:    popq %rax
-; AVX2-NEXT:    .cfi_def_cfa_offset 8
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: complex_canonicalize_fmul_bfloat:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT:    vpextrw $0, %xmm1, %ecx
-; AVX512F-NEXT:    shll $16, %ecx
-; AVX512F-NEXT:    vmovd %ecx, %xmm1
-; AVX512F-NEXT:    vmovd %xmm1, (%rsp) # 4-byte Folded Spill
-; AVX512F-NEXT:    shll $16, %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm0
-; AVX512F-NEXT:    vsubss %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    callq __truncsfbf2 at PLT
-; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT:    shll $16, %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm0
-; AVX512F-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; AVX512F-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX512F-NEXT:    callq __truncsfbf2 at PLT
-; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT:    shll $16, %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm0
-; AVX512F-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX512F-NEXT:    callq __truncsfbf2 at PLT
-; AVX512F-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT:    shll $16, %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm0
-; AVX512F-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX512F-NEXT:    callq __truncsfbf2 at PLT
-; AVX512F-NEXT:    popq %rax
-; AVX512F-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: complex_canonicalize_fmul_bfloat:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    pushq %rax
-; AVX512BW-NEXT:    .cfi_def_cfa_offset 16
-; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrw $0, %xmm1, %ecx
-; AVX512BW-NEXT:    shll $16, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm1
-; AVX512BW-NEXT:    vmovd %xmm1, (%rsp) # 4-byte Folded Spill
-; AVX512BW-NEXT:    shll $16, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm0
-; AVX512BW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    callq __truncsfbf2 at PLT
-; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512BW-NEXT:    shll $16, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm0
-; AVX512BW-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; AVX512BW-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX512BW-NEXT:    callq __truncsfbf2 at PLT
-; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512BW-NEXT:    shll $16, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm0
-; AVX512BW-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX512BW-NEXT:    callq __truncsfbf2 at PLT
-; AVX512BW-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX512BW-NEXT:    shll $16, %eax
-; AVX512BW-NEXT:    vmovd %eax, %xmm0
-; AVX512BW-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
-; AVX512BW-NEXT:    callq __truncsfbf2 at PLT
-; AVX512BW-NEXT:    popq %rax
-; AVX512BW-NEXT:    .cfi_def_cfa_offset 8
-; AVX512BW-NEXT:    retq
-entry:
-
-  %sub1 = fsub bfloat %a, %b
-  %add = fadd bfloat %sub1, %b
-  %sub2 = fsub bfloat %add, %sub1
-  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %sub2)
-  %result = fsub bfloat %canonicalized, %b
-  ret bfloat %result
-}
-
 define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
 ; SSE-LABEL: canonicalize_fp64:
 ; SSE:       # %bb.0: # %start
@@ -565,6 +401,7 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
 ; SSE-NEXT:    maxsd %xmm0, %xmm1
 ; SSE-NEXT:    andnpd %xmm1, %xmm2
 ; SSE-NEXT:    orpd %xmm3, %xmm2
+; SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE-NEXT:    movapd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
@@ -577,6 +414,7 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
 ; SSE2-NEXT:    maxsd %xmm0, %xmm1
 ; SSE2-NEXT:    andnpd %xmm1, %xmm2
 ; SSE2-NEXT:    orpd %xmm3, %xmm2
+; SSE2-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT:    movapd %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -585,6 +423,7 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
 ; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: canonicalize_fp64:
@@ -592,6 +431,7 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
 ; AVX2-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
 ; AVX2-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: canonicalize_fp64:
@@ -599,7 +439,7 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
 ; AVX512F-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
 ; AVX512F-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512F-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT:    vmovapd %xmm2, %xmm0
+; AVX512F-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: canonicalize_fp64:
@@ -607,7 +447,7 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
 ; AVX512BW-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
 ; AVX512BW-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
 ; AVX512BW-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT:    vmovapd %xmm2, %xmm0
+; AVX512BW-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
 ; AVX512BW-NEXT:    retq
 start:
 
@@ -629,6 +469,7 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
 ; SSE-NEXT:    maxss %xmm0, %xmm1
 ; SSE-NEXT:    andnps %xmm1, %xmm2
 ; SSE-NEXT:    orps %xmm3, %xmm2
+; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE-NEXT:    movaps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
@@ -641,6 +482,7 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
 ; SSE2-NEXT:    maxss %xmm0, %xmm1
 ; SSE2-NEXT:    andnps %xmm1, %xmm2
 ; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -649,6 +491,7 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
 ; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: canonicalize_fp32:
@@ -656,6 +499,7 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
 ; AVX2-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
 ; AVX2-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: canonicalize_fp32:
@@ -663,7 +507,7 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
 ; AVX512F-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
 ; AVX512F-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512F-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512F-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: canonicalize_fp32:
@@ -671,7 +515,7 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
 ; AVX512BW-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
 ; AVX512BW-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512BW-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT:    vmovaps %xmm2, %xmm0
+; AVX512BW-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
 ; AVX512BW-NEXT:    retq
 start:
 
@@ -962,39 +806,48 @@ define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
   ret void
 }
 
-define void @v_test_canonicalize__bfloat(bfloat addrspace(1)* %out) {
-; SSE-LABEL: v_test_canonicalize__bfloat:
-; SSE:       # %bb.0: # %entry
+define void @canonicalize_undef(double addrspace(1)* %out) {
+; SSE-LABEL: canonicalize_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; SSE-NEXT:    movq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
-; SSE2-LABEL: v_test_canonicalize__bfloat:
-; SSE2:       # %bb.0: # %entry
+; SSE2-LABEL: canonicalize_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; SSE2-NEXT:    movq %rax, (%rdi)
 ; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: v_test_canonicalize__bfloat:
-; AVX:       # %bb.0: # %entry
+; AVX-LABEL: canonicalize_undef:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; AVX-NEXT:    movq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
-; AVX2-LABEL: v_test_canonicalize__bfloat:
-; AVX2:       # %bb.0: # %entry
+; AVX2-LABEL: canonicalize_undef:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; AVX2-NEXT:    movq %rax, (%rdi)
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: v_test_canonicalize__bfloat:
-; AVX512F:       # %bb.0: # %entry
+; AVX512F-LABEL: canonicalize_undef:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; AVX512F-NEXT:    movq %rax, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
-; AVX512BW-LABEL: v_test_canonicalize__bfloat:
-; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-LABEL: canonicalize_undef:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; AVX512BW-NEXT:    movq %rax, (%rdi)
 ; AVX512BW-NEXT:    retq
-entry:
-  %val = load bfloat, bfloat addrspace(1)* %out
-  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val)
-  store bfloat %canonicalized, bfloat addrspace(1)* %out
+  %canonicalized = call double @llvm.canonicalize.f64(double undef)
+  store double %canonicalized, double addrspace(1)* %out
   ret void
 }
 
 declare double @llvm.canonicalize.f64(double)
 declare float @llvm.canonicalize.f32(float)
-declare bfloat @llvm.canonicalize.bf16(bfloat)
 declare x86_fp80 @llvm.canonicalize.f80(x86_fp80)
 declare half @llvm.canonicalize.f16(half)



More information about the llvm-commits mailing list