[llvm] [AArch64][GlobalISel] More FCmp legalization. (PR #78734)

David Green via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 19 07:48:53 PST 2024


https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/78734

This fills out the fcmp handling to be more like the other instructions, adding better support for fp16 and some larger vectors.

Select of f16 values is still not handled optimally in places as the select is only legal for s32 values, not s16. This would be correct for integer but not necessarily for fp. It is as if we need to do legalization -> regbankselect -> extra legalization -> selection.

There is some fallback due to some missing handling for shifts, which I think Chuong is currently looking at.

>From c89d6d5e5e140f325cb9760a6b69bb842cb3dd67 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 19 Jan 2024 15:40:27 +0000
Subject: [PATCH] [AArch64][GlobalISel] More FCmp legalization.

This fills out the fcmp handling to be more like the other instructions, adding
better support for fp16 and some larger vectors.

Select of f16 values is still not handled optimally in places as the select is
only legal for s32 values, not s16. This would be correct for integer but not
necessarily for fp. It is as if we need to do legalization -> regbankselect ->
extra legalization -> selection.

There is some fallback due to some missing handling for shifts, which I think
Chuong is currently looking at.
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  10 +
 .../GISel/AArch64InstructionSelector.cpp      |  16 +-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  33 +-
 .../AArch64/GlobalISel/legalize-fcmp.mir      |  83 +--
 llvm/test/CodeGen/AArch64/fcmp.ll             | 682 +++++-------------
 5 files changed, 219 insertions(+), 605 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index a868860f343ba73..0f7c765b3dcc563 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1556,6 +1556,16 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     MI.eraseFromParent();
     return Legalized;
   }
+  case TargetOpcode::G_FCMP:
+    Observer.changingInstr(MI);
+    if (TypeIdx == 0)
+      narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
+    else {
+      return UnableToLegalize;
+    }
+    Observer.changedInstr(MI);
+    return Legalized;
+
   case TargetOpcode::G_SEXT_INREG: {
     if (TypeIdx != 0)
       return UnableToLegalize;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 8344e79f78e1eb6..48cfb60210d965d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -4600,8 +4600,7 @@ MachineInstr *AArch64InstructionSelector::emitFPCompare(
   if (Ty.isVector())
     return nullptr;
   unsigned OpSize = Ty.getSizeInBits();
-  if (OpSize != 32 && OpSize != 64)
-    return nullptr;
+  assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
 
   // If this is a compare against +0.0, then we don't have
   // to explicitly materialize a constant.
@@ -4620,9 +4619,11 @@ MachineInstr *AArch64InstructionSelector::emitFPCompare(
       std::swap(LHS, RHS);
     }
   }
-  unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
-                              {AArch64::FCMPSri, AArch64::FCMPDri}};
-  unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
+  unsigned CmpOpcTbl[2][3] = {
+      {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
+      {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
+  unsigned CmpOpc =
+      CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
 
   // Partially build the compare. Decide if we need to add a use for the
   // third operand based off whether or not we're comparing against 0.0.
@@ -4889,18 +4890,21 @@ MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
   // TODO: emit CMN as an optimization.
   auto &MRI = *MIB.getMRI();
   LLT OpTy = MRI.getType(LHS);
-  assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
   unsigned CCmpOpc;
   std::optional<ValueAndVReg> C;
   if (CmpInst::isIntPredicate(CC)) {
+    assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
     C = getIConstantVRegValWithLookThrough(RHS, MRI);
     if (C && C->Value.ult(32))
       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
     else
       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
   } else {
+    assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
+           OpTy.getSizeInBits() == 64);
     switch (OpTy.getSizeInBits()) {
     case 16:
+      assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
       CCmpOpc = AArch64::FCCMPHrr;
       break;
     case 32:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index b561cb12c93a1c3..114a2b03e8d52c7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -532,32 +532,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(0, v2s64, v2s64);
 
   getActionDefinitionsBuilder(G_FCMP)
-      // If we don't have full FP16 support, then scalarize the elements of
-      // vectors containing fp16 types.
-      .fewerElementsIf(
-          [=](const LegalityQuery &Query) {
-            const auto &Ty = Query.Types[0];
-            return Ty.isVector() && Ty.getElementType() == s16 && !HasFP16;
-          },
-          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
-      // If we don't have full FP16 support, then widen s16 to s32 if we
-      // encounter it.
-      .widenScalarIf(
-          [=](const LegalityQuery &Query) {
-            return Query.Types[0] == s16 && !HasFP16;
-          },
-          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
-      .legalFor({{s16, s16},
+      .legalFor({{s32, MinFPScalar},
                  {s32, s32},
                  {s32, s64},
                  {v4s32, v4s32},
                  {v2s32, v2s32},
-                 {v2s64, v2s64},
-                 {v4s16, v4s16},
-                 {v8s16, v8s16}})
+                 {v2s64, v2s64}})
+      .legalIf([=](const LegalityQuery &Query) {
+        const auto &Ty = Query.Types[0];
+        return (Ty == v8s16 || Ty == v4s16) && HasFP16;
+      })
       .widenScalarOrEltToNextPow2(1)
-      .clampScalar(1, s32, s64)
       .clampScalar(0, s32, s32)
+      .clampScalarOrElt(1, MinFPScalar, s64)
       .minScalarEltSameAsIf(
           [=](const LegalityQuery &Query) {
             const LLT &Ty = Query.Types[0];
@@ -566,8 +553,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                    Ty.getElementType() != SrcTy.getElementType();
           },
           0, 1)
-      .clampNumElements(0, v2s32, v4s32)
-      .clampMaxNumElements(1, s64, 2);
+      .clampNumElements(1, v4s16, v8s16)
+      .clampNumElements(1, v2s32, v4s32)
+      .clampMaxNumElements(1, s64, 2)
+      .moreElementsToNextPow2(1);
 
   // Extensions
   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
index 355cf193272e654..b7c86f235267e6d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
@@ -6,13 +6,13 @@ body:             |
   bb.0.entry:
     ; CHECK-LABEL: name: test_icmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(oge), [[COPY]](s64), [[COPY1]]
-    ; CHECK: $w0 = COPY [[FCMP]](s32)
-    ; CHECK: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(uno), [[TRUNC]](s32), [[TRUNC1]]
-    ; CHECK: $w0 = COPY [[FCMP1]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(oge), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: $w0 = COPY [[FCMP]](s32)
+    ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(uno), [[TRUNC]](s32), [[TRUNC1]]
+    ; CHECK-NEXT: $w0 = COPY [[FCMP1]](s32)
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x0
     %2:_(s32) = G_TRUNC %0(s64)
@@ -36,41 +36,17 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %lhs:_(<8 x s16>) = COPY $q0
     ; CHECK-NEXT: %rhs:_(<8 x s16>) = COPY $q1
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %lhs(<8 x s16>)
-    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %rhs(<8 x s16>)
-    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[UV]](s16)
-    ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[UV8]](s16)
-    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT]](s32), [[FPEXT1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP]](s32)
-    ; CHECK-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[UV1]](s16)
-    ; CHECK-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[UV9]](s16)
-    ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT2]](s32), [[FPEXT3]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP1]](s32)
-    ; CHECK-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[UV2]](s16)
-    ; CHECK-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[UV10]](s16)
-    ; CHECK-NEXT: [[FCMP2:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT4]](s32), [[FPEXT5]]
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP2]](s32)
-    ; CHECK-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[UV3]](s16)
-    ; CHECK-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[UV11]](s16)
-    ; CHECK-NEXT: [[FCMP3:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT6]](s32), [[FPEXT7]]
-    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP3]](s32)
-    ; CHECK-NEXT: [[FPEXT8:%[0-9]+]]:_(s32) = G_FPEXT [[UV4]](s16)
-    ; CHECK-NEXT: [[FPEXT9:%[0-9]+]]:_(s32) = G_FPEXT [[UV12]](s16)
-    ; CHECK-NEXT: [[FCMP4:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT8]](s32), [[FPEXT9]]
-    ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP4]](s32)
-    ; CHECK-NEXT: [[FPEXT10:%[0-9]+]]:_(s32) = G_FPEXT [[UV5]](s16)
-    ; CHECK-NEXT: [[FPEXT11:%[0-9]+]]:_(s32) = G_FPEXT [[UV13]](s16)
-    ; CHECK-NEXT: [[FCMP5:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT10]](s32), [[FPEXT11]]
-    ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP5]](s32)
-    ; CHECK-NEXT: [[FPEXT12:%[0-9]+]]:_(s32) = G_FPEXT [[UV6]](s16)
-    ; CHECK-NEXT: [[FPEXT13:%[0-9]+]]:_(s32) = G_FPEXT [[UV14]](s16)
-    ; CHECK-NEXT: [[FCMP6:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT12]](s32), [[FPEXT13]]
-    ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP6]](s32)
-    ; CHECK-NEXT: [[FPEXT14:%[0-9]+]]:_(s32) = G_FPEXT [[UV7]](s16)
-    ; CHECK-NEXT: [[FPEXT15:%[0-9]+]]:_(s32) = G_FPEXT [[UV15]](s16)
-    ; CHECK-NEXT: [[FCMP7:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT14]](s32), [[FPEXT15]]
-    ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP7]](s32)
-    ; CHECK-NEXT: %fcmp:_(<8 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s16>), [[UV1:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES %lhs(<8 x s16>)
+    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV]](<4 x s16>)
+    ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV1]](<4 x s16>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<4 x s16>), [[UV3:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES %rhs(<8 x s16>)
+    ; CHECK-NEXT: [[FPEXT2:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV2]](<4 x s16>)
+    ; CHECK-NEXT: [[FPEXT3:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV3]](<4 x s16>)
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(<4 x s32>) = G_FCMP floatpred(oeq), [[FPEXT]](<4 x s32>), [[FPEXT2]]
+    ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(<4 x s32>) = G_FCMP floatpred(oeq), [[FPEXT1]](<4 x s32>), [[FPEXT3]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[FCMP]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[FCMP1]](<4 x s32>)
+    ; CHECK-NEXT: %fcmp:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
     ; CHECK-NEXT: $q0 = COPY %fcmp(<8 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %lhs:_(<8 x s16>) = COPY $q0
@@ -93,25 +69,10 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %lhs:_(<4 x s16>) = COPY $d0
     ; CHECK-NEXT: %rhs:_(<4 x s16>) = COPY $d1
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %lhs(<4 x s16>)
-    ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %rhs(<4 x s16>)
-    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[UV]](s16)
-    ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[UV4]](s16)
-    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT]](s32), [[FPEXT1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP]](s32)
-    ; CHECK-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[UV1]](s16)
-    ; CHECK-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[UV5]](s16)
-    ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT2]](s32), [[FPEXT3]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP1]](s32)
-    ; CHECK-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[UV2]](s16)
-    ; CHECK-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[UV6]](s16)
-    ; CHECK-NEXT: [[FCMP2:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT4]](s32), [[FPEXT5]]
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP2]](s32)
-    ; CHECK-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[UV3]](s16)
-    ; CHECK-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[UV7]](s16)
-    ; CHECK-NEXT: [[FCMP3:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT6]](s32), [[FPEXT7]]
-    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP3]](s32)
-    ; CHECK-NEXT: %fcmp:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(<4 x s32>) = G_FPEXT %lhs(<4 x s16>)
+    ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(<4 x s32>) = G_FPEXT %rhs(<4 x s16>)
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(<4 x s32>) = G_FCMP floatpred(oeq), [[FPEXT]](<4 x s32>), [[FPEXT1]]
+    ; CHECK-NEXT: %fcmp:_(<4 x s16>) = G_TRUNC [[FCMP]](<4 x s32>)
     ; CHECK-NEXT: $d0 = COPY %fcmp(<4 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $d0
     %lhs:_(<4 x s16>) = COPY $d0
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 82e29d0f8a194f1..21366f5e37c6ffd 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -4,14 +4,11 @@
 ; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
 ; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
-; CHECK-GI:       warning: Instruction selection used fallback path for v3f64_double
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3f64_i32
+; CHECK-GI:       warning: Instruction selection used fallback path for v3f64_i32
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3f32_float
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3f32_i32
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v7f16_half
-; CHECK-GI-FP16-NEXT:  warning: Instruction selection used fallback path for v16f16_half
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v7f16_i32
-; CHECK-GI-FP16-NEXT:  warning: Instruction selection used fallback path for v16f16_i32
 
 define double @f64_double(double %a, double %b, double %d, double %e) {
 ; CHECK-LABEL: f64_double:
@@ -79,19 +76,31 @@ define half @f16_half(half %a, half %b, half %d, half %e) {
 ; CHECK-SD-FP16-NEXT:    fcsel h0, h2, h3, mi
 ; CHECK-SD-FP16-NEXT:    ret
 ;
-; CHECK-GI-LABEL: f16_half:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    fcvt s1, h1
-; CHECK-GI-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-GI-NEXT:    // kill: def $h3 killed $h3 def $s3
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    fmov w9, s3
-; CHECK-GI-NEXT:    fcmp s0, s1
-; CHECK-GI-NEXT:    csel w8, w8, w9, mi
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $s0
-; CHECK-GI-NEXT:    ret
+; CHECK-GI-NOFP16-LABEL: f16_half:
+; CHECK-GI-NOFP16:       // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-GI-NOFP16-NEXT:    // kill: def $h3 killed $h3 def $s3
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s2
+; CHECK-GI-NOFP16-NEXT:    fmov w9, s3
+; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT:    csel w8, w8, w9, mi
+; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
+; CHECK-GI-NOFP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NOFP16-NEXT:    ret
+;
+; CHECK-GI-FP16-LABEL: f16_half:
+; CHECK-GI-FP16:       // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-GI-FP16-NEXT:    // kill: def $h3 killed $h3 def $s3
+; CHECK-GI-FP16-NEXT:    fcmp h0, h1
+; CHECK-GI-FP16-NEXT:    fmov w8, s2
+; CHECK-GI-FP16-NEXT:    fmov w9, s3
+; CHECK-GI-FP16-NEXT:    csel w8, w8, w9, mi
+; CHECK-GI-FP16-NEXT:    fmov s0, w8
+; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt half %a, %b
   %s = select i1 %c, half %d, half %e
@@ -113,13 +122,19 @@ define i32 @f16_i32(half %a, half %b, i32 %d, i32 %e) {
 ; CHECK-SD-FP16-NEXT:    csel w0, w0, w1, mi
 ; CHECK-SD-FP16-NEXT:    ret
 ;
-; CHECK-GI-LABEL: f16_i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    fcvt s1, h1
-; CHECK-GI-NEXT:    fcmp s0, s1
-; CHECK-GI-NEXT:    csel w0, w0, w1, mi
-; CHECK-GI-NEXT:    ret
+; CHECK-GI-NOFP16-LABEL: f16_i32:
+; CHECK-GI-NOFP16:       // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT:    csel w0, w0, w1, mi
+; CHECK-GI-NOFP16-NEXT:    ret
+;
+; CHECK-GI-FP16-LABEL: f16_i32:
+; CHECK-GI-FP16:       // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT:    fcmp h0, h1
+; CHECK-GI-FP16-NEXT:    csel w0, w0, w1, mi
+; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt half %a, %b
   %s = select i1 %c, i32 %d, i32 %e
@@ -139,32 +154,60 @@ entry:
 }
 
 define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double> %d, <3 x double> %e) {
-; CHECK-LABEL: v3f64_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d6 killed $d6 def $q6
-; CHECK-NEXT:    // kill: def $d7 killed $d7 def $q7
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    // kill: def $d5 killed $d5 def $q5
-; CHECK-NEXT:    ldr d16, [sp, #24]
-; CHECK-NEXT:    ldr d17, [sp]
-; CHECK-NEXT:    mov v3.d[1], v4.d[0]
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-NEXT:    ldp d1, d4, [sp, #8]
-; CHECK-NEXT:    fcmgt v2.2d, v5.2d, v2.2d
-; CHECK-NEXT:    mov v1.d[1], v4.d[0]
-; CHECK-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-NEXT:    bsl v2.16b, v17.16b, v16.16b
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    bsl v0.16b, v6.16b, v1.16b
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3f64_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
+; CHECK-SD-NEXT:    ldr d16, [sp, #24]
+; CHECK-SD-NEXT:    ldr d17, [sp]
+; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
+; CHECK-SD-NEXT:    ldp d1, d4, [sp, #8]
+; CHECK-SD-NEXT:    fcmgt v2.2d, v5.2d, v2.2d
+; CHECK-SD-NEXT:    mov v1.d[1], v4.d[0]
+; CHECK-SD-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-SD-NEXT:    bsl v2.16b, v17.16b, v16.16b
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v1.16b
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3f64_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-GI-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-GI-NEXT:    fcmp d2, d5
+; CHECK-GI-NEXT:    ldr x8, [sp]
+; CHECK-GI-NEXT:    ldr x10, [sp, #24]
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    mov v6.d[1], v7.d[0]
+; CHECK-GI-NEXT:    ldp d1, d4, [sp, #8]
+; CHECK-GI-NEXT:    cset w9, mi
+; CHECK-GI-NEXT:    sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    mov v1.d[1], v4.d[0]
+; CHECK-GI-NEXT:    and x8, x8, x9
+; CHECK-GI-NEXT:    bic x9, x10, x9
+; CHECK-GI-NEXT:    orr x8, x8, x9
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    bsl v0.16b, v6.16b, v1.16b
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x double> %a, %b
   %s = select <3 x i1> %c, <3 x double> %d, <3 x double> %e
@@ -524,39 +567,10 @@ define <4 x half> @v4f16_half(<4 x half> %a, <4 x half> %b, <4 x half> %d, <4 x
 ;
 ; CHECK-GI-NOFP16-LABEL: v4f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s4, s5
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w9
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    shl v0.4h, v4.4h, #15
-; CHECK-GI-NOFP16-NEXT:    sshr v0.4h, v0.4h, #15
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -640,68 +654,14 @@ define <8 x half> @v8f16_half(<8 x half> %a, <8 x half> %b, <8 x half> %d, <8 x
 ;
 ; CHECK-GI-NOFP16-LABEL: v8f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h17
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s4, s5
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s7, s16
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w9
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v16.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s6, w8
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s5, s17
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w8
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-GI-NOFP16-NEXT:    fcmp s5, s16
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[4], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[5], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[6], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    shl v0.8h, v4.8h, #15
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v4.4s, v5.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v4.8h, v0.8h
+; CHECK-GI-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.8h, v0.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -843,130 +803,22 @@ define <16 x half> @v16f16_half(<16 x half> %a, <16 x half> %b, <16 x half> %d,
 ;
 ; CHECK-GI-NOFP16-LABEL: v16f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h2
-; CHECK-GI-NOFP16-NEXT:    mov h20, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-GI-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    cset w15, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s20, s21
-; CHECK-GI-NOFP16-NEXT:    mov h20, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w13, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-GI-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w12, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h1
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h3
-; CHECK-GI-NOFP16-NEXT:    cset w11, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s20, s21
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    mov h20, v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w10, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h2
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    cset w16, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-GI-NOFP16-NEXT:    fmov s18, w16
-; CHECK-GI-NOFP16-NEXT:    cset w17, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s19
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h20
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w14
-; CHECK-GI-NOFP16-NEXT:    fmov s20, w15
-; CHECK-GI-NOFP16-NEXT:    fmov s21, w17
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v20.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[1], v21.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h20, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcmp s17, s19
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w13
-; CHECK-GI-NOFP16-NEXT:    fmov s19, w14
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w13, mi
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[2], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v3.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcmp s2, s16
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h20
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h21
-; CHECK-GI-NOFP16-NEXT:    fmov s20, w12
-; CHECK-GI-NOFP16-NEXT:    fmov s21, w13
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w12, mi
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v20.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[3], v21.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcmp s2, s16
-; CHECK-GI-NOFP16-NEXT:    fmov s2, w11
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w12
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w11, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s17, s19
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[4], v16.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h3
-; CHECK-GI-NOFP16-NEXT:    fmov s3, w9
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w11
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[5], v16.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcmp s1, s2
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
-; CHECK-GI-NOFP16-NEXT:    fmov s2, w9
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w10
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[6], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s2, w8
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[7], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v17.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v18.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v19.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v16.4s, v18.4s, v16.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v19.4s, v17.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v3.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v16.8h, v0.8h
+; CHECK-GI-NOFP16-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
 ; CHECK-GI-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
-; CHECK-GI-NOFP16-NEXT:    shl v1.8h, v18.8h, #15
+; CHECK-GI-NOFP16-NEXT:    shl v1.8h, v1.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.8h, v0.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    sshr v1.8h, v1.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v4.16b, v6.16b
@@ -975,10 +827,10 @@ define <16 x half> @v16f16_half(<16 x half> %a, <16 x half> %b, <16 x half> %d,
 ;
 ; CHECK-GI-FP16-LABEL: v16f16_half:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v3.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v2.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    bsl v1.16b, v5.16b, v7.16b
+; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v3.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    bsl v0.16b, v4.16b, v6.16b
+; CHECK-GI-FP16-NEXT:    bsl v1.16b, v5.16b, v7.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt <16 x half> %a, %b
@@ -1256,40 +1108,9 @@ define <4 x i32> @v4f16_i32(<4 x half> %a, <4 x half> %b, <4 x i32> %d, <4 x i32
 ;
 ; CHECK-GI-NOFP16-LABEL: v4f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s4, s5
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w9
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    ushll v0.4s, v4.4h, #0
-; CHECK-GI-NOFP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -1383,74 +1204,15 @@ define <8 x i32> @v8f16_i32(<8 x half> %a, <8 x half> %b, <8 x i32> %d, <8 x i32
 ;
 ; CHECK-GI-NOFP16-LABEL: v8f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h1
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-GI-NOFP16-NEXT:    cset w10, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w11, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    cset w12, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w12
-; CHECK-GI-NOFP16-NEXT:    cset w13, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s6, w8
-; CHECK-GI-NOFP16-NEXT:    fmov s7, w9
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w13
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w10
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w11
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[3], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    ushll v0.4s, v6.4h, #0
-; CHECK-GI-NOFP16-NEXT:    ushll v1.4s, v16.4h, #0
-; CHECK-GI-NOFP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    shl v1.4s, v1.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v1.4s, v1.4s, #31
-; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v2.16b, v4.16b
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v6.4s, v7.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.16b, v6.16b
 ; CHECK-GI-NOFP16-NEXT:    bsl v1.16b, v3.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v2.16b, v4.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: v8f16_i32:
@@ -1619,143 +1381,24 @@ define <16 x i32> @v16f16_i32(<16 x half> %a, <16 x half> %b, <16 x i32> %d, <16
 ;
 ; CHECK-GI-NOFP16-LABEL: v16f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h2
-; CHECK-GI-NOFP16-NEXT:    mov h20, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-GI-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    cset w15, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s20, s21
-; CHECK-GI-NOFP16-NEXT:    mov h20, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-GI-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h1
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h3
-; CHECK-GI-NOFP16-NEXT:    cset w16, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s20, s21
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    cset w17, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    cset w11, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w10, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    cset w18, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    cset w0, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w13, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v3.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w12, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-GI-NOFP16-NEXT:    fmov s18, w15
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    fmov s19, w17
-; CHECK-GI-NOFP16-NEXT:    cset w1, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-GI-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-GI-NOFP16-NEXT:    fmov s2, w16
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w14
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w18
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w0
-; CHECK-GI-NOFP16-NEXT:    fmov s19, w14
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v18.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s18, w1
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s1, s3
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w11
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w9
-; CHECK-GI-NOFP16-NEXT:    fmov s3, w14
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[1], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w13
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fmov s3, w8
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w10
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[3], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w12
-; CHECK-GI-NOFP16-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[3], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    ushll v3.4s, v18.4h, #0
-; CHECK-GI-NOFP16-NEXT:    ushll v1.4s, v2.4h, #0
-; CHECK-GI-NOFP16-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v17.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v18.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v19.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v17.4s, v19.4s, v17.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v16.4s, v18.4s, v16.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v3.4s, v3.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    ldp q0, q1, [sp]
 ; CHECK-GI-NOFP16-NEXT:    ldp q18, q19, [sp, #32]
-; CHECK-GI-NOFP16-NEXT:    shl v3.4s, v3.4s, #31
-; CHECK-GI-NOFP16-NEXT:    ushll v2.4s, v16.4h, #0
-; CHECK-GI-NOFP16-NEXT:    shl v1.4s, v1.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    ldp q16, q17, [sp]
-; CHECK-GI-NOFP16-NEXT:    sshr v3.4s, v3.4s, #31
-; CHECK-GI-NOFP16-NEXT:    shl v2.4s, v2.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v1.4s, v1.4s, #31
-; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v4.16b, v16.16b
+; CHECK-GI-NOFP16-NEXT:    bit v1.16b, v5.16b, v2.16b
+; CHECK-GI-NOFP16-NEXT:    mov v2.16b, v17.16b
+; CHECK-GI-NOFP16-NEXT:    bit v0.16b, v4.16b, v16.16b
 ; CHECK-GI-NOFP16-NEXT:    bsl v3.16b, v7.16b, v19.16b
-; CHECK-GI-NOFP16-NEXT:    sshr v2.4s, v2.4s, #31
-; CHECK-GI-NOFP16-NEXT:    bsl v1.16b, v5.16b, v17.16b
 ; CHECK-GI-NOFP16-NEXT:    bsl v2.16b, v6.16b, v18.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -1763,19 +1406,26 @@ define <16 x i32> @v16f16_i32(<16 x half> %a, <16 x half> %b, <16 x i32> %d, <16
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v2.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v3.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    ldp q2, q20, [sp]
 ; CHECK-GI-FP16-NEXT:    ldp q18, q19, [sp, #32]
-; CHECK-GI-FP16-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-FP16-NEXT:    sshll v16.4s, v1.4h, #0
-; CHECK-GI-FP16-NEXT:    sshll2 v17.4s, v1.8h, #0
-; CHECK-GI-FP16-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-GI-FP16-NEXT:    mov v0.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-FP16-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-FP16-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-FP16-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-FP16-NEXT:    shl v2.4s, v2.4s, #31
+; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    shl v3.4s, v3.4s, #31
+; CHECK-GI-FP16-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-GI-FP16-NEXT:    sshr v2.4s, v2.4s, #31
+; CHECK-GI-FP16-NEXT:    sshr v16.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    sshr v3.4s, v3.4s, #31
+; CHECK-GI-FP16-NEXT:    sshr v17.4s, v1.4s, #31
+; CHECK-GI-FP16-NEXT:    ldp q0, q1, [sp]
+; CHECK-GI-FP16-NEXT:    bit v0.16b, v4.16b, v2.16b
+; CHECK-GI-FP16-NEXT:    mov v2.16b, v3.16b
 ; CHECK-GI-FP16-NEXT:    mov v3.16b, v17.16b
-; CHECK-GI-FP16-NEXT:    bsl v1.16b, v5.16b, v20.16b
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v4.16b, v2.16b
-; CHECK-GI-FP16-NEXT:    mov v2.16b, v16.16b
-; CHECK-GI-FP16-NEXT:    bsl v3.16b, v7.16b, v19.16b
+; CHECK-GI-FP16-NEXT:    bit v1.16b, v5.16b, v16.16b
 ; CHECK-GI-FP16-NEXT:    bsl v2.16b, v6.16b, v18.16b
+; CHECK-GI-FP16-NEXT:    bsl v3.16b, v7.16b, v19.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt <16 x half> %a, %b



More information about the llvm-commits mailing list