[llvm] [AArch64] Legalize fp128 types as libcalls for G_FCMP (PR #98452)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 22 08:11:09 PDT 2024


https://github.com/Him188 updated https://github.com/llvm/llvm-project/pull/98452

>From a294242256e93cbb863d59503aede635958aca77 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 3 Jul 2024 17:45:07 +0100
Subject: [PATCH 1/8] [AArch64] Legalize fp128 scalars for G_FCMP

- Generate libcall for supported predicates.
- Generate unsupported predicates as combinations of supported predicates.
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |   3 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 153 ++++-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   5 +-
 llvm/test/CodeGen/AArch64/arm64-ccmp.ll       |  70 ++-
 llvm/test/CodeGen/AArch64/fcmp-fp128.ll       | 560 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/fcmp.ll             | 353 ++++++-----
 6 files changed, 977 insertions(+), 167 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/fcmp-fp128.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 284f434fbb9b0..b8a0d94eeda2e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -280,6 +280,9 @@ class LegalizerHelper {
   LegalizeResult createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                          MachineInstr &MI,
                                          LostDebugLocObserver &LocObserver);
+  LegalizeResult createFCMPLibcall(MachineIRBuilder &MIRBuilder,
+                                   MachineInstr &MI,
+                                   LostDebugLocObserver &LocObserver);
 
   MachineInstrBuilder
   getNeutralElementForVecReduce(unsigned Opcode, MachineIRBuilder &MIRBuilder,
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 86de1f3be9047..04a266daa9855 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -723,8 +723,7 @@ static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
   if (MemType.isVector())
     return RTLIB::UNKNOWN_LIBCALL;
 
-#define LCALLS(A, B)                                                           \
-  { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
+#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
 #define LCALL5(A)                                                              \
   LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
   switch (Opc) {
@@ -980,6 +979,150 @@ LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                        LocObserver, nullptr);
 }
 
+/// Returns the corresponding libcall for the given Pred and
+/// the ICMP predicate that should be generated to compare with #0
+/// after the libcall.
+static std::pair<RTLIB::Libcall, CmpInst::Predicate>
+getFCMPLibcallDesc(const CmpInst::Predicate Pred) {
+
+  switch (Pred) {
+  case CmpInst::FCMP_OEQ:
+    return {RTLIB::OEQ_F128, CmpInst::ICMP_EQ};
+  case CmpInst::FCMP_UNE:
+    return {RTLIB::UNE_F128, CmpInst::ICMP_NE};
+  case CmpInst::FCMP_OGE:
+    return {RTLIB::OGE_F128, CmpInst::ICMP_SGE};
+  case CmpInst::FCMP_OLT:
+    return {RTLIB::OLT_F128, CmpInst::ICMP_SLT};
+  case CmpInst::FCMP_OLE:
+    return {RTLIB::OLE_F128, CmpInst::ICMP_SLE};
+  case CmpInst::FCMP_OGT:
+    return {RTLIB::OGT_F128, CmpInst::ICMP_SGT};
+  case CmpInst::FCMP_UNO:
+    return {RTLIB::UO_F128, CmpInst::ICMP_NE};
+  default:
+    return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
+  }
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
+                                   MachineInstr &MI,
+                                   LostDebugLocObserver &LocObserver) {
+  auto &MF = MIRBuilder.getMF();
+  auto &Ctx = MF.getFunction().getContext();
+
+  LLT OpLLT = MRI.getType(MI.getOperand(2).getReg());
+  if (OpLLT != LLT::scalar(128) ||
+      OpLLT != MRI.getType(MI.getOperand(3).getReg()))
+    return UnableToLegalize;
+
+  Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);
+
+  // Libcall always return i32
+  constexpr LLT I32LLT = LLT::scalar(32);
+  constexpr LLT PredTy = LLT::scalar(1);
+
+  const Register DstReg = MI.getOperand(0).getReg();
+  const Register Op1 = MI.getOperand(2).getReg();
+  const Register Op2 = MI.getOperand(3).getReg();
+  const auto Pred =
+      static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+
+  // Generates a libcall followed by ICMP
+  const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
+                                const CmpInst::Predicate ICmpPred) -> Register {
+    Register Temp = MRI.createGenericVirtualRegister(I32LLT);
+    // Generate libcall, storing result into Temp
+    const auto Status =
+        createLibcall(MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
+                      {{Op1, OpType, 0}, {Op2, OpType, 1}}, LocObserver, &MI);
+    if (!Status)
+      return MCRegister::NoRegister;
+
+    // FCMP libcall always returns an i32, we need to compare it with #0 to get
+    // the final result.
+    const Register Res = MRI.createGenericVirtualRegister(PredTy);
+    MIRBuilder.buildICmp(ICmpPred, Res, Temp,
+                         MIRBuilder.buildConstant(I32LLT, 0));
+    return Res;
+  };
+
+  // Simple case if we have a direct mapping from predicate to libcall
+  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Pred);
+      Libcall != RTLIB::UNKNOWN_LIBCALL &&
+      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
+    if (const auto Res = BuildLibcall(Libcall, ICmpPred)) {
+      MIRBuilder.buildCopy(DstReg, Res);
+      return Legalized;
+    }
+    return UnableToLegalize;
+  }
+
+  // No direct mapping found, should be generated as combination of libcalls.
+
+  switch (Pred) {
+  case CmpInst::FCMP_UEQ: {
+    // FCMP_UEQ: unordered or equal
+    // Convert into (FCMP_OEQ || FCMP_UNO).
+
+    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
+    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred);
+
+    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
+    const auto Uno = BuildLibcall(UnoLibcall, UnoPred);
+
+    MIRBuilder.buildCopy(DstReg, MIRBuilder.buildOr(PredTy, Oeq, Uno));
+    break;
+  }
+  case CmpInst::FCMP_ONE: {
+    // FCMP_ONE: ordered and operands are unequal
+    // Convert into (!FCMP_OEQ && !FCMP_UNO).
+
+    // We inverse the predicate instead of generating a NOT
+    // to save one instruciton.
+    // On AArch64 isel can even select two cmp into a single ccmp.
+    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
+    const auto NotOeq =
+        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred));
+
+    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
+    const auto NotUno =
+        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred));
+
+    if (NotOeq && NotUno)
+      MIRBuilder.buildCopy(DstReg, MIRBuilder.buildAnd(PredTy, NotOeq, NotUno));
+    else
+      return UnableToLegalize;
+
+    break;
+  }
+  case CmpInst::FCMP_ULT:
+  case CmpInst::FCMP_UGE:
+  case CmpInst::FCMP_UGT:
+  case CmpInst::FCMP_ULE:
+  case CmpInst::FCMP_ORD: {
+    // Convert into: !(inverse(Pred))
+    // E.g. FCMP_ULT becomes !FCMP_OGE
+    // This is equivalent to the following, but saves some instructions.
+    //   MIRBuilder.buildNot(
+    //       PredTy,
+    //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
+    //                            Op1, Op2));
+    const auto [InversedLibcall, InversedPred] =
+        getFCMPLibcallDesc(CmpInst::getInversePredicate(Pred));
+    MIRBuilder.buildCopy(
+        DstReg, BuildLibcall(InversedLibcall,
+                             CmpInst::getInversePredicate(InversedPred)));
+    break;
+  }
+  default:
+    return UnableToLegalize;
+  }
+
+  return Legalized;
+}
+
 // The function is used to legalize operations that set default environment
 // state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
 // On most targets supported in glibc FE_DFL_MODE is defined as
@@ -1120,6 +1263,12 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
       return Status;
     break;
   }
+  case TargetOpcode::G_FCMP: {
+    LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
+    if (Status != Legalized)
+      return Status;
+    break;
+  }
   case TargetOpcode::G_FPTOSI:
   case TargetOpcode::G_FPTOUI: {
     // FIXME: Support other types
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index c6eb4d2b3ec78..511e47354cead 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -560,7 +560,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       })
       .widenScalarOrEltToNextPow2(1)
       .clampScalar(0, s32, s32)
-      .clampScalarOrElt(1, MinFPScalar, s64)
+      .clampScalarOrElt(1, MinFPScalar, s128)
       .minScalarEltSameAsIf(
           [=](const LegalityQuery &Query) {
             const LLT &Ty = Query.Types[0];
@@ -572,7 +572,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(1, v4s16, v8s16)
       .clampNumElements(1, v2s32, v4s32)
       .clampMaxNumElements(1, s64, 2)
-      .moreElementsToNextPow2(1);
+      .moreElementsToNextPow2(1)
+      .libcallFor({{s32, s128}});
 
   // Extensions
   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index b6702bba1598c..e951c80e30845 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=CHECK,GISEL
 target triple = "arm64-apple-ios"
 
 define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
@@ -950,29 +950,51 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32
 ; Also verify that we don't try to generate f128 FCCMPs, using RT calls instead.
 
 define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) #0 {
-; CHECK-LABEL: f128_select_and_olt_oge:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    stp x22, x21, [sp, #32] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
-; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    stp q2, q3, [sp] ; 32-byte Folded Spill
-; CHECK-NEXT:    bl ___lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w21, lt
-; CHECK-NEXT:    ldp q0, q1, [sp] ; 32-byte Folded Reload
-; CHECK-NEXT:    bl ___getf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w8, ge
-; CHECK-NEXT:    tst w8, w21
-; CHECK-NEXT:    csel w0, w20, w19, ne
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: f128_select_and_olt_oge:
+; SDISEL:       ; %bb.0:
+; SDISEL-NEXT:    sub sp, sp, #80
+; SDISEL-NEXT:    stp x22, x21, [sp, #32] ; 16-byte Folded Spill
+; SDISEL-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
+; SDISEL-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; SDISEL-NEXT:    mov x19, x1
+; SDISEL-NEXT:    mov x20, x0
+; SDISEL-NEXT:    stp q2, q3, [sp] ; 32-byte Folded Spill
+; SDISEL-NEXT:    bl ___lttf2
+; SDISEL-NEXT:    cmp w0, #0
+; SDISEL-NEXT:    cset w21, lt
+; SDISEL-NEXT:    ldp q0, q1, [sp] ; 32-byte Folded Reload
+; SDISEL-NEXT:    bl ___getf2
+; SDISEL-NEXT:    cmp w0, #0
+; SDISEL-NEXT:    cset w8, ge
+; SDISEL-NEXT:    tst w8, w21
+; SDISEL-NEXT:    csel w0, w20, w19, ne
+; SDISEL-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; SDISEL-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
+; SDISEL-NEXT:    ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
+; SDISEL-NEXT:    add sp, sp, #80
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: f128_select_and_olt_oge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #80
+; GISEL-NEXT:    stp x22, x21, [sp, #32] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp q3, q2, [sp] ; 32-byte Folded Spill
+; GISEL-NEXT:    mov x19, x0
+; GISEL-NEXT:    mov x20, x1
+; GISEL-NEXT:    bl ___lttf2
+; GISEL-NEXT:    mov x21, x0
+; GISEL-NEXT:    ldp q1, q0, [sp] ; 32-byte Folded Reload
+; GISEL-NEXT:    bl ___getf2
+; GISEL-NEXT:    cmp w21, #0
+; GISEL-NEXT:    ccmp w0, #0, #8, lt
+; GISEL-NEXT:    csel w0, w19, w20, ge
+; GISEL-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
+; GISEL-NEXT:    add sp, sp, #80
+; GISEL-NEXT:    ret
   %c0 = fcmp olt fp128 %v0, %v1
   %c1 = fcmp oge fp128 %v2, %v3
   %cr = and i1 %c1, %c0
diff --git a/llvm/test/CodeGen/AArch64/fcmp-fp128.ll b/llvm/test/CodeGen/AArch64/fcmp-fp128.ll
new file mode 100644
index 0000000000000..503cb8c533bab
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fcmp-fp128.ll
@@ -0,0 +1,560 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK-GI
+
+; Checks generated libcalls for fp128 types
+
+define double @oeq(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: oeq:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __eqtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, eq
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: oeq:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __eqtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, eq
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp oeq fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ogt(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ogt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, gt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ogt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, gt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ogt fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @olt(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: olt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: olt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp olt fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ole(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ole:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __letf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, le
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ole:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __letf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, le
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ole fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @one(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: one:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __eqtf2
+; CHECK-SD-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    cset w19, ne
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    cset w8, eq
+; CHECK-SD-NEXT:    tst w8, w19
+; CHECK-SD-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ne
+; CHECK-SD-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: one:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __eqtf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov w19, w0
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ne
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp one fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ord(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ord:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, eq
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ord:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, eq
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ord fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @uno(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: uno:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ne
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uno:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ne
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp uno fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ueq(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ueq:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __eqtf2
+; CHECK-SD-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    mov w19, w0
+; CHECK-SD-NEXT:    bl __unordtf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-SD-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, eq
+; CHECK-SD-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ueq:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __eqtf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov w19, w0
+; CHECK-GI-NEXT:    bl __unordtf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, eq
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ueq fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ugt(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ugt:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __letf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, gt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ugt:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __letf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, gt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ugt fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @uge(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: uge:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ge
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uge:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ge
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp uge fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ult(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ult:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __getf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ult:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __getf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ult fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @ule(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: ule:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __gttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, le
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ule:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __gttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, le
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp ule fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
+define double @une(fp128 %a, fp128 %b, double %d, double %e) {
+; CHECK-SD-LABEL: une:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __netf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, ne
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: une:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __netf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, ne
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = fcmp une fp128 %a, %b
+  %s = select i1 %c, double %d, double %e
+  ret double %s
+}
+
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index a76d0b36fa1aa..6fe96611167da 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -4,37 +4,51 @@
 ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
-
-; CHECK-GI:      warning: Instruction selection used fallback path for f128_fp128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_i128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_double
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_float
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for f128_half
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2f128_fp128
+; CHECK-GI: warning: Instruction selection used fallback path for v2f128_fp128
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_fp128
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2f128_double
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_double
 
-
 define fp128 @f128_fp128(fp128 %a, fp128 %b, fp128 %d, fp128 %e) {
-; CHECK-LABEL: f128_fp128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    stp q2, q3, [sp] // 32-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    b.ge .LBB0_2
-; CHECK-NEXT:  // %bb.1: // %entry
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:  .LBB0_2: // %entry
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_fp128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    stp q2, q3, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    b.ge .LBB0_2
+; CHECK-SD-NEXT:  // %bb.1: // %entry
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:  .LBB0_2: // %entry
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_fp128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    stp q3, q2, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov d0, v2.d[1]
+; CHECK-GI-NEXT:    mov d1, v3.d[1]
+; CHECK-GI-NEXT:    fcsel d2, d2, d3, lt
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    fcsel d1, d0, d1, lt
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, fp128 %d, fp128 %e
@@ -42,37 +56,61 @@ entry:
 }
 
 define i128 @f128_i128(fp128 %a, fp128 %b, i128 %d, i128 %e) {
-; CHECK-LABEL: f128_i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    mov x19, x3
-; CHECK-NEXT:    mov x20, x2
-; CHECK-NEXT:    mov x21, x1
-; CHECK-NEXT:    mov x22, x0
-; CHECK-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    csel x20, x22, x20, lt
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    csel x1, x21, x19, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #80
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov x21, x1
+; CHECK-SD-NEXT:    mov x22, x0
+; CHECK-SD-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    csel x20, x22, x20, lt
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    mov w8, w0
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w8, #0
+; CHECK-SD-NEXT:    csel x1, x21, x19, lt
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #80
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x21, x2
+; CHECK-GI-NEXT:    mov x22, x3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel x0, x19, x21, lt
+; CHECK-GI-NEXT:    csel x1, x20, x22, lt
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, i128 %d, i128 %e
@@ -80,22 +118,39 @@ entry:
 }
 
 define double @f128_double(fp128 %a, fp128 %b, double %d, double %e) {
-; CHECK-LABEL: f128_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    fmov d8, d3
-; CHECK-NEXT:    fmov d9, d2
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    fcsel d0, d9, d8, lt
-; CHECK-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov d8, d3
+; CHECK-SD-NEXT:    fmov d9, d2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov d8, d2
+; CHECK-GI-NEXT:    fmov d9, d3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, double %d, double %e
@@ -103,22 +158,39 @@ entry:
 }
 
 define float @f128_float(fp128 %a, fp128 %b, float %d, float %e) {
-; CHECK-LABEL: f128_float:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    fmov s8, s3
-; CHECK-NEXT:    fmov s9, s2
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    fcsel s0, s9, s8, lt
-; CHECK-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_float:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    .cfi_offset b8, -24
+; CHECK-SD-NEXT:    .cfi_offset b9, -32
+; CHECK-SD-NEXT:    fmov s8, s3
+; CHECK-SD-NEXT:    fmov s9, s2
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fcsel s0, s9, s8, lt
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_float:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov s8, s2
+; CHECK-GI-NEXT:    fmov s9, s3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fcsel s0, s8, s9, lt
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, float %d, float %e
@@ -126,22 +198,39 @@ entry:
 }
 
 define i32 @f128_i32(fp128 %a, fp128 %b, i32 %d, i32 %e) {
-; CHECK-LABEL: f128_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov w19, w1
-; CHECK-NEXT:    mov w20, w0
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    csel w0, w20, w19, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f128_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov w19, w1
+; CHECK-SD-NEXT:    mov w20, w0
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    csel w0, w20, w19, lt
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f128_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    mov w19, w0
+; CHECK-GI-NEXT:    mov w20, w1
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    csel w0, w19, w20, lt
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, i32 %d, i32 %e
@@ -184,40 +273,26 @@ define half @f128_half(fp128 %a, fp128 %b, half %d, half %e) {
 ; CHECK-SD-FP16-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
 ; CHECK-SD-FP16-NEXT:    ret
 ;
-; CHECK-GI-NOFP16-LABEL: f128_half:
-; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-NOFP16-NEXT:    .cfi_offset w30, -16
-; CHECK-GI-NOFP16-NEXT:    .cfi_offset b8, -24
-; CHECK-GI-NOFP16-NEXT:    .cfi_offset b9, -32
-; CHECK-GI-NOFP16-NEXT:    fmov s8, s3
-; CHECK-GI-NOFP16-NEXT:    fmov s9, s2
-; CHECK-GI-NOFP16-NEXT:    bl __lttf2
-; CHECK-GI-NOFP16-NEXT:    cmp w0, #0
-; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    fcsel s0, s9, s8, lt
-; CHECK-GI-NOFP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
-; CHECK-GI-NOFP16-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ret
-;
-; CHECK-GI-FP16-LABEL: f128_half:
-; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -16
-; CHECK-GI-FP16-NEXT:    .cfi_offset b8, -24
-; CHECK-GI-FP16-NEXT:    .cfi_offset b9, -32
-; CHECK-GI-FP16-NEXT:    fmov s8, s3
-; CHECK-GI-FP16-NEXT:    fmov s9, s2
-; CHECK-GI-FP16-NEXT:    bl __lttf2
-; CHECK-GI-FP16-NEXT:    cmp w0, #0
-; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    fcsel h0, h9, h8, lt
-; CHECK-GI-FP16-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ret
+; CHECK-GI-LABEL: f128_half:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov s8, s2
+; CHECK-GI-NEXT:    fmov s9, s3
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    fmov w8, s8
+; CHECK-GI-NEXT:    fmov w9, s9
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    csel w8, w8, w9, lt
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt fp128 %a, %b
   %s = select i1 %c, half %d, half %e

>From 9c702f62c4ad9c461eec81b797105b58312bc98f Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Fri, 12 Jul 2024 14:21:23 +0100
Subject: [PATCH 2/8] Apply suggestions

---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 33 +++++++++----------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 04a266daa9855..dce8ad4926958 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1011,10 +1011,10 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
                                    LostDebugLocObserver &LocObserver) {
   auto &MF = MIRBuilder.getMF();
   auto &Ctx = MF.getFunction().getContext();
+  const GFCmp *Cmp = cast<GFCmp>(&MI);
 
-  LLT OpLLT = MRI.getType(MI.getOperand(2).getReg());
-  if (OpLLT != LLT::scalar(128) ||
-      OpLLT != MRI.getType(MI.getOperand(3).getReg()))
+  LLT OpLLT = MRI.getType(Cmp->getLHSReg());
+  if (OpLLT != LLT::scalar(128) || OpLLT != MRI.getType(Cmp->getRHSReg()))
     return UnableToLegalize;
 
   Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);
@@ -1023,11 +1023,10 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
   constexpr LLT I32LLT = LLT::scalar(32);
   constexpr LLT PredTy = LLT::scalar(1);
 
-  const Register DstReg = MI.getOperand(0).getReg();
-  const Register Op1 = MI.getOperand(2).getReg();
-  const Register Op2 = MI.getOperand(3).getReg();
-  const auto Pred =
-      static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+  const Register DstReg = Cmp->getReg(0);
+  const Register Op1 = Cmp->getLHSReg();
+  const Register Op2 = Cmp->getRHSReg();
+  const auto Cond = Cmp->getCond();
 
   // Generates a libcall followed by ICMP
   const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
@@ -1038,18 +1037,17 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
         createLibcall(MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
                       {{Op1, OpType, 0}, {Op2, OpType, 1}}, LocObserver, &MI);
     if (!Status)
-      return MCRegister::NoRegister;
+      return {};
 
     // FCMP libcall always returns an i32, we need to compare it with #0 to get
     // the final result.
-    const Register Res = MRI.createGenericVirtualRegister(PredTy);
-    MIRBuilder.buildICmp(ICmpPred, Res, Temp,
-                         MIRBuilder.buildConstant(I32LLT, 0));
-    return Res;
+    return MIRBuilder
+        .buildICmp(ICmpPred, PredTy, Temp, MIRBuilder.buildConstant(I32LLT, 0))
+        .getReg(0);
   };
 
   // Simple case if we have a direct mapping from predicate to libcall
-  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Pred);
+  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond);
       Libcall != RTLIB::UNKNOWN_LIBCALL &&
       ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
     if (const auto Res = BuildLibcall(Libcall, ICmpPred)) {
@@ -1061,7 +1059,7 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
 
   // No direct mapping found, should be generated as combination of libcalls.
 
-  switch (Pred) {
+  switch (Cond) {
   case CmpInst::FCMP_UEQ: {
     // FCMP_UEQ: unordered or equal
     // Convert into (FCMP_OEQ || FCMP_UNO).
@@ -1110,7 +1108,7 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
     //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
     //                            Op1, Op2));
     const auto [InversedLibcall, InversedPred] =
-        getFCMPLibcallDesc(CmpInst::getInversePredicate(Pred));
+        getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond));
     MIRBuilder.buildCopy(
         DstReg, BuildLibcall(InversedLibcall,
                              CmpInst::getInversePredicate(InversedPred)));
@@ -1267,7 +1265,8 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
     LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
     if (Status != Legalized)
       return Status;
-    break;
+    MI.eraseFromParent();
+    return Status;
   }
   case TargetOpcode::G_FPTOSI:
   case TargetOpcode::G_FPTOUI: {

>From 30bf0512806f4268ef5ee391e67d6e86f612d222 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Wed, 17 Jul 2024 17:42:52 +0100
Subject: [PATCH 3/8] Use minScalarOrElt

---
 llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 511e47354cead..04cf9a8b156de 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -560,7 +560,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       })
       .widenScalarOrEltToNextPow2(1)
       .clampScalar(0, s32, s32)
-      .clampScalarOrElt(1, MinFPScalar, s128)
+      .minScalarOrElt(1, MinFPScalar)
       .minScalarEltSameAsIf(
           [=](const LegalityQuery &Query) {
             const LLT &Ty = Query.Types[0];

>From 50c5854c04de7338158f6d7b925ae9dcb6007a6f Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Thu, 18 Jul 2024 10:54:44 +0100
Subject: [PATCH 4/8] Scalarize vectors

---
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   1 +
 llvm/test/CodeGen/AArch64/fcmp.ll             | 262 ++++++++++++------
 2 files changed, 175 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 04cf9a8b156de..6f0ac8c0b90f2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -561,6 +561,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .widenScalarOrEltToNextPow2(1)
       .clampScalar(0, s32, s32)
       .minScalarOrElt(1, MinFPScalar)
+      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
       .minScalarEltSameAsIf(
           [=](const LegalityQuery &Query) {
             const LLT &Ty = Query.Types[0];
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 6fe96611167da..a5d7ae147ffda 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -6,8 +6,6 @@
 
 ; CHECK-GI: warning: Instruction selection used fallback path for v2f128_fp128
 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_fp128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2f128_double
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_double
 
 define fp128 @f128_fp128(fp128 %a, fp128 %b, fp128 %d, fp128 %e) {
 ; CHECK-SD-LABEL: f128_fp128:
@@ -511,37 +509,64 @@ entry:
 
 
 define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double> %d, <2 x double> %e) {
-; CHECK-LABEL: v2f128_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v1.16b, v3.16b
-; CHECK-NEXT:    stp q4, q5, [sp, #48] // 32-byte Folded Spill
-; CHECK-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ldp q2, q1, [sp, #48] // 32-byte Folded Reload
-; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
-; CHECK-NEXT:    add sp, sp, #96
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2f128_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #96
+; CHECK-SD-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    stp q4, q5, [sp, #48] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    ldp q2, q1, [sp, #48] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    add sp, sp, #96
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2f128_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    stp q3, q1, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    mov v1.16b, v2.16b
+; CHECK-GI-NEXT:    stp q5, q4, [sp, #32] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w19, lt
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    fmov d0, x19
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w8, lt
+; CHECK-GI-NEXT:    ldp q2, q1, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #63
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #63
+; CHECK-GI-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    add sp, sp, #80
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <2 x fp128> %a, %b
   %s = select <2 x i1> %c, <2 x double> %d, <2 x double> %e
@@ -549,61 +574,122 @@ entry:
 }
 
 define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double> %d, <3 x double> %e) {
-; CHECK-LABEL: v3f128_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #160
-; CHECK-NEXT:    str x30, [sp, #144] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 160
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d6 killed $d6 def $q6
-; CHECK-NEXT:    // kill: def $d7 killed $d7 def $q7
-; CHECK-NEXT:    ldr d5, [sp, #184]
-; CHECK-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    ldp d3, d2, [sp, #168]
-; CHECK-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v1.16b, v4.16b
-; CHECK-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr d5, [sp, #160]
-; CHECK-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    mov v1.d[1], v0.d[0]
-; CHECK-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
-; CHECK-NEXT:    bl __lttf2
-; CHECK-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
-; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
-; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    bsl v2.16b, v4.16b, v3.16b
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    add sp, sp, #160
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3f128_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #160
+; CHECK-SD-NEXT:    str x30, [sp, #144] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 160
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    ldr d5, [sp, #184]
+; CHECK-SD-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp d3, d2, [sp, #168]
+; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NEXT:    mov v1.16b, v4.16b
+; CHECK-SD-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr d5, [sp, #160]
+; CHECK-SD-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-SD-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    fmov d1, x8
+; CHECK-SD-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-SD-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    bl __lttf2
+; CHECK-SD-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cmp w0, #0
+; CHECK-SD-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    cset w8, lt
+; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-SD-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    fmov d2, x8
+; CHECK-SD-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    add sp, sp, #160
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3f128_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #176
+; CHECK-GI-NEXT:    str x30, [sp, #128] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #144] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 176
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    stp q4, q1, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    mov v1.16b, v3.16b
+; CHECK-GI-NEXT:    ldr x19, [sp, #176]
+; CHECK-GI-NEXT:    stp q5, q2, [sp, #32] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    ldr d2, [sp, #184]
+; CHECK-GI-NEXT:    ldr x20, [sp, #200]
+; CHECK-GI-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-GI-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-GI-NEXT:    str q7, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q2, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ldr d2, [sp, #192]
+; CHECK-GI-NEXT:    stp q6, q2, [sp, #80] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w21, lt
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    cset w22, lt
+; CHECK-GI-NEXT:    bl __lttf2
+; CHECK-GI-NEXT:    ldp q0, q2, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    sbfx x8, x21, #0, #1
+; CHECK-GI-NEXT:    ldp q4, q3, [sp, #96] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    sbfx x9, x22, #0, #1
+; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    ldr x30, [sp, #128] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v2.d[1], v0.d[0]
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    cset w8, lt
+; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    sbfx x8, x8, #0, #1
+; CHECK-GI-NEXT:    mov v1.d[1], x9
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    and x9, x19, x8
+; CHECK-GI-NEXT:    bic x8, x20, x8
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    orr x8, x9, x8
+; CHECK-GI-NEXT:    bic v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    add sp, sp, #176
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x fp128> %a, %b
   %s = select <3 x i1> %c, <3 x double> %d, <3 x double> %e

>From 58a1bb93881a4975258bfa17865e377a461396fb Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Mon, 22 Jul 2024 11:02:57 +0100
Subject: [PATCH 5/8] Fix typos

---
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index dce8ad4926958..a3fcb53def65b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1032,7 +1032,7 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
   const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
                                 const CmpInst::Predicate ICmpPred) -> Register {
     Register Temp = MRI.createGenericVirtualRegister(I32LLT);
-    // Generate libcall, storing result into Temp
+    // Generate libcall, holding result in Temp
     const auto Status =
         createLibcall(MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
                       {{Op1, OpType, 0}, {Op2, OpType, 1}}, LocObserver, &MI);
@@ -1078,7 +1078,7 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
     // Convert into (!FCMP_OEQ && !FCMP_UNO).
 
     // We inverse the predicate instead of generating a NOT
-    // to save one instruciton.
+    // to save one instruction.
     // On AArch64 isel can even select two cmp into a single ccmp.
     const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
     const auto NotOeq =

>From f328a58a78eb3d5aa5c860590d223918eaff369d Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Mon, 22 Jul 2024 11:03:26 +0100
Subject: [PATCH 6/8] Remove redundant -global-isel-abort=1

---
 llvm/test/CodeGen/AArch64/arm64-ccmp.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index e951c80e30845..50afc79a5a576 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL
 target triple = "arm64-apple-ios"
 
 define i32 @single_same(i32 %a, i32 %b) nounwind ssp {

>From ce436c7a5eb9b5ceabe0e1bfe1e8ad7339a91384 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Mon, 22 Jul 2024 11:05:42 +0100
Subject: [PATCH 7/8] Check BuildLibcall failures

---
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index a3fcb53def65b..5f8bf3c2ec3d0 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1069,8 +1069,11 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
 
     const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
     const auto Uno = BuildLibcall(UnoLibcall, UnoPred);
+    if (Oeq && Uno) 
+      MIRBuilder.buildCopy(DstReg, MIRBuilder.buildOr(PredTy, Oeq, Uno));
+    else
+      return UnableToLegalize;
 
-    MIRBuilder.buildCopy(DstReg, MIRBuilder.buildOr(PredTy, Oeq, Uno));
     break;
   }
   case CmpInst::FCMP_ONE: {
@@ -1109,9 +1112,11 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
     //                            Op1, Op2));
     const auto [InversedLibcall, InversedPred] =
         getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond));
-    MIRBuilder.buildCopy(
-        DstReg, BuildLibcall(InversedLibcall,
-                             CmpInst::getInversePredicate(InversedPred)));
+    if (const auto Reg = BuildLibcall(
+            InversedLibcall, CmpInst::getInversePredicate(InversedPred)))
+      MIRBuilder.buildCopy(DstReg, Reg);
+    else
+      return UnableToLegalize;
     break;
   }
   default:

>From da0b110b9f1365f985f67e372f17f7e114260b84 Mon Sep 17 00:00:00 2001
From: Tianyi Guan <tguan at nvidia.com>
Date: Mon, 22 Jul 2024 11:42:21 +0100
Subject: [PATCH 8/8] Use s32 type for ICMP def and update comments

---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 47 +++++++++----------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 5f8bf3c2ec3d0..7ddaff161de6c 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1019,30 +1019,30 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
 
   Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);
 
-  // Libcall always return i32
-  constexpr LLT I32LLT = LLT::scalar(32);
-  constexpr LLT PredTy = LLT::scalar(1);
-
+  // DstReg type is s32
   const Register DstReg = Cmp->getReg(0);
-  const Register Op1 = Cmp->getLHSReg();
-  const Register Op2 = Cmp->getRHSReg();
   const auto Cond = Cmp->getCond();
 
-  // Generates a libcall followed by ICMP
-  const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
-                                const CmpInst::Predicate ICmpPred) -> Register {
-    Register Temp = MRI.createGenericVirtualRegister(I32LLT);
+  // Reference:
+  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
+  // Generates a libcall followed by ICMP.
+  const auto BuildLibcall =
+      [&](const RTLIB::Libcall Libcall, const CmpInst::Predicate ICmpPred,
+          const DstOp &Res = LLT::scalar(32)) -> Register {
+    // FCMP libcall always returns an i32, and needs an ICMP with #0.
+    constexpr LLT TempLLT = LLT::scalar(32);
+    Register Temp = MRI.createGenericVirtualRegister(TempLLT);
     // Generate libcall, holding result in Temp
-    const auto Status =
-        createLibcall(MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
-                      {{Op1, OpType, 0}, {Op2, OpType, 1}}, LocObserver, &MI);
+    const auto Status = createLibcall(
+        MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
+        {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
+        LocObserver, &MI);
     if (!Status)
       return {};
 
-    // FCMP libcall always returns an i32, we need to compare it with #0 to get
-    // the final result.
+    // Compare temp with #0 to get the final result.
     return MIRBuilder
-        .buildICmp(ICmpPred, PredTy, Temp, MIRBuilder.buildConstant(I32LLT, 0))
+        .buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
         .getReg(0);
   };
 
@@ -1050,8 +1050,7 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
   if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond);
       Libcall != RTLIB::UNKNOWN_LIBCALL &&
       ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
-    if (const auto Res = BuildLibcall(Libcall, ICmpPred)) {
-      MIRBuilder.buildCopy(DstReg, Res);
+    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
       return Legalized;
     }
     return UnableToLegalize;
@@ -1069,8 +1068,8 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
 
     const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
     const auto Uno = BuildLibcall(UnoLibcall, UnoPred);
-    if (Oeq && Uno) 
-      MIRBuilder.buildCopy(DstReg, MIRBuilder.buildOr(PredTy, Oeq, Uno));
+    if (Oeq && Uno)
+      MIRBuilder.buildOr(DstReg, Oeq, Uno);
     else
       return UnableToLegalize;
 
@@ -1092,7 +1091,7 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
         BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred));
 
     if (NotOeq && NotUno)
-      MIRBuilder.buildCopy(DstReg, MIRBuilder.buildAnd(PredTy, NotOeq, NotUno));
+      MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
     else
       return UnableToLegalize;
 
@@ -1112,10 +1111,8 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
     //                            Op1, Op2));
     const auto [InversedLibcall, InversedPred] =
         getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond));
-    if (const auto Reg = BuildLibcall(
-            InversedLibcall, CmpInst::getInversePredicate(InversedPred)))
-      MIRBuilder.buildCopy(DstReg, Reg);
-    else
+    if (!BuildLibcall(InversedLibcall,
+                      CmpInst::getInversePredicate(InversedPred), DstReg))
       return UnableToLegalize;
     break;
   }



More information about the llvm-commits mailing list