[llvm] 365f5b4 - [AArch64][GISel] Add fp128 and i128 sitofp/uitofp handling (#97691)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 15 08:24:27 PDT 2024


Author: Him188
Date: 2024-07-15T16:24:24+01:00
New Revision: 365f5b4a1dfe84f2978a41d5a85672e749056620

URL: https://github.com/llvm/llvm-project/commit/365f5b4a1dfe84f2978a41d5a85672e749056620
DIFF: https://github.com/llvm/llvm-project/commit/365f5b4a1dfe84f2978a41d5a85672e749056620.diff

LOG: [AArch64][GISel] Add fp128 and i128 sitofp/uitofp handling (#97691)

Legalize sitofp/uitofp involving fp128/i128 types into a libcall. 
Vector with i128/fp128 types are scalarized.

Added: 
    

Modified: 
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir
    llvm/test/CodeGen/AArch64/itofp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index cea5a4e954faf..83df106a7fdc8 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1154,16 +1154,13 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
   }
   case TargetOpcode::G_SITOFP:
   case TargetOpcode::G_UITOFP: {
-    // FIXME: Support other types
     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
-    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
-    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
+    Type *ToTy =
+        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
+    if ((FromSize != 32 && FromSize != 64 && FromSize != 128) || !ToTy)
       return UnableToLegalize;
     LegalizeResult Status = conversionLibcall(
-        MI, MIRBuilder,
-        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
-        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
-        LocObserver);
+        MI, MIRBuilder, ToTy, Type::getIntNTy(Ctx, FromSize), LocObserver);
     if (Status != Legalized)
       return Status;
     break;

diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index f9c8aa9ba255b..d42d5511a8242 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -711,7 +711,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
           {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});
 
   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
-      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
+      .legalFor({{s32, s32},
+                 {s64, s32},
+                 {s32, s64},
+                 {s64, s64},
+                 {v2s64, v2s64},
+                 {v4s32, v4s32},
+                 {v2s32, v2s32}})
       .legalIf([=](const LegalityQuery &Query) {
         return HasFP16 &&
                (Query.Types[0] == s16 || Query.Types[0] == v4s16 ||
@@ -719,26 +725,35 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                (Query.Types[1] == s32 || Query.Types[1] == s64 ||
                 Query.Types[1] == v4s16 || Query.Types[1] == v8s16);
       })
-      .widenScalarToNextPow2(1)
-      .clampScalar(1, s32, s64)
-      .widenScalarToNextPow2(0)
-      .clampScalarOrElt(0, MinFPScalar, s64)
-      .moreElementsToNextPow2(0)
+      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+      .moreElementsToNextPow2(1)
+      .widenScalarOrEltToNextPow2OrMinSize(1)
+      .minScalar(1, s32)
+      .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
       .widenScalarIf(
           [=](const LegalityQuery &Query) {
-            return Query.Types[0].getScalarSizeInBits() <
-                   Query.Types[1].getScalarSizeInBits();
+            return Query.Types[1].getScalarSizeInBits() <= 64 &&
+                   Query.Types[0].getScalarSizeInBits() <
+                       Query.Types[1].getScalarSizeInBits();
           },
           LegalizeMutations::changeElementSizeTo(0, 1))
       .widenScalarIf(
           [=](const LegalityQuery &Query) {
-            return Query.Types[0].getScalarSizeInBits() >
-                   Query.Types[1].getScalarSizeInBits();
+            return Query.Types[0].getScalarSizeInBits() <= 64 &&
+                   Query.Types[0].getScalarSizeInBits() >
+                       Query.Types[1].getScalarSizeInBits();
           },
           LegalizeMutations::changeElementSizeTo(1, 0))
       .clampNumElements(0, v4s16, v8s16)
       .clampNumElements(0, v2s32, v4s32)
-      .clampMaxNumElements(0, s64, 2);
+      .clampMaxNumElements(0, s64, 2)
+      .libcallFor({{s16, s128},
+                   {s32, s128},
+                   {s64, s128},
+                   {s128, s128},
+                   {s128, s32},
+                   {s128, s64}});
 
   // Control-flow
   getActionDefinitionsBuilder(G_BRCOND)

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir
index a3094225a031a..0890f746fa0fa 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir
@@ -635,79 +635,3 @@ body:             |
     %1(s64) = G_FPTOUI %0
     $x0 = COPY %1(s64)
 ...
-
----
-name:            sitofp_v2s64_v2s32
-legalized:       true
-regBankSelected: true
-
-body:             |
-  bb.0:
-    liveins: $d0
-
-    ; CHECK-LABEL: name: sitofp_v2s64_v2s32
-    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[SSHLLv2i32_shift:%[0-9]+]]:fpr128 = SSHLLv2i32_shift [[COPY]], 0
-    ; CHECK: [[SCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept SCVTFv2f64 [[SSHLLv2i32_shift]]
-    ; CHECK: $q0 = COPY [[SCVTFv2f64_]]
-    %0:fpr(<2 x s32>) = COPY $d0
-    %1:fpr(<2 x s64>) = G_SITOFP %0
-    $q0 = COPY %1(<2 x s64>)
-...
-
----
-name:            uitofp_v2s64_v2s32
-legalized:       true
-regBankSelected: true
-
-body:             |
-  bb.0:
-    liveins: $d0
-
-    ; CHECK-LABEL: name: uitofp_v2s64_v2s32
-    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[USHLLv2i32_shift:%[0-9]+]]:fpr128 = USHLLv2i32_shift [[COPY]], 0
-    ; CHECK: [[UCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept UCVTFv2f64 [[USHLLv2i32_shift]]
-    ; CHECK: $q0 = COPY [[UCVTFv2f64_]]
-    %0:fpr(<2 x s32>) = COPY $d0
-    %1:fpr(<2 x s64>) = G_UITOFP %0
-    $q0 = COPY %1(<2 x s64>)
-...
-
----
-name:            sitofp_v2s32_v2s64
-legalized:       true
-regBankSelected: true
-
-body:             |
-  bb.0:
-    liveins: $q0
-
-    ; CHECK-LABEL: name: sitofp_v2s32_v2s64
-    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[SCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept SCVTFv2f64 [[COPY]]
-    ; CHECK: [[FCVTNv2i32_:%[0-9]+]]:fpr64 = nofpexcept FCVTNv2i32 [[SCVTFv2f64_]]
-    ; CHECK: $d0 = COPY [[FCVTNv2i32_]]
-    %0:fpr(<2 x s64>) = COPY $q0
-    %1:fpr(<2 x s32>) = G_SITOFP %0
-    $d0 = COPY %1(<2 x s32>)
-...
-
----
-name:            uitofp_v2s32_v2s64
-legalized:       true
-regBankSelected: true
-
-body:             |
-  bb.0:
-    liveins: $q0
-
-    ; CHECK-LABEL: name: uitofp_v2s32_v2s64
-    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
-    ; CHECK: [[UCVTFv2f64_:%[0-9]+]]:fpr128 = nofpexcept UCVTFv2f64 [[COPY]]
-    ; CHECK: [[FCVTNv2i32_:%[0-9]+]]:fpr64 = nofpexcept FCVTNv2i32 [[UCVTFv2f64_]]
-    ; CHECK: $d0 = COPY [[FCVTNv2i32_]]
-    %0:fpr(<2 x s64>) = COPY $q0
-    %1:fpr(<2 x s32>) = G_UITOFP %0
-    $d0 = COPY %1(<2 x s32>)
-...

diff  --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index ac26ccc44128f..7a4c5cee27b80 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -1,225 +1,228 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
-; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
-
-; CHECK-GI:       warning: Instruction selection used fallback path for stofp_i128_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_i128_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_i64_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_i64_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_i32_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_i32_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_i16_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_i16_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_i8_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_i8_f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_i128_f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_i128_f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_i128_f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_i128_f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_i128_f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_i128_f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v2i128_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v2i128_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v3i128_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i128_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v2i64_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v2i64_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v3i64_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i64_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v2i128_v2f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v2i128_v2f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v3i128_v3f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i128_v3f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v2i32_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v2i32_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v3i32_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i32_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v2i16_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v2i16_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v3i16_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i16_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v2i8_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v2i8_v2f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v3i8_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i8_v3f128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v2i128_v2f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v2i128_v2f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v3i128_v3f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i128_v3f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v2i128_v2f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v2i128_v2f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for stofp_v3i128_v3f16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for utofp_v3i128_v3f16
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
 define fp128 @stofp_i128_f128(i128 %a) {
-; CHECK-LABEL: stofp_i128_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floattitf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_i128_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floattitf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_i128_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floattitf
 entry:
   %c = sitofp i128 %a to fp128
   ret fp128 %c
 }
 
 define fp128 @utofp_i128_f128(i128 %a) {
-; CHECK-LABEL: utofp_i128_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floatuntitf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_i128_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floatuntitf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_i128_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floatuntitf
 entry:
   %c = uitofp i128 %a to fp128
   ret fp128 %c
 }
 
 define fp128 @stofp_i64_f128(i64 %a) {
-; CHECK-LABEL: stofp_i64_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floatditf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_i64_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floatditf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_i64_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floatditf
 entry:
   %c = sitofp i64 %a to fp128
   ret fp128 %c
 }
 
 define fp128 @utofp_i64_f128(i64 %a) {
-; CHECK-LABEL: utofp_i64_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floatunditf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_i64_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floatunditf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_i64_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floatunditf
 entry:
   %c = uitofp i64 %a to fp128
   ret fp128 %c
 }
 
 define fp128 @stofp_i32_f128(i32 %a) {
-; CHECK-LABEL: stofp_i32_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_i32_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_i32_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floatsitf
 entry:
   %c = sitofp i32 %a to fp128
   ret fp128 %c
 }
 
 define fp128 @utofp_i32_f128(i32 %a) {
-; CHECK-LABEL: utofp_i32_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_i32_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_i32_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floatunsitf
 entry:
   %c = uitofp i32 %a to fp128
   ret fp128 %c
 }
 
 define fp128 @stofp_i16_f128(i16 %a) {
-; CHECK-LABEL: stofp_i16_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    sxth w0, w0
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_i16_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    sxth w0, w0
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_i16_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxth w0, w0
+; CHECK-GI-NEXT:    b __floatsitf
 entry:
   %c = sitofp i16 %a to fp128
   ret fp128 %c
 }
 
 define fp128 @utofp_i16_f128(i16 %a) {
-; CHECK-LABEL: utofp_i16_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    and w0, w0, #0xffff
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_i16_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    and w0, w0, #0xffff
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_i16_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w0, w0, #0xffff
+; CHECK-GI-NEXT:    b __floatunsitf
 entry:
   %c = uitofp i16 %a to fp128
   ret fp128 %c
 }
 
 define fp128 @stofp_i8_f128(i8 %a) {
-; CHECK-LABEL: stofp_i8_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    sxtb w0, w0
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_i8_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    sxtb w0, w0
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_i8_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w0, w0
+; CHECK-GI-NEXT:    b __floatsitf
 entry:
   %c = sitofp i8 %a to fp128
   ret fp128 %c
 }
 
 define fp128 @utofp_i8_f128(i8 %a) {
-; CHECK-LABEL: utofp_i8_f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    and w0, w0, #0xff
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_i8_f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    and w0, w0, #0xff
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_i8_f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w0, w0, #0xff
+; CHECK-GI-NEXT:    b __floatunsitf
 entry:
   %c = uitofp i8 %a to fp128
   ret fp128 %c
 }
 
 define double @stofp_i128_f64(i128 %a) {
-; CHECK-LABEL: stofp_i128_f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floattidf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_i128_f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floattidf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_i128_f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floattidf
 entry:
   %c = sitofp i128 %a to double
   ret double %c
 }
 
 define double @utofp_i128_f64(i128 %a) {
-; CHECK-LABEL: utofp_i128_f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floatuntidf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_i128_f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floatuntidf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_i128_f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floatuntidf
 entry:
   %c = uitofp i128 %a to double
   ret double %c
@@ -310,28 +313,36 @@ entry:
 }
 
 define float @stofp_i128_f32(i128 %a) {
-; CHECK-LABEL: stofp_i128_f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floattisf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_i128_f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floattisf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_i128_f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floattisf
 entry:
   %c = sitofp i128 %a to float
   ret float %c
 }
 
 define float @utofp_i128_f32(i128 %a) {
-; CHECK-LABEL: utofp_i128_f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    bl __floatuntisf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_i128_f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    bl __floatuntisf
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_i128_f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    b __floatuntisf
 entry:
   %c = uitofp i128 %a to float
   ret float %c
@@ -453,12 +464,7 @@ define half @stofp_i128_f16(i128 %a) {
 ;
 ; CHECK-GI-FP16-LABEL: stofp_i128_f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -16
-; CHECK-GI-FP16-NEXT:    bl __floattihf
-; CHECK-GI-FP16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ret
+; CHECK-GI-FP16-NEXT:    b __floattihf
 entry:
   %c = sitofp i128 %a to half
   ret half %c
@@ -496,12 +502,7 @@ define half @utofp_i128_f16(i128 %a) {
 ;
 ; CHECK-GI-FP16-LABEL: utofp_i128_f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -16
-; CHECK-GI-FP16-NEXT:    bl __floatuntihf
-; CHECK-GI-FP16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ret
+; CHECK-GI-FP16-NEXT:    b __floatuntihf
 entry:
   %c = uitofp i128 %a to half
   ret half %c
@@ -740,390 +741,720 @@ entry:
 }
 
 define <2 x fp128> @stofp_v2i128_v2f128(<2 x i128> %a) {
-; CHECK-LABEL: stofp_v2i128_v2f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov x19, x3
-; CHECK-NEXT:    mov x20, x2
-; CHECK-NEXT:    bl __floattitf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floattitf
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v2i128_v2f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    bl __floattitf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floattitf
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v2i128_v2f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    bl __floattitf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floattitf
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i128> %a to <2 x fp128>
   ret <2 x fp128> %c
 }
 
 define <2 x fp128> @utofp_v2i128_v2f128(<2 x i128> %a) {
-; CHECK-LABEL: utofp_v2i128_v2f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov x19, x3
-; CHECK-NEXT:    mov x20, x2
-; CHECK-NEXT:    bl __floatuntitf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatuntitf
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v2i128_v2f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    bl __floatuntitf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatuntitf
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v2i128_v2f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    bl __floatuntitf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatuntitf
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i128> %a to <2 x fp128>
   ret <2 x fp128> %c
 }
 
 define <3 x fp128> @stofp_v3i128_v3f128(<3 x i128> %a) {
-; CHECK-LABEL: stofp_v3i128_v3f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    mov x19, x5
-; CHECK-NEXT:    mov x20, x4
-; CHECK-NEXT:    mov x21, x3
-; CHECK-NEXT:    mov x22, x2
-; CHECK-NEXT:    bl __floattitf
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floattitf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floattitf
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i128_v3f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #80
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    bl __floattitf
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floattitf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floattitf
+; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #80
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i128_v3f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __floattitf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floattitf
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floattitf
+; CHECK-GI-NEXT:    mov v2.16b, v0.16b
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #80
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i128> %a to <3 x fp128>
   ret <3 x fp128> %c
 }
 
 define <3 x fp128> @utofp_v3i128_v3f128(<3 x i128> %a) {
-; CHECK-LABEL: utofp_v3i128_v3f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    mov x19, x5
-; CHECK-NEXT:    mov x20, x4
-; CHECK-NEXT:    mov x21, x3
-; CHECK-NEXT:    mov x22, x2
-; CHECK-NEXT:    bl __floatuntitf
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatuntitf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatuntitf
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v3i128_v3f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #80
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    bl __floatuntitf
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatuntitf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatuntitf
+; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #80
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i128_v3f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __floatuntitf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatuntitf
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatuntitf
+; CHECK-GI-NEXT:    mov v2.16b, v0.16b
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #80
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i128> %a to <3 x fp128>
   ret <3 x fp128> %c
 }
 
 define <2 x fp128> @stofp_v2i64_v2f128(<2 x i64> %a) {
-; CHECK-LABEL: stofp_v2i64_v2f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatditf
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x0, v0.d[1]
-; CHECK-NEXT:    bl __floatditf
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v2i64_v2f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatditf
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov x0, v0.d[1]
+; CHECK-SD-NEXT:    bl __floatditf
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v2i64_v2f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #32
+; CHECK-GI-NEXT:    str d8, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -8
+; CHECK-GI-NEXT:    .cfi_offset b8, -16
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    mov d8, v0.d[1]
+; CHECK-GI-NEXT:    bl __floatditf
+; CHECK-GI-NEXT:    fmov x0, d8
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatditf
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldr d8, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #32
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i64> %a to <2 x fp128>
   ret <2 x fp128> %c
 }
 
 define <2 x fp128> @utofp_v2i64_v2f128(<2 x i64> %a) {
-; CHECK-LABEL: utofp_v2i64_v2f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatunditf
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x0, v0.d[1]
-; CHECK-NEXT:    bl __floatunditf
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v2i64_v2f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatunditf
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov x0, v0.d[1]
+; CHECK-SD-NEXT:    bl __floatunditf
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v2i64_v2f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #32
+; CHECK-GI-NEXT:    str d8, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -8
+; CHECK-GI-NEXT:    .cfi_offset b8, -16
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    mov d8, v0.d[1]
+; CHECK-GI-NEXT:    bl __floatunditf
+; CHECK-GI-NEXT:    fmov x0, d8
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatunditf
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldr d8, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #32
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i64> %a to <2 x fp128>
   ret <2 x fp128> %c
 }
 
 define <3 x fp128> @stofp_v3i64_v3f128(<3 x i64> %a) {
-; CHECK-LABEL: stofp_v3i64_v3f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatditf
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    bl __floatditf
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    bl __floatditf
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #64
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i64_v3f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatditf
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    bl __floatditf
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    bl __floatditf
+; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i64_v3f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov d8, d1
+; CHECK-GI-NEXT:    fmov d9, d2
+; CHECK-GI-NEXT:    bl __floatditf
+; CHECK-GI-NEXT:    fmov x0, d8
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatditf
+; CHECK-GI-NEXT:    fmov x0, d9
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatditf
+; CHECK-GI-NEXT:    mov v2.16b, v0.16b
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i64> %a to <3 x fp128>
   ret <3 x fp128> %c
 }
 
-define <3 x fp128> @utofp_v3i64_v3f128(<3 x i64> %a) {
-; CHECK-LABEL: utofp_v3i64_v3f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatunditf
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    bl __floatunditf
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    bl __floatunditf
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #64
-; CHECK-NEXT:    ret
+define <3 x fp128> @utofp_v3i64_v2f128(<3 x i64> %a) {
+; CHECK-SD-LABEL: utofp_v3i64_v2f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    str q2, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatunditf
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    bl __floatunditf
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    bl __floatunditf
+; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i64_v2f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov d8, d1
+; CHECK-GI-NEXT:    fmov d9, d2
+; CHECK-GI-NEXT:    bl __floatunditf
+; CHECK-GI-NEXT:    fmov x0, d8
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatunditf
+; CHECK-GI-NEXT:    fmov x0, d9
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatunditf
+; CHECK-GI-NEXT:    mov v2.16b, v0.16b
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i64> %a to <3 x fp128>
   ret <3 x fp128> %c
 }
 
 define <2 x double> @stofp_v2i128_v2f64(<2 x i128> %a) {
-; CHECK-LABEL: stofp_v2i128_v2f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    mov x0, x2
-; CHECK-NEXT:    mov x1, x3
-; CHECK-NEXT:    bl __floattidf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floattidf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v2i128_v2f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov x19, x1
+; CHECK-SD-NEXT:    mov x20, x0
+; CHECK-SD-NEXT:    mov x0, x2
+; CHECK-SD-NEXT:    mov x1, x3
+; CHECK-SD-NEXT:    bl __floattidf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floattidf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v2i128_v2f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    bl __floattidf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floattidf
+; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i128> %a to <2 x double>
   ret <2 x double> %c
 }
 
 define <2 x double> @utofp_v2i128_v2f64(<2 x i128> %a) {
-; CHECK-LABEL: utofp_v2i128_v2f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    mov x0, x2
-; CHECK-NEXT:    mov x1, x3
-; CHECK-NEXT:    bl __floatuntidf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatuntidf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v2i128_v2f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov x19, x1
+; CHECK-SD-NEXT:    mov x20, x0
+; CHECK-SD-NEXT:    mov x0, x2
+; CHECK-SD-NEXT:    mov x1, x3
+; CHECK-SD-NEXT:    bl __floatuntidf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatuntidf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v2i128_v2f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    bl __floatuntidf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatuntidf
+; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i128> %a to <2 x double>
   ret <2 x double> %c
 }
 
 define <3 x double> @stofp_v3i128_v3f64(<3 x i128> %a) {
-; CHECK-LABEL: stofp_v3i128_v3f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    .cfi_offset b8, -56
-; CHECK-NEXT:    .cfi_offset b9, -64
-; CHECK-NEXT:    mov x19, x5
-; CHECK-NEXT:    mov x20, x4
-; CHECK-NEXT:    mov x21, x3
-; CHECK-NEXT:    mov x22, x2
-; CHECK-NEXT:    bl __floattidf
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    fmov d8, d0
-; CHECK-NEXT:    bl __floattidf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    fmov d9, d0
-; CHECK-NEXT:    bl __floattidf
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov d2, d0
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov d0, d8
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    fmov d1, d9
-; CHECK-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i128_v3f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    .cfi_offset b8, -56
+; CHECK-SD-NEXT:    .cfi_offset b9, -64
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    bl __floattidf
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    fmov d8, d0
+; CHECK-SD-NEXT:    bl __floattidf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    fmov d9, d0
+; CHECK-SD-NEXT:    bl __floattidf
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov d2, d0
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov d0, d8
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fmov d1, d9
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i128_v3f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    .cfi_offset b8, -56
+; CHECK-GI-NEXT:    .cfi_offset b9, -64
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __floattidf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    bl __floattidf
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    fmov d9, d0
+; CHECK-GI-NEXT:    bl __floattidf
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fmov d2, d0
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fmov d0, d8
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fmov d1, d9
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i128> %a to <3 x double>
   ret <3 x double> %c
 }
 
 define <3 x double> @utofp_v3i128_v3f64(<3 x i128> %a) {
-; CHECK-LABEL: utofp_v3i128_v3f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    .cfi_offset b8, -56
-; CHECK-NEXT:    .cfi_offset b9, -64
-; CHECK-NEXT:    mov x19, x5
-; CHECK-NEXT:    mov x20, x4
-; CHECK-NEXT:    mov x21, x3
-; CHECK-NEXT:    mov x22, x2
-; CHECK-NEXT:    bl __floatuntidf
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    fmov d8, d0
-; CHECK-NEXT:    bl __floatuntidf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    fmov d9, d0
-; CHECK-NEXT:    bl __floatuntidf
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov d2, d0
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov d0, d8
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    fmov d1, d9
-; CHECK-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v3i128_v3f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    .cfi_offset b8, -56
+; CHECK-SD-NEXT:    .cfi_offset b9, -64
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    bl __floatuntidf
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    fmov d8, d0
+; CHECK-SD-NEXT:    bl __floatuntidf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    fmov d9, d0
+; CHECK-SD-NEXT:    bl __floatuntidf
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov d2, d0
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov d0, d8
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    fmov d1, d9
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i128_v3f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    .cfi_offset b8, -56
+; CHECK-GI-NEXT:    .cfi_offset b9, -64
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __floatuntidf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    fmov d8, d0
+; CHECK-GI-NEXT:    bl __floatuntidf
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    fmov d9, d0
+; CHECK-GI-NEXT:    bl __floatuntidf
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fmov d2, d0
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    fmov d0, d8
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    fmov d1, d9
+; CHECK-GI-NEXT:    ldp d9, d8, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i128> %a to <3 x double>
   ret <3 x double> %c
@@ -1438,108 +1769,204 @@ entry:
 }
 
 define <2 x fp128> @stofp_v2i32_v2f128(<2 x i32> %a) {
-; CHECK-LABEL: stofp_v2i32_v2f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    mov w0, v1.s[1]
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #32
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v2i32_v2f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #32
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    mov w0, v1.s[1]
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #32
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v2i32_v2f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #32
+; CHECK-GI-NEXT:    str d8, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -8
+; CHECK-GI-NEXT:    .cfi_offset b8, -16
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    mov s8, v0.s[1]
+; CHECK-GI-NEXT:    bl __floatsitf
+; CHECK-GI-NEXT:    fmov w0, s8
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatsitf
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldr d8, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #32
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i32> %a to <2 x fp128>
   ret <2 x fp128> %c
 }
 
 define <2 x fp128> @utofp_v2i32_v2f128(<2 x i32> %a) {
-; CHECK-LABEL: utofp_v2i32_v2f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    mov w0, v1.s[1]
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #32
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v2i32_v2f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #32
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    mov w0, v1.s[1]
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #32
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v2i32_v2f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #32
+; CHECK-GI-NEXT:    str d8, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w30, -8
+; CHECK-GI-NEXT:    .cfi_offset b8, -16
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    mov s8, v0.s[1]
+; CHECK-GI-NEXT:    bl __floatunsitf
+; CHECK-GI-NEXT:    fmov w0, s8
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatunsitf
+; CHECK-GI-NEXT:    mov v1.16b, v0.16b
+; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldr d8, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #32
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i32> %a to <2 x fp128>
   ret <2 x fp128> %c
 }
 
 define <3 x fp128> @stofp_v3i32_v3f128(<3 x i32> %a) {
-; CHECK-LABEL: stofp_v3i32_v3f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov w0, s1
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov w0, v0.s[1]
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldp q0, q2, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #64
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i32_v3f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    fmov w0, s1
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov w0, v0.s[1]
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldp q0, q2, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i32_v3f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    mov s8, v0.s[1]
+; CHECK-GI-NEXT:    mov s9, v0.s[2]
+; CHECK-GI-NEXT:    bl __floatsitf
+; CHECK-GI-NEXT:    fmov w0, s8
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatsitf
+; CHECK-GI-NEXT:    fmov w0, s9
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatsitf
+; CHECK-GI-NEXT:    mov v2.16b, v0.16b
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i32> %a to <3 x fp128>
   ret <3 x fp128> %c
 }
 
 define <3 x fp128> @utofp_v3i32_v3f128(<3 x i32> %a) {
-; CHECK-LABEL: utofp_v3i32_v3f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    fmov w0, s1
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov w0, v0.s[1]
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    ldp q0, q2, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #64
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v3i32_v3f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    fmov w0, s1
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov w0, v0.s[1]
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NEXT:    ldp q0, q2, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i32_v3f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    mov s8, v0.s[1]
+; CHECK-GI-NEXT:    mov s9, v0.s[2]
+; CHECK-GI-NEXT:    bl __floatunsitf
+; CHECK-GI-NEXT:    fmov w0, s8
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatunsitf
+; CHECK-GI-NEXT:    fmov w0, s9
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatunsitf
+; CHECK-GI-NEXT:    mov v2.16b, v0.16b
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i32> %a to <3 x fp128>
   ret <3 x fp128> %c
@@ -2703,63 +3130,115 @@ entry:
   ret <2 x fp128> %c
 }
 
-define <3 x fp128> @stofp_v3i8_v3f128(<3 x i8> %a) {
-; CHECK-LABEL: stofp_v3i8_v3f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    sxtb w0, w0
-; CHECK-NEXT:    mov w19, w2
-; CHECK-NEXT:    mov w20, w1
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    sxtb w0, w20
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    sxtb w0, w19
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatsitf
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #64
-; CHECK-NEXT:    ret
+define <3 x fp128> @stofp_v2i8_v3f128(<3 x i8> %a) {
+; CHECK-SD-LABEL: stofp_v2i8_v3f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    sxtb w0, w0
+; CHECK-SD-NEXT:    mov w19, w2
+; CHECK-SD-NEXT:    mov w20, w1
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    sxtb w0, w20
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    sxtb w0, w19
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatsitf
+; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v2i8_v3f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    sxtb w0, w0
+; CHECK-GI-NEXT:    mov w19, w1
+; CHECK-GI-NEXT:    mov w20, w2
+; CHECK-GI-NEXT:    bl __floatsitf
+; CHECK-GI-NEXT:    sxtb w0, w19
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatsitf
+; CHECK-GI-NEXT:    sxtb w0, w20
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatsitf
+; CHECK-GI-NEXT:    mov v2.16b, v0.16b
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i8> %a to <3 x fp128>
   ret <3 x fp128> %c
 }
 
-define <3 x fp128> @utofp_v3i8_v3f128(<3 x i8> %a) {
-; CHECK-LABEL: utofp_v3i8_v3f128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    and w0, w0, #0xff
-; CHECK-NEXT:    mov w19, w2
-; CHECK-NEXT:    mov w20, w1
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    and w0, w20, #0xff
-; CHECK-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    and w0, w19, #0xff
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatunsitf
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #64
-; CHECK-NEXT:    ret
+define <3 x fp128> @utofp_v2i8_v3f128(<3 x i8> %a) {
+; CHECK-SD-LABEL: utofp_v2i8_v3f128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    and w0, w0, #0xff
+; CHECK-SD-NEXT:    mov w19, w2
+; CHECK-SD-NEXT:    mov w20, w1
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    and w0, w20, #0xff
+; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    and w0, w19, #0xff
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatunsitf
+; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v2i8_v3f128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #64
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    and w0, w0, #0xff
+; CHECK-GI-NEXT:    mov w19, w1
+; CHECK-GI-NEXT:    mov w20, w2
+; CHECK-GI-NEXT:    bl __floatunsitf
+; CHECK-GI-NEXT:    and w0, w19, #0xff
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatunsitf
+; CHECK-GI-NEXT:    and w0, w20, #0xff
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatunsitf
+; CHECK-GI-NEXT:    mov v2.16b, v0.16b
+; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #64
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i8> %a to <3 x fp128>
   ret <3 x fp128> %c
@@ -3674,158 +4153,286 @@ entry:
 }
 
 define <2 x float> @stofp_v2i128_v2f32(<2 x i128> %a) {
-; CHECK-LABEL: stofp_v2i128_v2f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    mov x0, x2
-; CHECK-NEXT:    mov x1, x3
-; CHECK-NEXT:    bl __floattisf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floattisf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v2i128_v2f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov x19, x1
+; CHECK-SD-NEXT:    mov x20, x0
+; CHECK-SD-NEXT:    mov x0, x2
+; CHECK-SD-NEXT:    mov x1, x3
+; CHECK-SD-NEXT:    bl __floattisf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floattisf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v2i128_v2f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    bl __floattisf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floattisf
+; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-GI-NEXT:    fmov d0, d1
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i128> %a to <2 x float>
   ret <2 x float> %c
 }
 
 define <2 x float> @utofp_v2i128_v2f32(<2 x i128> %a) {
-; CHECK-LABEL: utofp_v2i128_v2f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    mov x0, x2
-; CHECK-NEXT:    mov x1, x3
-; CHECK-NEXT:    bl __floatuntisf
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatuntisf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    add sp, sp, #48
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v2i128_v2f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #48
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w30, -32
+; CHECK-SD-NEXT:    mov x19, x1
+; CHECK-SD-NEXT:    mov x20, x0
+; CHECK-SD-NEXT:    mov x0, x2
+; CHECK-SD-NEXT:    mov x1, x3
+; CHECK-SD-NEXT:    bl __floatuntisf
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatuntisf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    add sp, sp, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v2i128_v2f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w30, -32
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    bl __floatuntisf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatuntisf
+; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-GI-NEXT:    fmov d0, d1
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i128> %a to <2 x float>
   ret <2 x float> %c
 }
 
 define <3 x float> @stofp_v3i128_v3f32(<3 x i128> %a) {
-; CHECK-LABEL: stofp_v3i128_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    mov x21, x1
-; CHECK-NEXT:    mov x22, x0
-; CHECK-NEXT:    mov x0, x2
-; CHECK-NEXT:    mov x1, x3
-; CHECK-NEXT:    mov x19, x5
-; CHECK-NEXT:    mov x20, x4
-; CHECK-NEXT:    bl __floattisf
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floattisf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floattisf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    add sp, sp, #64
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v3i128_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x21, x1
+; CHECK-SD-NEXT:    mov x22, x0
+; CHECK-SD-NEXT:    mov x0, x2
+; CHECK-SD-NEXT:    mov x1, x3
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    bl __floattisf
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floattisf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floattisf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-SD-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v3i128_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __floattisf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floattisf
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floattisf
+; CHECK-GI-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    add sp, sp, #80
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i128> %a to <3 x float>
   ret <3 x float> %c
 }
 
 define <3 x float> @utofp_v3i128_v3f32(<3 x i128> %a) {
-; CHECK-LABEL: utofp_v3i128_v3f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    mov x21, x1
-; CHECK-NEXT:    mov x22, x0
-; CHECK-NEXT:    mov x0, x2
-; CHECK-NEXT:    mov x1, x3
-; CHECK-NEXT:    mov x19, x5
-; CHECK-NEXT:    mov x20, x4
-; CHECK-NEXT:    bl __floatuntisf
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatuntisf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __floatuntisf
-; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    add sp, sp, #64
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v3i128_v3f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #64
+; CHECK-SD-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x21, x1
+; CHECK-SD-NEXT:    mov x22, x0
+; CHECK-SD-NEXT:    mov x0, x2
+; CHECK-SD-NEXT:    mov x1, x3
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    bl __floatuntisf
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatuntisf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    bl __floatuntisf
+; CHECK-SD-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-SD-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NEXT:    add sp, sp, #64
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v3i128_v3f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #80
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __floatuntisf
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatuntisf
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    bl __floatuntisf
+; CHECK-GI-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    add sp, sp, #80
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i128> %a to <3 x float>
   ret <3 x float> %c
@@ -5103,18 +5710,16 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w30, -32
-; CHECK-GI-NOFP16-NEXT:    mov x19, x1
-; CHECK-GI-NOFP16-NEXT:    mov x20, x0
-; CHECK-GI-NOFP16-NEXT:    mov x0, x2
-; CHECK-GI-NOFP16-NEXT:    mov x1, x3
+; CHECK-GI-NOFP16-NEXT:    mov x19, x2
+; CHECK-GI-NOFP16-NEXT:    mov x20, x3
 ; CHECK-GI-NOFP16-NEXT:    bl __floattisf
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    mov x0, x20
-; CHECK-GI-NOFP16-NEXT:    mov x1, x19
+; CHECK-GI-NOFP16-NEXT:    mov x0, x19
+; CHECK-GI-NOFP16-NEXT:    mov x1, x20
 ; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NOFP16-NEXT:    bl __floattisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    fcvt h1, s0
+; CHECK-GI-NOFP16-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
@@ -5131,13 +5736,11 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -32
-; CHECK-GI-FP16-NEXT:    mov x19, x1
-; CHECK-GI-FP16-NEXT:    mov x20, x0
-; CHECK-GI-FP16-NEXT:    mov x0, x2
-; CHECK-GI-FP16-NEXT:    mov x1, x3
+; CHECK-GI-FP16-NEXT:    mov x19, x2
+; CHECK-GI-FP16-NEXT:    mov x20, x3
 ; CHECK-GI-FP16-NEXT:    bl __floattihf
-; CHECK-GI-FP16-NEXT:    mov x0, x20
-; CHECK-GI-FP16-NEXT:    mov x1, x19
+; CHECK-GI-FP16-NEXT:    mov x0, x19
+; CHECK-GI-FP16-NEXT:    mov x1, x20
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
 ; CHECK-GI-FP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-FP16-NEXT:    bl __floattihf
@@ -5145,8 +5748,8 @@ define <2 x half> @stofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
 ; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-FP16-NEXT:    fmov d0, d1
 ; CHECK-GI-FP16-NEXT:    add sp, sp, #48
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5220,18 +5823,16 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w30, -32
-; CHECK-GI-NOFP16-NEXT:    mov x19, x1
-; CHECK-GI-NOFP16-NEXT:    mov x20, x0
-; CHECK-GI-NOFP16-NEXT:    mov x0, x2
-; CHECK-GI-NOFP16-NEXT:    mov x1, x3
+; CHECK-GI-NOFP16-NEXT:    mov x19, x2
+; CHECK-GI-NOFP16-NEXT:    mov x20, x3
 ; CHECK-GI-NOFP16-NEXT:    bl __floatuntisf
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    mov x0, x20
-; CHECK-GI-NOFP16-NEXT:    mov x1, x19
+; CHECK-GI-NOFP16-NEXT:    mov x0, x19
+; CHECK-GI-NOFP16-NEXT:    mov x1, x20
 ; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NOFP16-NEXT:    bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    fcvt h1, s0
+; CHECK-GI-NOFP16-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
@@ -5248,13 +5849,11 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -32
-; CHECK-GI-FP16-NEXT:    mov x19, x1
-; CHECK-GI-FP16-NEXT:    mov x20, x0
-; CHECK-GI-FP16-NEXT:    mov x0, x2
-; CHECK-GI-FP16-NEXT:    mov x1, x3
+; CHECK-GI-FP16-NEXT:    mov x19, x2
+; CHECK-GI-FP16-NEXT:    mov x20, x3
 ; CHECK-GI-FP16-NEXT:    bl __floatuntihf
-; CHECK-GI-FP16-NEXT:    mov x0, x20
-; CHECK-GI-FP16-NEXT:    mov x1, x19
+; CHECK-GI-FP16-NEXT:    mov x0, x19
+; CHECK-GI-FP16-NEXT:    mov x1, x20
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
 ; CHECK-GI-FP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-FP16-NEXT:    bl __floatuntihf
@@ -5262,8 +5861,8 @@ define <2 x half> @utofp_v2i128_v2f16(<2 x i128> %a) {
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
 ; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-FP16-NEXT:    fmov d0, d1
 ; CHECK-GI-FP16-NEXT:    add sp, sp, #48
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5356,84 +5955,80 @@ define <3 x half> @stofp_v3i128_v3f16(<3 x i128> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: stofp_v3i128_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    sub sp, sp, #64
-; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NOFP16-NEXT:    sub sp, sp, #80
+; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w30, -48
-; CHECK-GI-NOFP16-NEXT:    mov x21, x1
-; CHECK-GI-NOFP16-NEXT:    mov x22, x0
-; CHECK-GI-NOFP16-NEXT:    mov x0, x2
-; CHECK-GI-NOFP16-NEXT:    mov x1, x3
-; CHECK-GI-NOFP16-NEXT:    mov x19, x5
-; CHECK-GI-NOFP16-NEXT:    mov x20, x4
+; CHECK-GI-NOFP16-NEXT:    mov x19, x2
+; CHECK-GI-NOFP16-NEXT:    mov x20, x3
+; CHECK-GI-NOFP16-NEXT:    mov x21, x4
+; CHECK-GI-NOFP16-NEXT:    mov x22, x5
 ; CHECK-GI-NOFP16-NEXT:    bl __floattisf
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    mov x0, x22
-; CHECK-GI-NOFP16-NEXT:    mov x1, x21
-; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    mov x0, x19
+; CHECK-GI-NOFP16-NEXT:    mov x1, x20
+; CHECK-GI-NOFP16-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NOFP16-NEXT:    bl __floattisf
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    mov x0, x20
-; CHECK-GI-NOFP16-NEXT:    mov x1, x19
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov x0, x21
+; CHECK-GI-NOFP16-NEXT:    mov x1, x22
 ; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NOFP16-NEXT:    bl __floattisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h1, s0
-; CHECK-GI-NOFP16-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NOFP16-NEXT:    add sp, sp, #64
+; CHECK-GI-NOFP16-NEXT:    add sp, sp, #80
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: stofp_v3i128_v3f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    sub sp, sp, #64
-; CHECK-GI-FP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-FP16-NEXT:    sub sp, sp, #80
+; CHECK-GI-FP16-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -48
-; CHECK-GI-FP16-NEXT:    mov x21, x1
-; CHECK-GI-FP16-NEXT:    mov x22, x0
-; CHECK-GI-FP16-NEXT:    mov x0, x2
-; CHECK-GI-FP16-NEXT:    mov x1, x3
-; CHECK-GI-FP16-NEXT:    mov x19, x5
-; CHECK-GI-FP16-NEXT:    mov x20, x4
+; CHECK-GI-FP16-NEXT:    mov x19, x2
+; CHECK-GI-FP16-NEXT:    mov x20, x3
+; CHECK-GI-FP16-NEXT:    mov x21, x4
+; CHECK-GI-FP16-NEXT:    mov x22, x5
 ; CHECK-GI-FP16-NEXT:    bl __floattihf
-; CHECK-GI-FP16-NEXT:    mov x0, x22
-; CHECK-GI-FP16-NEXT:    mov x1, x21
+; CHECK-GI-FP16-NEXT:    mov x0, x19
+; CHECK-GI-FP16-NEXT:    mov x1, x20
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-FP16-NEXT:    bl __floattihf
-; CHECK-GI-FP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    mov x0, x21
+; CHECK-GI-FP16-NEXT:    mov x1, x22
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    mov x0, x20
-; CHECK-GI-FP16-NEXT:    mov x1, x19
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-FP16-NEXT:    bl __floattihf
-; CHECK-GI-FP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v2.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    fmov d0, d1
-; CHECK-GI-FP16-NEXT:    add sp, sp, #64
+; CHECK-GI-FP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-FP16-NEXT:    add sp, sp, #80
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = sitofp <3 x i128> %a to <3 x half>
@@ -5525,84 +6120,80 @@ define <3 x half> @utofp_v3i128_v3f16(<3 x i128> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: utofp_v3i128_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    sub sp, sp, #64
-; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NOFP16-NEXT:    sub sp, sp, #80
+; CHECK-GI-NOFP16-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-NOFP16-NEXT:    .cfi_offset w30, -48
-; CHECK-GI-NOFP16-NEXT:    mov x21, x1
-; CHECK-GI-NOFP16-NEXT:    mov x22, x0
-; CHECK-GI-NOFP16-NEXT:    mov x0, x2
-; CHECK-GI-NOFP16-NEXT:    mov x1, x3
-; CHECK-GI-NOFP16-NEXT:    mov x19, x5
-; CHECK-GI-NOFP16-NEXT:    mov x20, x4
+; CHECK-GI-NOFP16-NEXT:    mov x19, x2
+; CHECK-GI-NOFP16-NEXT:    mov x20, x3
+; CHECK-GI-NOFP16-NEXT:    mov x21, x4
+; CHECK-GI-NOFP16-NEXT:    mov x22, x5
 ; CHECK-GI-NOFP16-NEXT:    bl __floatuntisf
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    mov x0, x22
-; CHECK-GI-NOFP16-NEXT:    mov x1, x21
-; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NOFP16-NEXT:    mov x0, x19
+; CHECK-GI-NOFP16-NEXT:    mov x1, x20
+; CHECK-GI-NOFP16-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NOFP16-NEXT:    bl __floatuntisf
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    mov x0, x20
-; CHECK-GI-NOFP16-NEXT:    mov x1, x19
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov x0, x21
+; CHECK-GI-NOFP16-NEXT:    mov x1, x22
 ; CHECK-GI-NOFP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NOFP16-NEXT:    bl __floatuntisf
-; CHECK-GI-NOFP16-NEXT:    fcvt h1, s0
-; CHECK-GI-NOFP16-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NOFP16-NEXT:    add sp, sp, #64
+; CHECK-GI-NOFP16-NEXT:    add sp, sp, #80
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v3i128_v3f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    sub sp, sp, #64
-; CHECK-GI-FP16-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-FP16-NEXT:    sub sp, sp, #80
+; CHECK-GI-FP16-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w21, -24
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w22, -32
 ; CHECK-GI-FP16-NEXT:    .cfi_offset w30, -48
-; CHECK-GI-FP16-NEXT:    mov x21, x1
-; CHECK-GI-FP16-NEXT:    mov x22, x0
-; CHECK-GI-FP16-NEXT:    mov x0, x2
-; CHECK-GI-FP16-NEXT:    mov x1, x3
-; CHECK-GI-FP16-NEXT:    mov x19, x5
-; CHECK-GI-FP16-NEXT:    mov x20, x4
+; CHECK-GI-FP16-NEXT:    mov x19, x2
+; CHECK-GI-FP16-NEXT:    mov x20, x3
+; CHECK-GI-FP16-NEXT:    mov x21, x4
+; CHECK-GI-FP16-NEXT:    mov x22, x5
 ; CHECK-GI-FP16-NEXT:    bl __floatuntihf
-; CHECK-GI-FP16-NEXT:    mov x0, x22
-; CHECK-GI-FP16-NEXT:    mov x1, x21
+; CHECK-GI-FP16-NEXT:    mov x0, x19
+; CHECK-GI-FP16-NEXT:    mov x1, x20
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-FP16-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-FP16-NEXT:    bl __floatuntihf
-; CHECK-GI-FP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    mov x0, x21
+; CHECK-GI-FP16-NEXT:    mov x1, x22
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    mov x0, x20
-; CHECK-GI-FP16-NEXT:    mov x1, x19
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-FP16-NEXT:    bl __floatuntihf
-; CHECK-GI-FP16-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldp q2, q1, [sp] // 32-byte Folded Reload
 ; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v2.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    fmov d0, d1
-; CHECK-GI-FP16-NEXT:    add sp, sp, #64
+; CHECK-GI-FP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-FP16-NEXT:    add sp, sp, #80
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = uitofp <3 x i128> %a to <3 x half>


        


More information about the llvm-commits mailing list