[llvm] 02cb7c9 - [AArch64][GlobalISel] Libcall i128 srem/urem and scalarize more vector types.

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 22 03:39:35 PDT 2024


Author: David Green
Date: 2024-08-22T11:39:27+01:00
New Revision: 02cb7c9ef5aecea3a820bc98b50adf4d7c4c5eb6

URL: https://github.com/llvm/llvm-project/commit/02cb7c9ef5aecea3a820bc98b50adf4d7c4c5eb6
DIFF: https://github.com/llvm/llvm-project/commit/02cb7c9ef5aecea3a820bc98b50adf4d7c4c5eb6.diff

LOG: [AArch64][GlobalISel] Libcall i128 srem/urem and scalarize more vector types.

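This improves GlobalISel handling of i128 scalar and vector srem/urem by
lowering s128 to a libcall, and scalarizes the remaining vector types so that
other odd-sized vectors legalize under GISel instead of using the fallback path.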

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/rem.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 33a1fa1ad04fdf..35d73d36df46fe 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -196,12 +196,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
       .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
+      .libcallFor({s128})
       .widenScalarOrEltToNextPow2(0)
-      .clampScalarOrElt(0, s32, s64)
+      .minScalarOrElt(0, s32)
       .clampNumElements(0, v2s32, v4s32)
       .clampNumElements(0, v2s64, v2s64)
-      .moreElementsToNextPow2(0);
-
+      .scalarize(0);
 
   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
       .widenScalarToNextPow2(0, /*Min = */ 32)

diff  --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll
index 7f4df00d4aa794..81682c5f0ce85d 100644
--- a/llvm/test/CodeGen/AArch64/rem.ll
+++ b/llvm/test/CodeGen/AArch64/rem.ll
@@ -1,21 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:       warning: Instruction selection used fallback path for si128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for ui128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sv3i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uv3i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sv3i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uv3i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sv3i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uv3i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sv2i128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sv3i128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sv4i128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uv2i128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uv3i128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uv4i128
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define i8 @si8(i8 %a, i8 %b) {
 ; CHECK-SD-LABEL: si8:
@@ -216,21 +201,37 @@ entry:
 }
 
 define <3 x i8> @sv3i8(<3 x i8> %d, <3 x i8> %e) {
-; CHECK-LABEL: sv3i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxtb w8, w3
-; CHECK-NEXT:    sxtb w9, w0
-; CHECK-NEXT:    sxtb w11, w4
-; CHECK-NEXT:    sxtb w12, w1
-; CHECK-NEXT:    sxtb w14, w5
-; CHECK-NEXT:    sxtb w15, w2
-; CHECK-NEXT:    sdiv w10, w9, w8
-; CHECK-NEXT:    sdiv w13, w12, w11
-; CHECK-NEXT:    msub w0, w10, w8, w9
-; CHECK-NEXT:    sdiv w16, w15, w14
-; CHECK-NEXT:    msub w1, w13, w11, w12
-; CHECK-NEXT:    msub w2, w16, w14, w15
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sv3i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxtb w8, w3
+; CHECK-SD-NEXT:    sxtb w9, w0
+; CHECK-SD-NEXT:    sxtb w11, w4
+; CHECK-SD-NEXT:    sxtb w12, w1
+; CHECK-SD-NEXT:    sxtb w14, w5
+; CHECK-SD-NEXT:    sxtb w15, w2
+; CHECK-SD-NEXT:    sdiv w10, w9, w8
+; CHECK-SD-NEXT:    sdiv w13, w12, w11
+; CHECK-SD-NEXT:    msub w0, w10, w8, w9
+; CHECK-SD-NEXT:    sdiv w16, w15, w14
+; CHECK-SD-NEXT:    msub w1, w13, w11, w12
+; CHECK-SD-NEXT:    msub w2, w16, w14, w15
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    sxtb w9, w3
+; CHECK-GI-NEXT:    sxtb w11, w1
+; CHECK-GI-NEXT:    sxtb w12, w4
+; CHECK-GI-NEXT:    sxtb w14, w2
+; CHECK-GI-NEXT:    sxtb w15, w5
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w13, w11, w12
+; CHECK-GI-NEXT:    msub w0, w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w16, w14, w15
+; CHECK-GI-NEXT:    msub w1, w13, w12, w11
+; CHECK-GI-NEXT:    msub w2, w16, w15, w14
+; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <3 x i8> %d, %e
   ret <3 x i8> %s
@@ -1123,21 +1124,37 @@ entry:
 }
 
 define <3 x i8> @uv3i8(<3 x i8> %d, <3 x i8> %e) {
-; CHECK-LABEL: uv3i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w3, #0xff
-; CHECK-NEXT:    and w9, w0, #0xff
-; CHECK-NEXT:    and w11, w4, #0xff
-; CHECK-NEXT:    and w12, w1, #0xff
-; CHECK-NEXT:    and w14, w5, #0xff
-; CHECK-NEXT:    and w15, w2, #0xff
-; CHECK-NEXT:    udiv w10, w9, w8
-; CHECK-NEXT:    udiv w13, w12, w11
-; CHECK-NEXT:    msub w0, w10, w8, w9
-; CHECK-NEXT:    udiv w16, w15, w14
-; CHECK-NEXT:    msub w1, w13, w11, w12
-; CHECK-NEXT:    msub w2, w16, w14, w15
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uv3i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    and w8, w3, #0xff
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    and w11, w4, #0xff
+; CHECK-SD-NEXT:    and w12, w1, #0xff
+; CHECK-SD-NEXT:    and w14, w5, #0xff
+; CHECK-SD-NEXT:    and w15, w2, #0xff
+; CHECK-SD-NEXT:    udiv w10, w9, w8
+; CHECK-SD-NEXT:    udiv w13, w12, w11
+; CHECK-SD-NEXT:    msub w0, w10, w8, w9
+; CHECK-SD-NEXT:    udiv w16, w15, w14
+; CHECK-SD-NEXT:    msub w1, w13, w11, w12
+; CHECK-SD-NEXT:    msub w2, w16, w14, w15
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    and w9, w3, #0xff
+; CHECK-GI-NEXT:    and w11, w1, #0xff
+; CHECK-GI-NEXT:    and w12, w4, #0xff
+; CHECK-GI-NEXT:    and w14, w2, #0xff
+; CHECK-GI-NEXT:    and w15, w5, #0xff
+; CHECK-GI-NEXT:    udiv w10, w8, w9
+; CHECK-GI-NEXT:    udiv w13, w11, w12
+; CHECK-GI-NEXT:    msub w0, w10, w9, w8
+; CHECK-GI-NEXT:    udiv w16, w14, w15
+; CHECK-GI-NEXT:    msub w1, w13, w12, w11
+; CHECK-GI-NEXT:    msub w2, w16, w15, w14
+; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <3 x i8> %d, %e
   ret <3 x i8> %s
@@ -2031,27 +2048,51 @@ entry:
 }
 
 define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) {
-; CHECK-LABEL: sv3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    smov w11, v1.h[0]
-; CHECK-NEXT:    smov w12, v0.h[0]
-; CHECK-NEXT:    smov w8, v1.h[1]
-; CHECK-NEXT:    smov w9, v0.h[1]
-; CHECK-NEXT:    smov w14, v1.h[2]
-; CHECK-NEXT:    smov w15, v0.h[2]
-; CHECK-NEXT:    sdiv w13, w12, w11
-; CHECK-NEXT:    sdiv w10, w9, w8
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    fmov s0, w11
-; CHECK-NEXT:    sdiv w16, w15, w14
-; CHECK-NEXT:    msub w8, w10, w8, w9
-; CHECK-NEXT:    mov v0.h[1], w8
-; CHECK-NEXT:    msub w8, w16, w14, w15
-; CHECK-NEXT:    mov v0.h[2], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sv3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    smov w11, v1.h[0]
+; CHECK-SD-NEXT:    smov w12, v0.h[0]
+; CHECK-SD-NEXT:    smov w8, v1.h[1]
+; CHECK-SD-NEXT:    smov w9, v0.h[1]
+; CHECK-SD-NEXT:    smov w14, v1.h[2]
+; CHECK-SD-NEXT:    smov w15, v0.h[2]
+; CHECK-SD-NEXT:    sdiv w13, w12, w11
+; CHECK-SD-NEXT:    sdiv w10, w9, w8
+; CHECK-SD-NEXT:    msub w11, w13, w11, w12
+; CHECK-SD-NEXT:    fmov s0, w11
+; CHECK-SD-NEXT:    sdiv w16, w15, w14
+; CHECK-SD-NEXT:    msub w8, w10, w8, w9
+; CHECK-SD-NEXT:    mov v0.h[1], w8
+; CHECK-SD-NEXT:    msub w8, w16, w14, w15
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    smov w8, v0.h[0]
+; CHECK-GI-NEXT:    smov w9, v1.h[0]
+; CHECK-GI-NEXT:    smov w11, v0.h[1]
+; CHECK-GI-NEXT:    smov w12, v1.h[1]
+; CHECK-GI-NEXT:    smov w14, v0.h[2]
+; CHECK-GI-NEXT:    smov w15, v1.h[2]
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w13, w11, w12
+; CHECK-GI-NEXT:    msub w8, w10, w9, w8
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    sdiv w16, w14, w15
+; CHECK-GI-NEXT:    msub w9, w13, w12, w11
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    msub w8, w16, w15, w14
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <3 x i16> %d, %e
   ret <3 x i16> %s
@@ -2472,32 +2513,56 @@ entry:
 }
 
 define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) {
-; CHECK-LABEL: uv3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w11, v1.h[0]
-; CHECK-NEXT:    umov w12, v0.h[0]
-; CHECK-NEXT:    umov w8, v1.h[1]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    umov w13, v0.h[2]
-; CHECK-NEXT:    umov w14, v1.h[0]
-; CHECK-NEXT:    umov w16, v0.h[0]
-; CHECK-NEXT:    udiv w11, w12, w11
-; CHECK-NEXT:    umov w12, v1.h[2]
-; CHECK-NEXT:    udiv w10, w9, w8
-; CHECK-NEXT:    msub w11, w11, w14, w16
-; CHECK-NEXT:    udiv w15, w13, w12
-; CHECK-NEXT:    msub w8, w10, w8, w9
-; CHECK-NEXT:    sxth w9, w11
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    sxth w8, w8
-; CHECK-NEXT:    mov v0.h[1], w8
-; CHECK-NEXT:    msub w10, w15, w12, w13
-; CHECK-NEXT:    sxth w8, w10
-; CHECK-NEXT:    mov v0.h[2], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uv3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    umov w11, v1.h[0]
+; CHECK-SD-NEXT:    umov w12, v0.h[0]
+; CHECK-SD-NEXT:    umov w8, v1.h[1]
+; CHECK-SD-NEXT:    umov w9, v0.h[1]
+; CHECK-SD-NEXT:    umov w13, v0.h[2]
+; CHECK-SD-NEXT:    umov w14, v1.h[0]
+; CHECK-SD-NEXT:    umov w16, v0.h[0]
+; CHECK-SD-NEXT:    udiv w11, w12, w11
+; CHECK-SD-NEXT:    umov w12, v1.h[2]
+; CHECK-SD-NEXT:    udiv w10, w9, w8
+; CHECK-SD-NEXT:    msub w11, w11, w14, w16
+; CHECK-SD-NEXT:    udiv w15, w13, w12
+; CHECK-SD-NEXT:    msub w8, w10, w8, w9
+; CHECK-SD-NEXT:    sxth w9, w11
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    sxth w8, w8
+; CHECK-SD-NEXT:    mov v0.h[1], w8
+; CHECK-SD-NEXT:    msub w10, w15, w12, w13
+; CHECK-SD-NEXT:    sxth w8, w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    umov w8, v0.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[0]
+; CHECK-GI-NEXT:    umov w11, v0.h[1]
+; CHECK-GI-NEXT:    umov w12, v1.h[1]
+; CHECK-GI-NEXT:    umov w14, v0.h[2]
+; CHECK-GI-NEXT:    umov w15, v1.h[2]
+; CHECK-GI-NEXT:    udiv w10, w8, w9
+; CHECK-GI-NEXT:    udiv w13, w11, w12
+; CHECK-GI-NEXT:    msub w8, w10, w9, w8
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    udiv w16, w14, w15
+; CHECK-GI-NEXT:    msub w9, w13, w12, w11
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    msub w8, w16, w15, w14
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <3 x i16> %d, %e
   ret <3 x i16> %s
@@ -2916,24 +2981,47 @@ entry:
 }
 
 define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: sv3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov w11, s1
-; CHECK-NEXT:    fmov w12, s0
-; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    mov w9, v0.s[1]
-; CHECK-NEXT:    mov w14, v1.s[2]
-; CHECK-NEXT:    mov w15, v0.s[2]
-; CHECK-NEXT:    sdiv w13, w12, w11
-; CHECK-NEXT:    sdiv w10, w9, w8
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    fmov s0, w11
-; CHECK-NEXT:    sdiv w16, w15, w14
-; CHECK-NEXT:    msub w8, w10, w8, w9
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    msub w8, w16, w14, w15
-; CHECK-NEXT:    mov v0.s[2], w8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sv3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov w11, s1
+; CHECK-SD-NEXT:    fmov w12, s0
+; CHECK-SD-NEXT:    mov w8, v1.s[1]
+; CHECK-SD-NEXT:    mov w9, v0.s[1]
+; CHECK-SD-NEXT:    mov w14, v1.s[2]
+; CHECK-SD-NEXT:    mov w15, v0.s[2]
+; CHECK-SD-NEXT:    sdiv w13, w12, w11
+; CHECK-SD-NEXT:    sdiv w10, w9, w8
+; CHECK-SD-NEXT:    msub w11, w13, w11, w12
+; CHECK-SD-NEXT:    fmov s0, w11
+; CHECK-SD-NEXT:    sdiv w16, w15, w14
+; CHECK-SD-NEXT:    msub w8, w10, w8, w9
+; CHECK-SD-NEXT:    mov v0.s[1], w8
+; CHECK-SD-NEXT:    msub w8, w16, w14, w15
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    mov s1, v1.s[2]
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    fmov w11, s2
+; CHECK-GI-NEXT:    fmov w12, s3
+; CHECK-GI-NEXT:    fmov w14, s0
+; CHECK-GI-NEXT:    fmov w15, s1
+; CHECK-GI-NEXT:    sdiv w13, w11, w12
+; CHECK-GI-NEXT:    msub w8, w10, w9, w8
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    sdiv w16, w14, w15
+; CHECK-GI-NEXT:    msub w9, w13, w12, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    msub w8, w16, w15, w14
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <3 x i32> %d, %e
   ret <3 x i32> %s
@@ -3124,24 +3212,47 @@ entry:
 }
 
 define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: uv3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov w11, s1
-; CHECK-NEXT:    fmov w12, s0
-; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    mov w9, v0.s[1]
-; CHECK-NEXT:    mov w14, v1.s[2]
-; CHECK-NEXT:    mov w15, v0.s[2]
-; CHECK-NEXT:    udiv w13, w12, w11
-; CHECK-NEXT:    udiv w10, w9, w8
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    fmov s0, w11
-; CHECK-NEXT:    udiv w16, w15, w14
-; CHECK-NEXT:    msub w8, w10, w8, w9
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    msub w8, w16, w14, w15
-; CHECK-NEXT:    mov v0.s[2], w8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uv3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov w11, s1
+; CHECK-SD-NEXT:    fmov w12, s0
+; CHECK-SD-NEXT:    mov w8, v1.s[1]
+; CHECK-SD-NEXT:    mov w9, v0.s[1]
+; CHECK-SD-NEXT:    mov w14, v1.s[2]
+; CHECK-SD-NEXT:    mov w15, v0.s[2]
+; CHECK-SD-NEXT:    udiv w13, w12, w11
+; CHECK-SD-NEXT:    udiv w10, w9, w8
+; CHECK-SD-NEXT:    msub w11, w13, w11, w12
+; CHECK-SD-NEXT:    fmov s0, w11
+; CHECK-SD-NEXT:    udiv w16, w15, w14
+; CHECK-SD-NEXT:    msub w8, w10, w8, w9
+; CHECK-SD-NEXT:    mov v0.s[1], w8
+; CHECK-SD-NEXT:    msub w8, w16, w14, w15
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    mov s1, v1.s[2]
+; CHECK-GI-NEXT:    udiv w10, w8, w9
+; CHECK-GI-NEXT:    fmov w11, s2
+; CHECK-GI-NEXT:    fmov w12, s3
+; CHECK-GI-NEXT:    fmov w14, s0
+; CHECK-GI-NEXT:    fmov w15, s1
+; CHECK-GI-NEXT:    udiv w13, w11, w12
+; CHECK-GI-NEXT:    msub w8, w10, w9, w8
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    udiv w16, w14, w15
+; CHECK-GI-NEXT:    msub w9, w13, w12, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    msub w8, w16, w15, w14
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <3 x i32> %d, %e
   ret <3 x i32> %s
@@ -3624,360 +3735,692 @@ entry:
 }
 
 define <2 x i128> @sv2i128(<2 x i128> %d, <2 x i128> %e) {
-; CHECK-LABEL: sv2i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w23, -40
-; CHECK-NEXT:    .cfi_offset w24, -48
-; CHECK-NEXT:    .cfi_offset w30, -64
-; CHECK-NEXT:    mov x21, x3
-; CHECK-NEXT:    mov x22, x2
-; CHECK-NEXT:    mov x2, x4
-; CHECK-NEXT:    mov x3, x5
-; CHECK-NEXT:    mov x19, x7
-; CHECK-NEXT:    mov x20, x6
-; CHECK-NEXT:    bl __modti3
-; CHECK-NEXT:    mov x23, x0
-; CHECK-NEXT:    mov x24, x1
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    mov x2, x20
-; CHECK-NEXT:    mov x3, x19
-; CHECK-NEXT:    bl __modti3
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    mov x3, x1
-; CHECK-NEXT:    mov x0, x23
-; CHECK-NEXT:    mov x1, x24
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sv2i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w30, -64
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    mov x2, x4
+; CHECK-SD-NEXT:    mov x3, x5
+; CHECK-SD-NEXT:    mov x19, x7
+; CHECK-SD-NEXT:    mov x20, x6
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov x2, x20
+; CHECK-SD-NEXT:    mov x3, x19
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x23
+; CHECK-SD-NEXT:    mov x1, x24
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w30, -64
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x2, x4
+; CHECK-GI-NEXT:    mov x3, x5
+; CHECK-GI-NEXT:    mov x21, x6
+; CHECK-GI-NEXT:    mov x22, x7
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x23, x0
+; CHECK-GI-NEXT:    mov x24, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov x2, x21
+; CHECK-GI-NEXT:    mov x3, x22
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x2, x0
+; CHECK-GI-NEXT:    mov x3, x1
+; CHECK-GI-NEXT:    mov x0, x23
+; CHECK-GI-NEXT:    mov x1, x24
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <2 x i128> %d, %e
   ret <2 x i128> %s
 }
 
 define <3 x i128> @sv3i128(<3 x i128> %d, <3 x i128> %e) {
-; CHECK-LABEL: sv3i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w23, -40
-; CHECK-NEXT:    .cfi_offset w24, -48
-; CHECK-NEXT:    .cfi_offset w25, -56
-; CHECK-NEXT:    .cfi_offset w26, -64
-; CHECK-NEXT:    .cfi_offset w27, -72
-; CHECK-NEXT:    .cfi_offset w28, -80
-; CHECK-NEXT:    .cfi_offset w30, -96
-; CHECK-NEXT:    ldp x23, x24, [sp, #112]
-; CHECK-NEXT:    mov x21, x3
-; CHECK-NEXT:    ldp x25, x26, [sp, #96]
-; CHECK-NEXT:    mov x22, x2
-; CHECK-NEXT:    mov x2, x6
-; CHECK-NEXT:    mov x3, x7
-; CHECK-NEXT:    mov x19, x5
-; CHECK-NEXT:    mov x20, x4
-; CHECK-NEXT:    bl __modti3
-; CHECK-NEXT:    mov x27, x0
-; CHECK-NEXT:    mov x28, x1
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    mov x2, x25
-; CHECK-NEXT:    mov x3, x26
-; CHECK-NEXT:    bl __modti3
-; CHECK-NEXT:    mov x21, x0
-; CHECK-NEXT:    mov x22, x1
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    mov x2, x23
-; CHECK-NEXT:    mov x3, x24
-; CHECK-NEXT:    bl __modti3
-; CHECK-NEXT:    mov x4, x0
-; CHECK-NEXT:    mov x5, x1
-; CHECK-NEXT:    mov x0, x27
-; CHECK-NEXT:    mov x1, x28
-; CHECK-NEXT:    mov x2, x21
-; CHECK-NEXT:    mov x3, x22
-; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sv3i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w25, -56
+; CHECK-SD-NEXT:    .cfi_offset w26, -64
+; CHECK-SD-NEXT:    .cfi_offset w27, -72
+; CHECK-SD-NEXT:    .cfi_offset w28, -80
+; CHECK-SD-NEXT:    .cfi_offset w30, -96
+; CHECK-SD-NEXT:    ldp x23, x24, [sp, #112]
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    ldp x25, x26, [sp, #96]
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    mov x2, x6
+; CHECK-SD-NEXT:    mov x3, x7
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x27, x0
+; CHECK-SD-NEXT:    mov x28, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov x2, x25
+; CHECK-SD-NEXT:    mov x3, x26
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov x2, x23
+; CHECK-SD-NEXT:    mov x3, x24
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x27
+; CHECK-SD-NEXT:    mov x1, x28
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w27, -72
+; CHECK-GI-NEXT:    .cfi_offset w28, -80
+; CHECK-GI-NEXT:    .cfi_offset w30, -96
+; CHECK-GI-NEXT:    ldp x23, x24, [sp, #96]
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    ldp x25, x26, [sp, #112]
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x2, x6
+; CHECK-GI-NEXT:    mov x3, x7
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x27, x0
+; CHECK-GI-NEXT:    mov x28, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov x2, x23
+; CHECK-GI-NEXT:    mov x3, x24
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    mov x2, x25
+; CHECK-GI-NEXT:    mov x3, x26
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x4, x0
+; CHECK-GI-NEXT:    mov x5, x1
+; CHECK-GI-NEXT:    mov x0, x27
+; CHECK-GI-NEXT:    mov x1, x28
+; CHECK-GI-NEXT:    mov x2, x19
+; CHECK-GI-NEXT:    mov x3, x20
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <3 x i128> %d, %e
   ret <3 x i128> %s
 }
 
 define <4 x i128> @sv4i128(<4 x i128> %d, <4 x i128> %e) {
-; CHECK-LABEL: sv4i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #128
-; CHECK-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w23, -40
-; CHECK-NEXT:    .cfi_offset w24, -48
-; CHECK-NEXT:    .cfi_offset w25, -56
-; CHECK-NEXT:    .cfi_offset w26, -64
-; CHECK-NEXT:    .cfi_offset w27, -72
-; CHECK-NEXT:    .cfi_offset w28, -80
-; CHECK-NEXT:    .cfi_offset w30, -88
-; CHECK-NEXT:    .cfi_offset w29, -96
-; CHECK-NEXT:    mov x23, x3
-; CHECK-NEXT:    mov x24, x2
-; CHECK-NEXT:    stp x6, x7, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldp x8, x26, [sp, #176]
-; CHECK-NEXT:    mov x21, x5
-; CHECK-NEXT:    ldp x2, x3, [sp, #128]
-; CHECK-NEXT:    mov x22, x4
-; CHECK-NEXT:    ldp x27, x28, [sp, #160]
-; CHECK-NEXT:    ldp x29, x19, [sp, #144]
-; CHECK-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
-; CHECK-NEXT:    bl __modti3
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    mov x25, x1
-; CHECK-NEXT:    mov x0, x24
-; CHECK-NEXT:    mov x1, x23
-; CHECK-NEXT:    mov x2, x29
-; CHECK-NEXT:    mov x3, x19
-; CHECK-NEXT:    bl __modti3
-; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    mov x23, x1
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    mov x2, x27
-; CHECK-NEXT:    mov x3, x28
-; CHECK-NEXT:    bl __modti3
-; CHECK-NEXT:    mov x21, x0
-; CHECK-NEXT:    mov x22, x1
-; CHECK-NEXT:    ldr x2, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x0, x1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x3, x26
-; CHECK-NEXT:    bl __modti3
-; CHECK-NEXT:    mov x6, x0
-; CHECK-NEXT:    mov x7, x1
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x25
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x23
-; CHECK-NEXT:    mov x4, x21
-; CHECK-NEXT:    mov x5, x22
-; CHECK-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #128
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sv4i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #128
+; CHECK-SD-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w25, -56
+; CHECK-SD-NEXT:    .cfi_offset w26, -64
+; CHECK-SD-NEXT:    .cfi_offset w27, -72
+; CHECK-SD-NEXT:    .cfi_offset w28, -80
+; CHECK-SD-NEXT:    .cfi_offset w30, -88
+; CHECK-SD-NEXT:    .cfi_offset w29, -96
+; CHECK-SD-NEXT:    mov x23, x3
+; CHECK-SD-NEXT:    mov x24, x2
+; CHECK-SD-NEXT:    stp x6, x7, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp x8, x26, [sp, #176]
+; CHECK-SD-NEXT:    mov x21, x5
+; CHECK-SD-NEXT:    ldp x2, x3, [sp, #128]
+; CHECK-SD-NEXT:    mov x22, x4
+; CHECK-SD-NEXT:    ldp x27, x28, [sp, #160]
+; CHECK-SD-NEXT:    ldp x29, x19, [sp, #144]
+; CHECK-SD-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x20, x0
+; CHECK-SD-NEXT:    mov x25, x1
+; CHECK-SD-NEXT:    mov x0, x24
+; CHECK-SD-NEXT:    mov x1, x23
+; CHECK-SD-NEXT:    mov x2, x29
+; CHECK-SD-NEXT:    mov x3, x19
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x19, x0
+; CHECK-SD-NEXT:    mov x23, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov x2, x27
+; CHECK-SD-NEXT:    mov x3, x28
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    ldr x2, [sp, #8] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x0, x1, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov x3, x26
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x6, x0
+; CHECK-SD-NEXT:    mov x7, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x25
+; CHECK-SD-NEXT:    mov x2, x19
+; CHECK-SD-NEXT:    mov x3, x23
+; CHECK-SD-NEXT:    mov x4, x21
+; CHECK-SD-NEXT:    mov x5, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #128
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #128
+; CHECK-GI-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w27, -72
+; CHECK-GI-NEXT:    .cfi_offset w28, -80
+; CHECK-GI-NEXT:    .cfi_offset w30, -88
+; CHECK-GI-NEXT:    .cfi_offset w29, -96
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    ldp x2, x3, [sp, #128]
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    ldp x9, x8, [sp, #176]
+; CHECK-GI-NEXT:    mov x23, x7
+; CHECK-GI-NEXT:    ldp x24, x25, [sp, #144]
+; CHECK-GI-NEXT:    ldp x26, x27, [sp, #160]
+; CHECK-GI-NEXT:    stp x9, x6, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x28, x0
+; CHECK-GI-NEXT:    mov x29, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov x2, x24
+; CHECK-GI-NEXT:    mov x3, x25
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    mov x2, x26
+; CHECK-GI-NEXT:    mov x3, x27
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    ldp x2, x0, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x3, [sp, #8] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x1, x23
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x6, x0
+; CHECK-GI-NEXT:    mov x7, x1
+; CHECK-GI-NEXT:    mov x0, x28
+; CHECK-GI-NEXT:    mov x1, x29
+; CHECK-GI-NEXT:    mov x2, x19
+; CHECK-GI-NEXT:    mov x3, x20
+; CHECK-GI-NEXT:    mov x4, x21
+; CHECK-GI-NEXT:    mov x5, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #128
+; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <4 x i128> %d, %e
   ret <4 x i128> %s
 }
 
 define <2 x i128> @uv2i128(<2 x i128> %d, <2 x i128> %e) {
-; CHECK-LABEL: uv2i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w23, -40
-; CHECK-NEXT:    .cfi_offset w24, -48
-; CHECK-NEXT:    .cfi_offset w30, -64
-; CHECK-NEXT:    mov x21, x3
-; CHECK-NEXT:    mov x22, x2
-; CHECK-NEXT:    mov x2, x4
-; CHECK-NEXT:    mov x3, x5
-; CHECK-NEXT:    mov x19, x7
-; CHECK-NEXT:    mov x20, x6
-; CHECK-NEXT:    bl __umodti3
-; CHECK-NEXT:    mov x23, x0
-; CHECK-NEXT:    mov x24, x1
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    mov x2, x20
-; CHECK-NEXT:    mov x3, x19
-; CHECK-NEXT:    bl __umodti3
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    mov x3, x1
-; CHECK-NEXT:    mov x0, x23
-; CHECK-NEXT:    mov x1, x24
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uv2i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w30, -64
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    mov x2, x4
+; CHECK-SD-NEXT:    mov x3, x5
+; CHECK-SD-NEXT:    mov x19, x7
+; CHECK-SD-NEXT:    mov x20, x6
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov x2, x20
+; CHECK-SD-NEXT:    mov x3, x19
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x23
+; CHECK-SD-NEXT:    mov x1, x24
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w30, -64
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x2, x4
+; CHECK-GI-NEXT:    mov x3, x5
+; CHECK-GI-NEXT:    mov x21, x6
+; CHECK-GI-NEXT:    mov x22, x7
+; CHECK-GI-NEXT:    bl __umodti3
+; CHECK-GI-NEXT:    mov x23, x0
+; CHECK-GI-NEXT:    mov x24, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov x2, x21
+; CHECK-GI-NEXT:    mov x3, x22
+; CHECK-GI-NEXT:    bl __umodti3
+; CHECK-GI-NEXT:    mov x2, x0
+; CHECK-GI-NEXT:    mov x3, x1
+; CHECK-GI-NEXT:    mov x0, x23
+; CHECK-GI-NEXT:    mov x1, x24
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <2 x i128> %d, %e
   ret <2 x i128> %s
 }
 
 define <3 x i128> @uv3i128(<3 x i128> %d, <3 x i128> %e) {
-; CHECK-LABEL: uv3i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w23, -40
-; CHECK-NEXT:    .cfi_offset w24, -48
-; CHECK-NEXT:    .cfi_offset w25, -56
-; CHECK-NEXT:    .cfi_offset w26, -64
-; CHECK-NEXT:    .cfi_offset w27, -72
-; CHECK-NEXT:    .cfi_offset w28, -80
-; CHECK-NEXT:    .cfi_offset w30, -96
-; CHECK-NEXT:    ldp x23, x24, [sp, #112]
-; CHECK-NEXT:    mov x21, x3
-; CHECK-NEXT:    ldp x25, x26, [sp, #96]
-; CHECK-NEXT:    mov x22, x2
-; CHECK-NEXT:    mov x2, x6
-; CHECK-NEXT:    mov x3, x7
-; CHECK-NEXT:    mov x19, x5
-; CHECK-NEXT:    mov x20, x4
-; CHECK-NEXT:    bl __umodti3
-; CHECK-NEXT:    mov x27, x0
-; CHECK-NEXT:    mov x28, x1
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    mov x2, x25
-; CHECK-NEXT:    mov x3, x26
-; CHECK-NEXT:    bl __umodti3
-; CHECK-NEXT:    mov x21, x0
-; CHECK-NEXT:    mov x22, x1
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x19
-; CHECK-NEXT:    mov x2, x23
-; CHECK-NEXT:    mov x3, x24
-; CHECK-NEXT:    bl __umodti3
-; CHECK-NEXT:    mov x4, x0
-; CHECK-NEXT:    mov x5, x1
-; CHECK-NEXT:    mov x0, x27
-; CHECK-NEXT:    mov x1, x28
-; CHECK-NEXT:    mov x2, x21
-; CHECK-NEXT:    mov x3, x22
-; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uv3i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w25, -56
+; CHECK-SD-NEXT:    .cfi_offset w26, -64
+; CHECK-SD-NEXT:    .cfi_offset w27, -72
+; CHECK-SD-NEXT:    .cfi_offset w28, -80
+; CHECK-SD-NEXT:    .cfi_offset w30, -96
+; CHECK-SD-NEXT:    ldp x23, x24, [sp, #112]
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    ldp x25, x26, [sp, #96]
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    mov x2, x6
+; CHECK-SD-NEXT:    mov x3, x7
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x27, x0
+; CHECK-SD-NEXT:    mov x28, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov x2, x25
+; CHECK-SD-NEXT:    mov x3, x26
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov x2, x23
+; CHECK-SD-NEXT:    mov x3, x24
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x27
+; CHECK-SD-NEXT:    mov x1, x28
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w27, -72
+; CHECK-GI-NEXT:    .cfi_offset w28, -80
+; CHECK-GI-NEXT:    .cfi_offset w30, -96
+; CHECK-GI-NEXT:    ldp x23, x24, [sp, #96]
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    ldp x25, x26, [sp, #112]
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x2, x6
+; CHECK-GI-NEXT:    mov x3, x7
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __umodti3
+; CHECK-GI-NEXT:    mov x27, x0
+; CHECK-GI-NEXT:    mov x28, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov x2, x23
+; CHECK-GI-NEXT:    mov x3, x24
+; CHECK-GI-NEXT:    bl __umodti3
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    mov x2, x25
+; CHECK-GI-NEXT:    mov x3, x26
+; CHECK-GI-NEXT:    bl __umodti3
+; CHECK-GI-NEXT:    mov x4, x0
+; CHECK-GI-NEXT:    mov x5, x1
+; CHECK-GI-NEXT:    mov x0, x27
+; CHECK-GI-NEXT:    mov x1, x28
+; CHECK-GI-NEXT:    mov x2, x19
+; CHECK-GI-NEXT:    mov x3, x20
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <3 x i128> %d, %e
   ret <3 x i128> %s
 }
 
 define <4 x i128> @uv4i128(<4 x i128> %d, <4 x i128> %e) {
-; CHECK-LABEL: uv4i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #128
-; CHECK-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w23, -40
-; CHECK-NEXT:    .cfi_offset w24, -48
-; CHECK-NEXT:    .cfi_offset w25, -56
-; CHECK-NEXT:    .cfi_offset w26, -64
-; CHECK-NEXT:    .cfi_offset w27, -72
-; CHECK-NEXT:    .cfi_offset w28, -80
-; CHECK-NEXT:    .cfi_offset w30, -88
-; CHECK-NEXT:    .cfi_offset w29, -96
-; CHECK-NEXT:    mov x23, x3
-; CHECK-NEXT:    mov x24, x2
-; CHECK-NEXT:    stp x6, x7, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldp x8, x26, [sp, #176]
-; CHECK-NEXT:    mov x21, x5
-; CHECK-NEXT:    ldp x2, x3, [sp, #128]
-; CHECK-NEXT:    mov x22, x4
-; CHECK-NEXT:    ldp x27, x28, [sp, #160]
-; CHECK-NEXT:    ldp x29, x19, [sp, #144]
-; CHECK-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
-; CHECK-NEXT:    bl __umodti3
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    mov x25, x1
-; CHECK-NEXT:    mov x0, x24
-; CHECK-NEXT:    mov x1, x23
-; CHECK-NEXT:    mov x2, x29
-; CHECK-NEXT:    mov x3, x19
-; CHECK-NEXT:    bl __umodti3
-; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    mov x23, x1
-; CHECK-NEXT:    mov x0, x22
-; CHECK-NEXT:    mov x1, x21
-; CHECK-NEXT:    mov x2, x27
-; CHECK-NEXT:    mov x3, x28
-; CHECK-NEXT:    bl __umodti3
-; CHECK-NEXT:    mov x21, x0
-; CHECK-NEXT:    mov x22, x1
-; CHECK-NEXT:    ldr x2, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x0, x1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x3, x26
-; CHECK-NEXT:    bl __umodti3
-; CHECK-NEXT:    mov x6, x0
-; CHECK-NEXT:    mov x7, x1
-; CHECK-NEXT:    mov x0, x20
-; CHECK-NEXT:    mov x1, x25
-; CHECK-NEXT:    mov x2, x19
-; CHECK-NEXT:    mov x3, x23
-; CHECK-NEXT:    mov x4, x21
-; CHECK-NEXT:    mov x5, x22
-; CHECK-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #128
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uv4i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #128
+; CHECK-SD-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w25, -56
+; CHECK-SD-NEXT:    .cfi_offset w26, -64
+; CHECK-SD-NEXT:    .cfi_offset w27, -72
+; CHECK-SD-NEXT:    .cfi_offset w28, -80
+; CHECK-SD-NEXT:    .cfi_offset w30, -88
+; CHECK-SD-NEXT:    .cfi_offset w29, -96
+; CHECK-SD-NEXT:    mov x23, x3
+; CHECK-SD-NEXT:    mov x24, x2
+; CHECK-SD-NEXT:    stp x6, x7, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp x8, x26, [sp, #176]
+; CHECK-SD-NEXT:    mov x21, x5
+; CHECK-SD-NEXT:    ldp x2, x3, [sp, #128]
+; CHECK-SD-NEXT:    mov x22, x4
+; CHECK-SD-NEXT:    ldp x27, x28, [sp, #160]
+; CHECK-SD-NEXT:    ldp x29, x19, [sp, #144]
+; CHECK-SD-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x20, x0
+; CHECK-SD-NEXT:    mov x25, x1
+; CHECK-SD-NEXT:    mov x0, x24
+; CHECK-SD-NEXT:    mov x1, x23
+; CHECK-SD-NEXT:    mov x2, x29
+; CHECK-SD-NEXT:    mov x3, x19
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x19, x0
+; CHECK-SD-NEXT:    mov x23, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov x2, x27
+; CHECK-SD-NEXT:    mov x3, x28
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    ldr x2, [sp, #8] // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x0, x1, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    mov x3, x26
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x6, x0
+; CHECK-SD-NEXT:    mov x7, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x25
+; CHECK-SD-NEXT:    mov x2, x19
+; CHECK-SD-NEXT:    mov x3, x23
+; CHECK-SD-NEXT:    mov x4, x21
+; CHECK-SD-NEXT:    mov x5, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add sp, sp, #128
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #128
+; CHECK-GI-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w27, -72
+; CHECK-GI-NEXT:    .cfi_offset w28, -80
+; CHECK-GI-NEXT:    .cfi_offset w30, -88
+; CHECK-GI-NEXT:    .cfi_offset w29, -96
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    ldp x2, x3, [sp, #128]
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    ldp x9, x8, [sp, #176]
+; CHECK-GI-NEXT:    mov x23, x7
+; CHECK-GI-NEXT:    ldp x24, x25, [sp, #144]
+; CHECK-GI-NEXT:    ldp x26, x27, [sp, #160]
+; CHECK-GI-NEXT:    stp x9, x6, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    bl __umodti3
+; CHECK-GI-NEXT:    mov x28, x0
+; CHECK-GI-NEXT:    mov x29, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov x2, x24
+; CHECK-GI-NEXT:    mov x3, x25
+; CHECK-GI-NEXT:    bl __umodti3
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    mov x2, x26
+; CHECK-GI-NEXT:    mov x3, x27
+; CHECK-GI-NEXT:    bl __umodti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    ldp x2, x0, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x3, [sp, #8] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x1, x23
+; CHECK-GI-NEXT:    bl __umodti3
+; CHECK-GI-NEXT:    mov x6, x0
+; CHECK-GI-NEXT:    mov x7, x1
+; CHECK-GI-NEXT:    mov x0, x28
+; CHECK-GI-NEXT:    mov x1, x29
+; CHECK-GI-NEXT:    mov x2, x19
+; CHECK-GI-NEXT:    mov x3, x20
+; CHECK-GI-NEXT:    mov x4, x21
+; CHECK-GI-NEXT:    mov x5, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add sp, sp, #128
+; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <4 x i128> %d, %e
   ret <4 x i128> %s


        


More information about the llvm-commits mailing list