[llvm] [AArch64][GlobalISel] Legalize more CTPOP vector types. (PR #131513)

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 16 05:29:25 PDT 2025


https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/131513

Similar to other operations, vectors with s8, s16, s32 and s64 elements are clamped to legal vector sizes, vectors with an odd number of elements are widened to the next power of 2, and vectors with s128 elements are scalarized.

This helps legalize cttz as well as ctpop.

>From de52d23579105c350355575f2aa3b7db299b257d Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Sun, 16 Mar 2025 12:27:56 +0000
Subject: [PATCH] [AArch64][GlobalISel] Legalize more CTPOP vector types.

Similar to other operations, vectors with s8, s16, s32 and s64 elements are
clamped to legal vector sizes, vectors with an odd number of elements are
widened to the next power of 2, and vectors with s128 elements are scalarized.

This helps legalize cttz as well as ctpop.
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   1 +
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   8 +-
 llvm/test/CodeGen/AArch64/ctpop.ll            | 461 ++++++----
 llvm/test/CodeGen/AArch64/cttz.ll             | 827 ++++++++++++------
 4 files changed, 868 insertions(+), 429 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed8bd25698c03..0b84e03e05782 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -6139,6 +6139,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case TargetOpcode::G_FCANONICALIZE:
   case TargetOpcode::G_SEXT_INREG:
   case TargetOpcode::G_ABS:
+  case TargetOpcode::G_CTPOP:
     if (TypeIdx != 0)
       return UnableToLegalize;
     Observer.changingInstr(MI);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 0da3c73b6926d..f56c4fc0373b7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -323,7 +323,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampScalar(0, s32, s128)
       .widenScalarToNextPow2(0)
       .minScalarEltSameAsIf(always, 1, 0)
-      .maxScalarEltSameAsIf(always, 1, 0);
+      .maxScalarEltSameAsIf(always, 1, 0)
+      .clampNumElements(0, v8s8, v16s8)
+      .clampNumElements(0, v4s16, v8s16)
+      .clampNumElements(0, v2s32, v4s32)
+      .clampNumElements(0, v2s64, v2s64)
+      .moreElementsToNextPow2(0)
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
 
   getActionDefinitionsBuilder(G_CTLZ)
       .legalForCartesianProduct(
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index 785a447123b5e..465f8f3a9be4c 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -1,37 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i128
 
 define void @v2i8(ptr %p1) {
-; CHECK-LABEL: v2i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    ldrb w9, [x0, #1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlp v0.4h, v0.8b
-; CHECK-NEXT:    uaddlp v0.2s, v0.4h
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strb w9, [x0]
-; CHECK-NEXT:    strb w8, [x0, #1]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldrb w8, [x0]
+; CHECK-SD-NEXT:    ldrb w9, [x0, #1]
+; CHECK-SD-NEXT:    fmov s0, w8
+; CHECK-SD-NEXT:    mov v0.s[1], w9
+; CHECK-SD-NEXT:    cnt v0.8b, v0.8b
+; CHECK-SD-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strb w9, [x0]
+; CHECK-SD-NEXT:    strb w8, [x0, #1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #1]
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    mov v0.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <2 x i8>, ptr %p1
   %s = call <2 x i8> @llvm.ctpop(<2 x i8> %d)
@@ -40,23 +38,39 @@ entry:
 }
 
 define void @v3i8(ptr %p1) {
-; CHECK-LABEL: v3i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    zip1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlp v0.4h, v0.8b
-; CHECK-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-NEXT:    umov w8, v0.h[2]
-; CHECK-NEXT:    str s1, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #12]
-; CHECK-NEXT:    strb w8, [x0, #2]
-; CHECK-NEXT:    strh w9, [x0]
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    cnt v0.8b, v0.8b
+; CHECK-SD-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    umov w8, v0.h[2]
+; CHECK-SD-NEXT:    str s1, [sp, #12]
+; CHECK-SD-NEXT:    ldrh w9, [sp, #12]
+; CHECK-SD-NEXT:    strb w8, [x0, #2]
+; CHECK-SD-NEXT:    strh w9, [x0]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #1]
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    add x9, x0, #2
+; CHECK-GI-NEXT:    mov v0.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
+; CHECK-GI-NEXT:    st1 { v0.b }[2], [x9]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <3 x i8>, ptr %p1
   %s = call <3 x i8> @llvm.ctpop(<3 x i8> %d)
@@ -65,15 +79,31 @@ entry:
 }
 
 define void @v4i8(ptr %p1) {
-; CHECK-LABEL: v4i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlp v0.4h, v0.8b
-; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    cnt v0.8b, v0.8b
+; CHECK-SD-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT:    cnt v0.8b, v2.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    str w8, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <4 x i8>, ptr %p1
   %s = call <4 x i8> @llvm.ctpop(<4 x i8> %d)
@@ -113,20 +143,31 @@ entry:
 }
 
 define void @v2i16(ptr %p1) {
-; CHECK-LABEL: v2i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x0, #2]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlp v0.4h, v0.8b
-; CHECK-NEXT:    uaddlp v0.2s, v0.4h
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [x0]
-; CHECK-NEXT:    strh w8, [x0, #2]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldrh w8, [x0]
+; CHECK-SD-NEXT:    ldrh w9, [x0, #2]
+; CHECK-SD-NEXT:    fmov s0, w8
+; CHECK-SD-NEXT:    mov v0.s[1], w9
+; CHECK-SD-NEXT:    cnt v0.8b, v0.8b
+; CHECK-SD-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [x0]
+; CHECK-SD-NEXT:    strh w8, [x0, #2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <2 x i16>, ptr %p1
   %s = call <2 x i16> @llvm.ctpop(<2 x i16> %d)
@@ -135,15 +176,29 @@ entry:
 }
 
 define void @v3i16(ptr %p1) {
-; CHECK-LABEL: v3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    add x8, x0, #4
-; CHECK-NEXT:    cnt v0.8b, v0.8b
-; CHECK-NEXT:    uaddlp v0.4h, v0.8b
-; CHECK-NEXT:    st1 { v0.h }[2], [x8]
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    add x8, x0, #4
+; CHECK-SD-NEXT:    cnt v0.8b, v0.8b
+; CHECK-SD-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-SD-NEXT:    st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x0, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <3 x i16>, ptr %p1
   %s = call <3 x i16> @llvm.ctpop(<3 x i16> %d)
@@ -251,25 +306,42 @@ entry:
 }
 
 define <3 x i64> @v3i64(<3 x i64> %d) {
-; CHECK-LABEL: v3i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    cnt v1.16b, v2.16b
-; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    uaddlp v1.8h, v1.16b
-; CHECK-NEXT:    uaddlp v2.4s, v1.8h
-; CHECK-NEXT:    uaddlp v0.8h, v0.16b
-; CHECK-NEXT:    uaddlp v2.2d, v2.4s
-; CHECK-NEXT:    uaddlp v0.4s, v0.8h
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    uaddlp v0.2d, v0.4s
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    cnt v1.16b, v2.16b
+; CHECK-SD-NEXT:    cnt v0.16b, v0.16b
+; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-SD-NEXT:    uaddlp v2.4s, v1.8h
+; CHECK-SD-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-SD-NEXT:    uaddlp v2.2d, v2.4s
+; CHECK-SD-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    cnt v1.8b, v2.8b
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-GI-NEXT:    mov w8, v1.s[0]
+; CHECK-GI-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <3 x i64> @llvm.ctpop(<3 x i64> %d)
   ret <3 x i64> %s
@@ -293,85 +365,148 @@ entry:
 }
 
 define <2 x i128> @v2i128(<2 x i128> %d) {
-; CHECK-LABEL: v2i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x2
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    mov v1.d[1], x1
-; CHECK-NEXT:    mov v0.d[1], x3
-; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    mov x3, xzr
-; CHECK-NEXT:    cnt v1.16b, v1.16b
-; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    addv b1, v1.16b
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov x0, d1
-; CHECK-NEXT:    fmov x2, d0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov d0, x2
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    mov v1.d[1], x1
+; CHECK-SD-NEXT:    mov v0.d[1], x3
+; CHECK-SD-NEXT:    mov x1, xzr
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    cnt v1.16b, v1.16b
+; CHECK-SD-NEXT:    cnt v0.16b, v0.16b
+; CHECK-SD-NEXT:    addv b1, v1.16b
+; CHECK-SD-NEXT:    addv b0, v0.16b
+; CHECK-SD-NEXT:    fmov x0, d1
+; CHECK-SD-NEXT:    fmov x2, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    mov v1.d[0], x2
+; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v1.d[1], x3
+; CHECK-GI-NEXT:    mov x1, xzr
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-NEXT:    mov w0, v0.s[0]
+; CHECK-GI-NEXT:    mov w2, v1.s[0]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <2 x i128> @llvm.ctpop(<2 x i128> %d)
   ret <2 x i128> %s
 }
 
 define <3 x i128> @v3i128(<3 x i128> %d) {
-; CHECK-LABEL: v3i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x4
-; CHECK-NEXT:    fmov d1, x2
-; CHECK-NEXT:    fmov d2, x0
-; CHECK-NEXT:    mov v0.d[1], x5
-; CHECK-NEXT:    mov v1.d[1], x3
-; CHECK-NEXT:    mov v2.d[1], x1
-; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    mov x3, xzr
-; CHECK-NEXT:    mov x5, xzr
-; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    cnt v1.16b, v1.16b
-; CHECK-NEXT:    cnt v2.16b, v2.16b
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    addv b1, v1.16b
-; CHECK-NEXT:    addv b2, v2.16b
-; CHECK-NEXT:    fmov x0, d2
-; CHECK-NEXT:    fmov x2, d1
-; CHECK-NEXT:    fmov x4, d0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov d0, x4
+; CHECK-SD-NEXT:    fmov d1, x2
+; CHECK-SD-NEXT:    fmov d2, x0
+; CHECK-SD-NEXT:    mov v0.d[1], x5
+; CHECK-SD-NEXT:    mov v1.d[1], x3
+; CHECK-SD-NEXT:    mov v2.d[1], x1
+; CHECK-SD-NEXT:    mov x1, xzr
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x5, xzr
+; CHECK-SD-NEXT:    cnt v0.16b, v0.16b
+; CHECK-SD-NEXT:    cnt v1.16b, v1.16b
+; CHECK-SD-NEXT:    cnt v2.16b, v2.16b
+; CHECK-SD-NEXT:    addv b0, v0.16b
+; CHECK-SD-NEXT:    addv b1, v1.16b
+; CHECK-SD-NEXT:    addv b2, v2.16b
+; CHECK-SD-NEXT:    fmov x0, d2
+; CHECK-SD-NEXT:    fmov x2, d1
+; CHECK-SD-NEXT:    fmov x4, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    mov v1.d[0], x2
+; CHECK-GI-NEXT:    mov v2.d[0], x4
+; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v1.d[1], x3
+; CHECK-GI-NEXT:    mov v2.d[1], x5
+; CHECK-GI-NEXT:    mov x1, xzr
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    mov x5, xzr
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    cnt v2.16b, v2.16b
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-NEXT:    uaddlv h2, v2.16b
+; CHECK-GI-NEXT:    mov w0, v0.s[0]
+; CHECK-GI-NEXT:    mov w2, v1.s[0]
+; CHECK-GI-NEXT:    mov w4, v2.s[0]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <3 x i128> @llvm.ctpop(<3 x i128> %d)
   ret <3 x i128> %s
 }
 
 define <4 x i128> @v4i128(<4 x i128> %d) {
-; CHECK-LABEL: v4i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x6
-; CHECK-NEXT:    fmov d1, x4
-; CHECK-NEXT:    fmov d2, x2
-; CHECK-NEXT:    fmov d3, x0
-; CHECK-NEXT:    mov v1.d[1], x5
-; CHECK-NEXT:    mov v2.d[1], x3
-; CHECK-NEXT:    mov v0.d[1], x7
-; CHECK-NEXT:    mov v3.d[1], x1
-; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    mov x3, xzr
-; CHECK-NEXT:    mov x5, xzr
-; CHECK-NEXT:    mov x7, xzr
-; CHECK-NEXT:    cnt v1.16b, v1.16b
-; CHECK-NEXT:    cnt v2.16b, v2.16b
-; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    cnt v3.16b, v3.16b
-; CHECK-NEXT:    addv b1, v1.16b
-; CHECK-NEXT:    addv b2, v2.16b
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    addv b3, v3.16b
-; CHECK-NEXT:    fmov x2, d2
-; CHECK-NEXT:    fmov x4, d1
-; CHECK-NEXT:    fmov x6, d0
-; CHECK-NEXT:    fmov x0, d3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov d0, x6
+; CHECK-SD-NEXT:    fmov d1, x4
+; CHECK-SD-NEXT:    fmov d2, x2
+; CHECK-SD-NEXT:    fmov d3, x0
+; CHECK-SD-NEXT:    mov v1.d[1], x5
+; CHECK-SD-NEXT:    mov v2.d[1], x3
+; CHECK-SD-NEXT:    mov v0.d[1], x7
+; CHECK-SD-NEXT:    mov v3.d[1], x1
+; CHECK-SD-NEXT:    mov x1, xzr
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x5, xzr
+; CHECK-SD-NEXT:    mov x7, xzr
+; CHECK-SD-NEXT:    cnt v1.16b, v1.16b
+; CHECK-SD-NEXT:    cnt v2.16b, v2.16b
+; CHECK-SD-NEXT:    cnt v0.16b, v0.16b
+; CHECK-SD-NEXT:    cnt v3.16b, v3.16b
+; CHECK-SD-NEXT:    addv b1, v1.16b
+; CHECK-SD-NEXT:    addv b2, v2.16b
+; CHECK-SD-NEXT:    addv b0, v0.16b
+; CHECK-SD-NEXT:    addv b3, v3.16b
+; CHECK-SD-NEXT:    fmov x2, d2
+; CHECK-SD-NEXT:    fmov x4, d1
+; CHECK-SD-NEXT:    fmov x6, d0
+; CHECK-SD-NEXT:    fmov x0, d3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    mov v1.d[0], x2
+; CHECK-GI-NEXT:    mov v2.d[0], x4
+; CHECK-GI-NEXT:    mov v3.d[0], x6
+; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v1.d[1], x3
+; CHECK-GI-NEXT:    mov v2.d[1], x5
+; CHECK-GI-NEXT:    mov v3.d[1], x7
+; CHECK-GI-NEXT:    mov x1, xzr
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    mov x5, xzr
+; CHECK-GI-NEXT:    mov x7, xzr
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    cnt v2.16b, v2.16b
+; CHECK-GI-NEXT:    cnt v3.16b, v3.16b
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-NEXT:    uaddlv h2, v2.16b
+; CHECK-GI-NEXT:    uaddlv h3, v3.16b
+; CHECK-GI-NEXT:    mov w0, v0.s[0]
+; CHECK-GI-NEXT:    mov w2, v1.s[0]
+; CHECK-GI-NEXT:    mov w4, v2.s[0]
+; CHECK-GI-NEXT:    mov w6, v3.s[0]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <4 x i128> @llvm.ctpop(<4 x i128> %d)
   ret <4 x i128> %s
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index a254df229c127..5d9ca94e0720d 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -1,40 +1,42 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v2i128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3i128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v4i128
 
 define void @v2i8(ptr %p1) {
-; CHECK-LABEL: v2i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #1
-; CHECK-NEXT:    movi v1.2s, #1
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    orr v0.2s, #1, lsl #8
-; CHECK-NEXT:    sub v1.2s, v0.2s, v1.2s
-; CHECK-NEXT:    bic v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    movi v1.2s, #32
-; CHECK-NEXT:    clz v0.2s, v0.2s
-; CHECK-NEXT:    sub v0.2s, v1.2s, v0.2s
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strb w9, [x0]
-; CHECK-NEXT:    strb w8, [x0, #1]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-SD-NEXT:    add x8, x0, #1
+; CHECK-SD-NEXT:    movi v1.2s, #1
+; CHECK-SD-NEXT:    ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT:    orr v0.2s, #1, lsl #8
+; CHECK-SD-NEXT:    sub v1.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    bic v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT:    movi v1.2s, #32
+; CHECK-SD-NEXT:    clz v0.2s, v0.2s
+; CHECK-SD-NEXT:    sub v0.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strb w9, [x0]
+; CHECK-SD-NEXT:    strb w8, [x0, #1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ld1 { v1.b }[0], [x0]
+; CHECK-GI-NEXT:    ldr b2, [x0, #1]
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    bic v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <2 x i8>, ptr %p1
   %s = call <2 x i8> @llvm.cttz(<2 x i8> %d, i1 false)
@@ -43,27 +45,51 @@ entry:
 }
 
 define void @v3i8(ptr %p1) {
-; CHECK-LABEL: v3i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    movi v1.4h, #1
-; CHECK-NEXT:    zip1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    orr v0.4h, #1, lsl #8
-; CHECK-NEXT:    sub v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    bic v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    movi v1.4h, #16
-; CHECK-NEXT:    clz v0.4h, v0.4h
-; CHECK-NEXT:    sub v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-NEXT:    umov w8, v0.h[2]
-; CHECK-NEXT:    str s1, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #12]
-; CHECK-NEXT:    strb w8, [x0, #2]
-; CHECK-NEXT:    strh w9, [x0]
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    movi v1.4h, #1
+; CHECK-SD-NEXT:    zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    orr v0.4h, #1, lsl #8
+; CHECK-SD-NEXT:    sub v1.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    bic v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT:    movi v1.4h, #16
+; CHECK-SD-NEXT:    clz v0.4h, v0.4h
+; CHECK-SD-NEXT:    sub v0.4h, v1.4h, v0.4h
+; CHECK-SD-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    umov w8, v0.h[2]
+; CHECK-SD-NEXT:    str s1, [sp, #12]
+; CHECK-SD-NEXT:    ldrh w9, [sp, #12]
+; CHECK-SD-NEXT:    strb w8, [x0, #2]
+; CHECK-SD-NEXT:    strh w9, [x0]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldrb w9, [x0]
+; CHECK-GI-NEXT:    mov w8, #65535 // =0xffff
+; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    ldrb w9, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w10
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    add x9, x0, #2
+; CHECK-GI-NEXT:    eor v2.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    add v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    and v0.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
+; CHECK-GI-NEXT:    st1 { v0.b }[2], [x9]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <3 x i8>, ptr %p1
   %s = call <3 x i8> @llvm.cttz(<3 x i8> %d, i1 false)
@@ -72,20 +98,47 @@ entry:
 }
 
 define void @v4i8(ptr %p1) {
-; CHECK-LABEL: v4i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    movi v1.4h, #1
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    orr v0.4h, #1, lsl #8
-; CHECK-NEXT:    sub v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    bic v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    movi v1.4h, #16
-; CHECK-NEXT:    clz v0.4h, v0.4h
-; CHECK-NEXT:    sub v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    movi v1.4h, #1
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    orr v0.4h, #1, lsl #8
+; CHECK-SD-NEXT:    sub v1.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    bic v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT:    movi v1.4h, #16
+; CHECK-SD-NEXT:    clz v0.4h, v0.4h
+; CHECK-SD-NEXT:    sub v0.4h, v1.4h, v0.4h
+; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr w9, [x0]
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    mov v1.h[2], w8
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[3], w9
+; CHECK-GI-NEXT:    mov v1.h[3], w8
+; CHECK-GI-NEXT:    eor v2.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    and v0.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    str w8, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <4 x i8>, ptr %p1
   %s = call <4 x i8> @llvm.cttz(<4 x i8> %d, i1 false)
@@ -136,39 +189,69 @@ entry:
 }
 
 define <32 x i8> @v32i8(<32 x i8> %d) {
-; CHECK-LABEL: v32i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.16b, #1
-; CHECK-NEXT:    sub v3.16b, v0.16b, v2.16b
-; CHECK-NEXT:    sub v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    bic v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    bic v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    cnt v1.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v2.16b, #1
+; CHECK-SD-NEXT:    sub v3.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    sub v2.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    bic v0.16b, v3.16b, v0.16b
+; CHECK-SD-NEXT:    bic v1.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    cnt v0.16b, v0.16b
+; CHECK-SD-NEXT:    cnt v1.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    add v3.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    add v2.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    bic v0.16b, v3.16b, v0.16b
+; CHECK-GI-NEXT:    bic v1.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <32 x i8> @llvm.cttz(<32 x i8> %d, i1 false)
   ret <32 x i8> %s
 }
 
 define void @v2i16(ptr %p1) {
-; CHECK-LABEL: v2i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #2
-; CHECK-NEXT:    movi v1.2s, #1
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    orr v0.2s, #1, lsl #16
-; CHECK-NEXT:    sub v1.2s, v0.2s, v1.2s
-; CHECK-NEXT:    bic v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    movi v1.2s, #32
-; CHECK-NEXT:    clz v0.2s, v0.2s
-; CHECK-NEXT:    sub v0.2s, v1.2s, v0.2s
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [x0]
-; CHECK-NEXT:    strh w8, [x0, #2]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-SD-NEXT:    add x8, x0, #2
+; CHECK-SD-NEXT:    movi v1.2s, #1
+; CHECK-SD-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT:    orr v0.2s, #1, lsl #16
+; CHECK-SD-NEXT:    sub v1.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    bic v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT:    movi v1.2s, #32
+; CHECK-SD-NEXT:    clz v0.2s, v0.2s
+; CHECK-SD-NEXT:    sub v0.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [x0]
+; CHECK-SD-NEXT:    strh w8, [x0, #2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #65535 // =0xffff
+; CHECK-GI-NEXT:    ld1 { v1.h }[0], [x0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    eor v2.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    and v0.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <2 x i16>, ptr %p1
   %s = call <2 x i16> @llvm.cttz(<2 x i16> %d, i1 false)
@@ -177,19 +260,40 @@ entry:
 }
 
 define void @v3i16(ptr %p1) {
-; CHECK-LABEL: v3i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v0.4h, #1
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    add x8, x0, #4
-; CHECK-NEXT:    sub v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    bic v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    movi v1.4h, #16
-; CHECK-NEXT:    clz v0.4h, v0.4h
-; CHECK-NEXT:    sub v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    st1 { v0.h }[2], [x8]
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v0.4h, #1
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    add x8, x0, #4
+; CHECK-SD-NEXT:    sub v0.4h, v1.4h, v0.4h
+; CHECK-SD-NEXT:    bic v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    movi v1.4h, #16
+; CHECK-SD-NEXT:    clz v0.4h, v0.4h
+; CHECK-SD-NEXT:    sub v0.4h, v1.4h, v0.4h
+; CHECK-SD-NEXT:    st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #65535 // =0xffff
+; CHECK-GI-NEXT:    ldr h1, [x0]
+; CHECK-GI-NEXT:    add x9, x0, #2
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    add x10, x0, #4
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    eor v2.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    add v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    and v0.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x9]
+; CHECK-GI-NEXT:    st1 { v0.h }[2], [x10]
+; CHECK-GI-NEXT:    ret
 entry:
   %d = load <3 x i16>, ptr %p1
   %s = call <3 x i16> @llvm.cttz(<3 x i16> %d, i1 false)
@@ -246,19 +350,32 @@ entry:
 }
 
 define <16 x i16> @v16i16(<16 x i16> %d) {
-; CHECK-LABEL: v16i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.8h, #1
-; CHECK-NEXT:    sub v3.8h, v0.8h, v2.8h
-; CHECK-NEXT:    sub v2.8h, v1.8h, v2.8h
-; CHECK-NEXT:    bic v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    bic v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    movi v2.8h, #16
-; CHECK-NEXT:    clz v0.8h, v0.8h
-; CHECK-NEXT:    clz v1.8h, v1.8h
-; CHECK-NEXT:    sub v0.8h, v2.8h, v0.8h
-; CHECK-NEXT:    sub v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v2.8h, #1
+; CHECK-SD-NEXT:    sub v3.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    sub v2.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    bic v0.16b, v3.16b, v0.16b
+; CHECK-SD-NEXT:    bic v1.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    movi v2.8h, #16
+; CHECK-SD-NEXT:    clz v0.8h, v0.8h
+; CHECK-SD-NEXT:    clz v1.8h, v1.8h
+; CHECK-SD-NEXT:    sub v0.8h, v2.8h, v0.8h
+; CHECK-SD-NEXT:    sub v1.8h, v2.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    add v3.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    add v2.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    bic v0.16b, v3.16b, v0.16b
+; CHECK-GI-NEXT:    bic v1.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-GI-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <16 x i16> @llvm.cttz(<16 x i16> %d, i1 false)
   ret <16 x i16> %s
@@ -290,15 +407,29 @@ entry:
 }
 
 define <3 x i32> @v3i32(<3 x i32> %d) {
-; CHECK-LABEL: v3i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    sub v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    movi v1.4s, #32
-; CHECK-NEXT:    clz v0.4s, v0.4s
-; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.4s, #1
+; CHECK-SD-NEXT:    sub v1.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    movi v1.4s, #32
+; CHECK-SD-NEXT:    clz v0.4s, v0.4s
+; CHECK-SD-NEXT:    sub v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    eor v2.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-GI-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <3 x i32> @llvm.cttz(<3 x i32> %d, i1 false)
   ret <3 x i32> %s
@@ -330,19 +461,34 @@ entry:
 }
 
 define <8 x i32> @v8i32(<8 x i32> %d) {
-; CHECK-LABEL: v8i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.4s, #1
-; CHECK-NEXT:    sub v3.4s, v0.4s, v2.4s
-; CHECK-NEXT:    sub v2.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bic v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    bic v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    movi v2.4s, #32
-; CHECK-NEXT:    clz v0.4s, v0.4s
-; CHECK-NEXT:    clz v1.4s, v1.4s
-; CHECK-NEXT:    sub v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    sub v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    sub v3.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    sub v2.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    bic v0.16b, v3.16b, v0.16b
+; CHECK-SD-NEXT:    bic v1.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    movi v2.4s, #32
+; CHECK-SD-NEXT:    clz v0.4s, v0.4s
+; CHECK-SD-NEXT:    clz v1.4s, v1.4s
+; CHECK-SD-NEXT:    sub v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT:    sub v1.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    add v3.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    add v2.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    bic v0.16b, v3.16b, v0.16b
+; CHECK-GI-NEXT:    bic v1.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-GI-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-GI-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-GI-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <8 x i32> @llvm.cttz(<8 x i32> %d, i1 false)
   ret <8 x i32> %s
@@ -377,154 +523,305 @@ entry:
 }
 
 define <3 x i64> @v3i64(<3 x i64> %d) {
-; CHECK-LABEL: v3i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    sub v3.2d, v0.2d, v1.2d
-; CHECK-NEXT:    sub v1.2d, v2.2d, v1.2d
-; CHECK-NEXT:    bic v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    bic v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    cnt v1.16b, v1.16b
-; CHECK-NEXT:    uaddlp v0.8h, v0.16b
-; CHECK-NEXT:    uaddlp v1.8h, v1.16b
-; CHECK-NEXT:    uaddlp v0.4s, v0.8h
-; CHECK-NEXT:    uaddlp v2.4s, v1.8h
-; CHECK-NEXT:    uaddlp v0.2d, v0.4s
-; CHECK-NEXT:    uaddlp v2.2d, v2.4s
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    dup v1.2d, x8
+; CHECK-SD-NEXT:    sub v3.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    sub v1.2d, v2.2d, v1.2d
+; CHECK-SD-NEXT:    bic v0.16b, v3.16b, v0.16b
+; CHECK-SD-NEXT:    bic v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    cnt v0.16b, v0.16b
+; CHECK-SD-NEXT:    cnt v1.16b, v1.16b
+; CHECK-SD-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-SD-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-SD-NEXT:    uaddlp v2.4s, v1.8h
+; CHECK-SD-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-SD-NEXT:    uaddlp v2.2d, v2.4s
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v3.16b, v0.16b
+; CHECK-GI-NEXT:    movi v4.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    sub x8, x8, #1
+; CHECK-GI-NEXT:    bic x8, x8, x9
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    bic v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT:    cnt v1.8b, v1.8b
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-GI-NEXT:    mov w8, v1.s[0]
+; CHECK-GI-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <3 x i64> @llvm.cttz(<3 x i64> %d, i1 false)
   ret <3 x i64> %s
 }
 
 define <4 x i64> @v4i64(<4 x i64> %d) {
-; CHECK-LABEL: v4i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    dup v2.2d, x8
-; CHECK-NEXT:    sub v3.2d, v0.2d, v2.2d
-; CHECK-NEXT:    sub v2.2d, v1.2d, v2.2d
-; CHECK-NEXT:    bic v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    bic v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    cnt v0.16b, v0.16b
-; CHECK-NEXT:    cnt v1.16b, v1.16b
-; CHECK-NEXT:    uaddlp v0.8h, v0.16b
-; CHECK-NEXT:    uaddlp v1.8h, v1.16b
-; CHECK-NEXT:    uaddlp v0.4s, v0.8h
-; CHECK-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-NEXT:    uaddlp v0.2d, v0.4s
-; CHECK-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    dup v2.2d, x8
+; CHECK-SD-NEXT:    sub v3.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    sub v2.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT:    bic v0.16b, v3.16b, v0.16b
+; CHECK-SD-NEXT:    bic v1.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    cnt v0.16b, v0.16b
+; CHECK-SD-NEXT:    cnt v1.16b, v1.16b
+; CHECK-SD-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-SD-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-SD-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-SD-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    add v3.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    add v2.2d, v1.2d, v2.2d
+; CHECK-GI-NEXT:    bic v0.16b, v3.16b, v0.16b
+; CHECK-GI-NEXT:    bic v1.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-GI-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-GI-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-GI-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-GI-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-GI-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <4 x i64> @llvm.cttz(<4 x i64> %d, i1 false)
   ret <4 x i64> %s
 }
 
 define <2 x i128> @v2i128(<2 x i128> %d) {
-; CHECK-LABEL: v2i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rbit x8, x1
-; CHECK-NEXT:    rbit x9, x0
-; CHECK-NEXT:    rbit x10, x3
-; CHECK-NEXT:    rbit x11, x2
-; CHECK-NEXT:    cmp x0, #0
-; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    clz x8, x8
-; CHECK-NEXT:    clz x9, x9
-; CHECK-NEXT:    clz x10, x10
-; CHECK-NEXT:    add x8, x8, #64
-; CHECK-NEXT:    mov x3, xzr
-; CHECK-NEXT:    csel x0, x9, x8, ne
-; CHECK-NEXT:    clz x8, x11
-; CHECK-NEXT:    add x9, x10, #64
-; CHECK-NEXT:    cmp x2, #0
-; CHECK-NEXT:    csel x2, x8, x9, ne
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rbit x8, x1
+; CHECK-SD-NEXT:    rbit x9, x0
+; CHECK-SD-NEXT:    rbit x10, x3
+; CHECK-SD-NEXT:    rbit x11, x2
+; CHECK-SD-NEXT:    cmp x0, #0
+; CHECK-SD-NEXT:    mov x1, xzr
+; CHECK-SD-NEXT:    clz x8, x8
+; CHECK-SD-NEXT:    clz x9, x9
+; CHECK-SD-NEXT:    clz x10, x10
+; CHECK-SD-NEXT:    add x8, x8, #64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    csel x0, x9, x8, ne
+; CHECK-SD-NEXT:    clz x8, x11
+; CHECK-SD-NEXT:    add x9, x10, #64
+; CHECK-SD-NEXT:    cmp x2, #0
+; CHECK-SD-NEXT:    csel x2, x8, x9, ne
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-GI-NEXT:    subs x9, x0, #1
+; CHECK-GI-NEXT:    adc x10, x1, x8
+; CHECK-GI-NEXT:    subs x11, x2, #1
+; CHECK-GI-NEXT:    bic x9, x9, x0
+; CHECK-GI-NEXT:    mov v0.d[0], x9
+; CHECK-GI-NEXT:    bic x9, x11, x2
+; CHECK-GI-NEXT:    adc x8, x3, x8
+; CHECK-GI-NEXT:    mov v1.d[0], x9
+; CHECK-GI-NEXT:    bic x9, x10, x1
+; CHECK-GI-NEXT:    bic x8, x8, x3
+; CHECK-GI-NEXT:    mov x1, xzr
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-NEXT:    mov w0, v0.s[0]
+; CHECK-GI-NEXT:    mov w2, v1.s[0]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <2 x i128> @llvm.cttz(<2 x i128> %d, i1 false)
   ret <2 x i128> %s
 }
 
 define <3 x i128> @v3i128(<3 x i128> %d) {
-; CHECK-LABEL: v3i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rbit x8, x1
-; CHECK-NEXT:    rbit x9, x0
-; CHECK-NEXT:    rbit x11, x3
-; CHECK-NEXT:    rbit x10, x2
-; CHECK-NEXT:    cmp x0, #0
-; CHECK-NEXT:    rbit x12, x5
-; CHECK-NEXT:    clz x8, x8
-; CHECK-NEXT:    clz x9, x9
-; CHECK-NEXT:    clz x11, x11
-; CHECK-NEXT:    add x8, x8, #64
-; CHECK-NEXT:    clz x10, x10
-; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    csel x0, x9, x8, ne
-; CHECK-NEXT:    add x8, x11, #64
-; CHECK-NEXT:    cmp x2, #0
-; CHECK-NEXT:    rbit x9, x4
-; CHECK-NEXT:    csel x2, x10, x8, ne
-; CHECK-NEXT:    clz x8, x12
-; CHECK-NEXT:    add x8, x8, #64
-; CHECK-NEXT:    cmp x4, #0
-; CHECK-NEXT:    mov x3, xzr
-; CHECK-NEXT:    clz x9, x9
-; CHECK-NEXT:    mov x5, xzr
-; CHECK-NEXT:    csel x4, x9, x8, ne
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rbit x8, x1
+; CHECK-SD-NEXT:    rbit x9, x0
+; CHECK-SD-NEXT:    rbit x11, x3
+; CHECK-SD-NEXT:    rbit x10, x2
+; CHECK-SD-NEXT:    cmp x0, #0
+; CHECK-SD-NEXT:    rbit x12, x5
+; CHECK-SD-NEXT:    clz x8, x8
+; CHECK-SD-NEXT:    clz x9, x9
+; CHECK-SD-NEXT:    clz x11, x11
+; CHECK-SD-NEXT:    add x8, x8, #64
+; CHECK-SD-NEXT:    clz x10, x10
+; CHECK-SD-NEXT:    mov x1, xzr
+; CHECK-SD-NEXT:    csel x0, x9, x8, ne
+; CHECK-SD-NEXT:    add x8, x11, #64
+; CHECK-SD-NEXT:    cmp x2, #0
+; CHECK-SD-NEXT:    rbit x9, x4
+; CHECK-SD-NEXT:    csel x2, x10, x8, ne
+; CHECK-SD-NEXT:    clz x8, x12
+; CHECK-SD-NEXT:    add x8, x8, #64
+; CHECK-SD-NEXT:    cmp x4, #0
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    clz x9, x9
+; CHECK-SD-NEXT:    mov x5, xzr
+; CHECK-SD-NEXT:    csel x4, x9, x8, ne
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-GI-NEXT:    subs x9, x0, #1
+; CHECK-GI-NEXT:    adc x10, x1, x8
+; CHECK-GI-NEXT:    subs x11, x2, #1
+; CHECK-GI-NEXT:    bic x9, x9, x0
+; CHECK-GI-NEXT:    adc x12, x3, x8
+; CHECK-GI-NEXT:    subs x13, x4, #1
+; CHECK-GI-NEXT:    bic x11, x11, x2
+; CHECK-GI-NEXT:    mov v0.d[0], x9
+; CHECK-GI-NEXT:    bic x9, x13, x4
+; CHECK-GI-NEXT:    mov v1.d[0], x11
+; CHECK-GI-NEXT:    mov v2.d[0], x9
+; CHECK-GI-NEXT:    adc x8, x5, x8
+; CHECK-GI-NEXT:    bic x9, x10, x1
+; CHECK-GI-NEXT:    bic x10, x12, x3
+; CHECK-GI-NEXT:    bic x8, x8, x5
+; CHECK-GI-NEXT:    mov x1, xzr
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    mov x5, xzr
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    mov v1.d[1], x10
+; CHECK-GI-NEXT:    mov v2.d[1], x8
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    cnt v2.16b, v2.16b
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-NEXT:    uaddlv h2, v2.16b
+; CHECK-GI-NEXT:    mov w0, v0.s[0]
+; CHECK-GI-NEXT:    mov w2, v1.s[0]
+; CHECK-GI-NEXT:    mov w4, v2.s[0]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <3 x i128> @llvm.cttz(<3 x i128> %d, i1 false)
   ret <3 x i128> %s
 }
 
 define <4 x i128> @v4i128(<4 x i128> %d) {
-; CHECK-LABEL: v4i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rbit x9, x1
-; CHECK-NEXT:    rbit x10, x0
-; CHECK-NEXT:    rbit x8, x3
-; CHECK-NEXT:    rbit x11, x2
-; CHECK-NEXT:    cmp x0, #0
-; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    clz x9, x9
-; CHECK-NEXT:    clz x10, x10
-; CHECK-NEXT:    clz x8, x8
-; CHECK-NEXT:    add x9, x9, #64
-; CHECK-NEXT:    add x8, x8, #64
-; CHECK-NEXT:    mov x3, xzr
-; CHECK-NEXT:    csel x0, x10, x9, ne
-; CHECK-NEXT:    clz x9, x11
-; CHECK-NEXT:    rbit x10, x4
-; CHECK-NEXT:    rbit x11, x5
-; CHECK-NEXT:    cmp x2, #0
-; CHECK-NEXT:    mov x5, xzr
-; CHECK-NEXT:    csel x2, x9, x8, ne
-; CHECK-NEXT:    clz x8, x10
-; CHECK-NEXT:    rbit x10, x7
-; CHECK-NEXT:    clz x9, x11
-; CHECK-NEXT:    cmp x4, #0
-; CHECK-NEXT:    rbit x11, x6
-; CHECK-NEXT:    add x9, x9, #64
-; CHECK-NEXT:    mov x7, xzr
-; CHECK-NEXT:    csel x4, x8, x9, ne
-; CHECK-NEXT:    clz x8, x10
-; CHECK-NEXT:    clz x9, x11
-; CHECK-NEXT:    add x8, x8, #64
-; CHECK-NEXT:    cmp x6, #0
-; CHECK-NEXT:    csel x6, x9, x8, ne
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rbit x9, x1
+; CHECK-SD-NEXT:    rbit x10, x0
+; CHECK-SD-NEXT:    rbit x8, x3
+; CHECK-SD-NEXT:    rbit x11, x2
+; CHECK-SD-NEXT:    cmp x0, #0
+; CHECK-SD-NEXT:    mov x1, xzr
+; CHECK-SD-NEXT:    clz x9, x9
+; CHECK-SD-NEXT:    clz x10, x10
+; CHECK-SD-NEXT:    clz x8, x8
+; CHECK-SD-NEXT:    add x9, x9, #64
+; CHECK-SD-NEXT:    add x8, x8, #64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    csel x0, x10, x9, ne
+; CHECK-SD-NEXT:    clz x9, x11
+; CHECK-SD-NEXT:    rbit x10, x4
+; CHECK-SD-NEXT:    rbit x11, x5
+; CHECK-SD-NEXT:    cmp x2, #0
+; CHECK-SD-NEXT:    mov x5, xzr
+; CHECK-SD-NEXT:    csel x2, x9, x8, ne
+; CHECK-SD-NEXT:    clz x8, x10
+; CHECK-SD-NEXT:    rbit x10, x7
+; CHECK-SD-NEXT:    clz x9, x11
+; CHECK-SD-NEXT:    cmp x4, #0
+; CHECK-SD-NEXT:    rbit x11, x6
+; CHECK-SD-NEXT:    add x9, x9, #64
+; CHECK-SD-NEXT:    mov x7, xzr
+; CHECK-SD-NEXT:    csel x4, x8, x9, ne
+; CHECK-SD-NEXT:    clz x8, x10
+; CHECK-SD-NEXT:    clz x9, x11
+; CHECK-SD-NEXT:    add x8, x8, #64
+; CHECK-SD-NEXT:    cmp x6, #0
+; CHECK-SD-NEXT:    csel x6, x9, x8, ne
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-GI-NEXT:    subs x9, x0, #1
+; CHECK-GI-NEXT:    adc x10, x1, x8
+; CHECK-GI-NEXT:    subs x11, x2, #1
+; CHECK-GI-NEXT:    bic x9, x9, x0
+; CHECK-GI-NEXT:    adc x12, x3, x8
+; CHECK-GI-NEXT:    subs x13, x4, #1
+; CHECK-GI-NEXT:    mov v0.d[0], x9
+; CHECK-GI-NEXT:    adc x14, x5, x8
+; CHECK-GI-NEXT:    subs x15, x6, #1
+; CHECK-GI-NEXT:    bic x9, x11, x2
+; CHECK-GI-NEXT:    bic x11, x13, x4
+; CHECK-GI-NEXT:    mov v1.d[0], x9
+; CHECK-GI-NEXT:    bic x9, x15, x6
+; CHECK-GI-NEXT:    mov v2.d[0], x11
+; CHECK-GI-NEXT:    mov v3.d[0], x9
+; CHECK-GI-NEXT:    adc x8, x7, x8
+; CHECK-GI-NEXT:    bic x9, x10, x1
+; CHECK-GI-NEXT:    bic x10, x14, x5
+; CHECK-GI-NEXT:    bic x8, x8, x7
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    bic x9, x12, x3
+; CHECK-GI-NEXT:    mov x1, xzr
+; CHECK-GI-NEXT:    mov v1.d[1], x9
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    mov x5, xzr
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    mov x7, xzr
+; CHECK-GI-NEXT:    cnt v0.16b, v0.16b
+; CHECK-GI-NEXT:    cnt v1.16b, v1.16b
+; CHECK-GI-NEXT:    cnt v2.16b, v2.16b
+; CHECK-GI-NEXT:    cnt v3.16b, v3.16b
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-NEXT:    uaddlv h2, v2.16b
+; CHECK-GI-NEXT:    uaddlv h3, v3.16b
+; CHECK-GI-NEXT:    mov w0, v0.s[0]
+; CHECK-GI-NEXT:    mov w2, v1.s[0]
+; CHECK-GI-NEXT:    mov w4, v2.s[0]
+; CHECK-GI-NEXT:    mov w6, v3.s[0]
+; CHECK-GI-NEXT:    ret
 entry:
   %s = call <4 x i128> @llvm.cttz(<4 x i128> %d, i1 false)
   ret <4 x i128> %s
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}



More information about the llvm-commits mailing list