[llvm] [AArch64][GlobalISel] Bitcast and Build Illegal G_CONCAT_VECTOR Instructions (PR #96492)

Tue Jul 2 06:55:12 PDT 2024

https://github.com/chuongg3 updated https://github.com/llvm/llvm-project/pull/96492

>From cdb5607cf859fdbdb2948c129e935052dee60c5a Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 4 Jun 2024 13:14:15 +0000
Subject: [PATCH 1/2] [AArch64][NFC] Pre-commit tets for BITCAST Illegal
 CONCAT_VECTORS

---
 .../GlobalISel/legalize-concat-vectors.mir    |  99 +++++++++++++---
 llvm/test/CodeGen/AArch64/concat-vector.ll    | 106 +++++++++++++++++-
 .../AArch64/fixed-vector-interleave.ll        |   2 +
 3 files changed, 186 insertions(+), 21 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir
index 3263fdcdee662..8ac7b114744d5 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -global-isel-abort=1 -verify-machineinstrs -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -global-isel-abort=2 -verify-machineinstrs -o - | FileCheck %s
 
 ---
 name:            legal_v4s32_v2s32
@@ -9,11 +9,12 @@ body: |
     liveins: $d0, $d1
     ; CHECK-LABEL: name: legal_v4s32_v2s32
     ; CHECK: liveins: $d0, $d1
-    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
-    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1
-    ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[COPY1]](<2 x s32>)
-    ; CHECK: $q0 = COPY [[CONCAT_VECTORS]](<4 x s32>)
-    ; CHECK: RET_ReallyLR
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[COPY1]](<2 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR
     %0:_(<2 x s32>) = COPY $d0
     %1:_(<2 x s32>) = COPY $d1
     %2:_(<4 x s32>) = G_CONCAT_VECTORS %0(<2 x s32>), %1(<2 x s32>)
@@ -28,11 +29,12 @@ body: |
     liveins: $d0, $d1
     ; CHECK-LABEL: name: legal_v8s16_v4s16
     ; CHECK: liveins: $d0, $d1
-    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
-    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1
-    ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>)
-    ; CHECK: $q0 = COPY [[CONCAT_VECTORS]](<8 x s16>)
-    ; CHECK: RET_ReallyLR
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>)
+    ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<8 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR
     %0:_(<4 x s16>) = COPY $d0
     %1:_(<4 x s16>) = COPY $d1
     %2:_(<8 x s16>) = G_CONCAT_VECTORS %0(<4 x s16>), %1(<4 x s16>)
@@ -47,14 +49,79 @@ body: |
     liveins: $q0
     ; CHECK-LABEL: name: legal_v16s8_v8s8
     ; CHECK: liveins: $q0
-    ; CHECK: %a:_(<8 x s8>) = G_IMPLICIT_DEF
-    ; CHECK: %b:_(<8 x s8>) = G_IMPLICIT_DEF
-    ; CHECK: %concat:_(<16 x s8>) = G_CONCAT_VECTORS %a(<8 x s8>), %b(<8 x s8>)
-    ; CHECK: $q0 = COPY %concat(<16 x s8>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %a:_(<8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %b:_(<8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %concat:_(<16 x s8>) = G_CONCAT_VECTORS %a(<8 x s8>), %b(<8 x s8>)
+    ; CHECK-NEXT: $q0 = COPY %concat(<16 x s8>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %a:_(<8 x s8>) = G_IMPLICIT_DEF
     %b:_(<8 x s8>) = G_IMPLICIT_DEF
     %concat:_(<16 x s8>) = G_CONCAT_VECTORS %a:_(<8 x s8>), %b:_(<8 x s8>)
     $q0 = COPY %concat(<16 x s8>)
     RET_ReallyLR implicit $q0
 ...
+---
+name:            illegal_v16s8_v4s8
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: illegal_v16s8_v4s8
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %a:_(p0) = COPY $x0
+    ; CHECK-NEXT: %b:_(s32) = G_LOAD %a(p0) :: (load (s32))
+    ; CHECK-NEXT: %c:_(<4 x s8>) = G_BITCAST %b(s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY [[DEF]](s16)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY [[DEF]](s16)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[DEF]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[COPY]](s16), [[COPY1]](s16), [[COPY2]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[BUILD_VECTOR]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s8>), [[UV1:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[TRUNC]](<8 x s8>)
+    ; CHECK-NEXT: %f:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[UV]](<4 x s8>), [[UV]](<4 x s8>), [[UV]](<4 x s8>)
+    ; CHECK-NEXT: $q0 = COPY %f(<16 x s8>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %a:_(p0) = COPY $x0
+    %b:_(s32) = G_LOAD %a:_(p0) :: (load (s32))
+    %c:_(<4 x s8>) = G_BITCAST %b:_(s32)
+    %d:_(s8) = G_IMPLICIT_DEF
+    %e:_(<4 x s8>) = G_BUILD_VECTOR %d:_(s8), %d:_(s8), %d:_(s8), %d:_(s8)
+    %f:_(<16 x s8>) = G_CONCAT_VECTORS %c:_(<4 x s8>), %e:_(<4 x s8>), %e:_(<4 x s8>), %e:_(<4 x s8>)
+
+    $q0 = COPY %f(<16 x s8>)
+    RET_ReallyLR implicit $q0
+...
+---
+name:            illegal_v8s16_v2s16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: illegal_v8s16_v2s16
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %a:_(p0) = COPY $x0
+    ; CHECK-NEXT: %b:_(s32) = G_LOAD %a(p0) :: (load (s32))
+    ; CHECK-NEXT: %c:_(<2 x s16>) = G_BITCAST %b(s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[TRUNC]](<4 x s16>)
+    ; CHECK-NEXT: %f:_(<8 x s16>) = G_CONCAT_VECTORS %c(<2 x s16>), [[UV]](<2 x s16>), [[UV]](<2 x s16>), [[UV]](<2 x s16>)
+    ; CHECK-NEXT: $q0 = COPY %f(<8 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %a:_(p0) = COPY $x0
+    %b:_(s32) = G_LOAD %a:_(p0) :: (load (s32))
+    %c:_(<2 x s16>) = G_BITCAST %b:_(s32)
+    %d:_(s16) = G_IMPLICIT_DEF
+    %e:_(<2 x s16>) = G_BUILD_VECTOR %d:_(s16), %d:_(s16)
+    %f:_(<8 x s16>) = G_CONCAT_VECTORS %c:_(<2 x s16>), %e:_(<2 x s16>), %e:_(<2 x s16>), %e:_(<2 x s16>)
+
+    $q0 = COPY %f(<8 x s16>)
+    RET_ReallyLR implicit $q0
+...
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index bd48c32566fc9..e9e2c0d1fced0 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -1,11 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:         warning: Instruction selection used fallback path for concat2
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat4
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat9
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v8s16_v2s16
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v16s8_v4s8
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v16s8_v4s8_load
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v16s8_v4s8_reg
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v8s16_v2s16_reg
 
 define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
-; CHECK-LABEL: concat1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: concat1:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: concat1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v2.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
    %v4i8 = shufflevector <2 x i8> %A, <2 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    ret <4 x i8> %v4i8
 }
@@ -112,3 +135,76 @@ define <16 x half> @concat11(<8 x half> %A, <8 x half> %B) {
    %v16half= shufflevector <8 x half> %A, <8 x half> %B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    ret <16 x half> %v16half
 }
+
+define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) {
+; CHECK-LABEL: concat_v8s16_v2s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <2 x i16>, ptr %ptr
+    %b = shufflevector <2 x i16> %a, <2 x i16> %a, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+    ret <8 x i16> %b
+}
+
+define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) {
+; CHECK-LABEL: concat_v16s8_v4s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <4 x i8>, ptr %ptr
+    %b = shufflevector <4 x i8> %a, <4 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+    ret <16 x i8> %b
+}
+
+define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %ptrD) {
+; CHECK-LABEL: concat_v16s8_v4s8_load:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
+; CHECK-NEXT:    ld1 { v0.s }[2], [x2]
+; CHECK-NEXT:    ld1 { v0.s }[3], [x3]
+; CHECK-NEXT:    ret
+    %A = load <4 x i8>, ptr %ptrA
+    %B = load <4 x i8>, ptr %ptrB
+    %C = load <4 x i8>, ptr %ptrC
+    %D = load <4 x i8>, ptr %ptrD
+    %b = shufflevector <4 x i8> %A, <4 x i8> %B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+    %c = shufflevector <4 x i8> %C, <4 x i8> %D, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+    %d = shufflevector <16 x i8> %b, <16 x i8> %c, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+    ret <16 x i8> %d
+}
+
+
+define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, <4 x i8> %D) {
+; CHECK-LABEL: concat_v16s8_v4s8_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    ret
+    %b = shufflevector <4 x i8> %A, <4 x i8> %B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+    %c = shufflevector <4 x i8> %C, <4 x i8> %D, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+    %d = shufflevector <16 x i8> %b, <16 x i8> %c, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+    ret <16 x i8> %d
+}
+
+define <8 x i16> @concat_v8s16_v2s16_reg(<2 x i16> %A, <2 x i16> %B, <2 x i16> %C, <2 x i16> %D) {
+; CHECK-LABEL: concat_v8s16_v2s16_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI15_0
+; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
+; CHECK-NEXT:    ret
+    %b = shufflevector <2 x i16> %A, <2 x i16> %B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+    %c = shufflevector <2 x i16> %C, <2 x i16> %D, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+    %d = shufflevector <8 x i16> %b, <8 x i16> %c, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+    ret <8 x i16> %d
+}
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index 2e992964f5986..cbf12980a801a 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -2,6 +2,8 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
+; CHECK-GI:         warning: Instruction selection used fallback path for interleave2_v4f16
+
 define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
 ; CHECK-LABEL: interleave2_v4f16:
 ; CHECK:       // %bb.0:

>From 28ca2883cae735147a7b7a1d0ff735c94d773342 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 4 Jun 2024 13:19:53 +0000
Subject: [PATCH 2/2] [AArch64][GlobalISel] Bitcast and Build Illegal
 G_CONCAT_VECTOR Instructions

Attempts to handle illegal G_CONCAT_VECTOR instructions by bitcasting the source
into scalar values and using G_BUILD_VECTOR instead

Treating the G_CONCAT_VECTORS instruction in the legalization artefact by folding
away concat(bitcast, ...) into buildvector(...) would require check for ImpDef created
by the shuffles in llvm.
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |   2 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  50 ++++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  16 +-
 .../GlobalISel/legalize-concat-vectors.mir    |  14 +-
 .../GlobalISel/legalizer-info-validation.mir  |   4 +-
 llvm/test/CodeGen/AArch64/concat-vector.ll    | 252 ++++++++++++++----
 .../AArch64/fixed-vector-interleave.ll        |  24 +-
 7 files changed, 294 insertions(+), 68 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 284f434fbb9b0..ede8af13b9e75 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -373,6 +373,8 @@ class LegalizerHelper {
   /// Perform Bitcast legalize action on G_INSERT_VECTOR_ELT.
   LegalizeResult bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy);
+  LegalizeResult bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
+                                     LLT CastTy);
 
   LegalizeResult lowerConstant(MachineInstr &MI);
   LegalizeResult lowerFConstant(MachineInstr &MI);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 6f0cae2edab17..2bec7db133407 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3386,6 +3386,54 @@ LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
   return UnableToLegalize;
 }
 
+// This attempts to handle G_CONCAT_VECTORS with illegal operands, particularly
+// those that have smaller than legal operands.
+//
+// <16 x s8> = G_CONCAT_VECTORS <4 x s8>, <4 x s8>, <4 x s8>, <4 x s8>
+//
+// ===>
+//
+// s32 = G_BITCAST <4 x s8>
+// s32 = G_BITCAST <4 x s8>
+// s32 = G_BITCAST <4 x s8>
+// s32 = G_BITCAST <4 x s8>
+// <4 x s32> = G_BUILD_VECTOR s32, s32, s32, s32
+// <16 x s8> = G_BITCAST <4 x s32>
+LegalizerHelper::LegalizeResult
+LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
+                                     LLT CastTy) {
+  // Convert it to CONCAT instruction
+  auto ConcatMI = dyn_cast<GConcatVectors>(&MI);
+  if (!ConcatMI) {
+    return UnableToLegalize;
+  }
+
+  // Check if bitcast is Legal
+  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+  LLT SrcScalTy = LLT::scalar(SrcTy.getSizeInBits());
+
+  // Check if the build vector is Legal
+  if (!LI.isLegal({TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) {
+    return UnableToLegalize;
+  }
+
+  // Bitcast the sources
+  SmallVector<Register> BitcastRegs;
+  for (unsigned i = 0; i < ConcatMI->getNumSources(); i++) {
+    BitcastRegs.push_back(
+        MIRBuilder.buildBitcast(SrcScalTy, ConcatMI->getSourceReg(i))
+            .getReg(0));
+  }
+
+  // Build the scalar values into a vector
+  Register BuildReg =
+      MIRBuilder.buildBuildVector(CastTy, BitcastRegs).getReg(0);
+  MIRBuilder.buildBitcast(DstReg, BuildReg);
+
+  MI.eraseFromParent();
+  return Legalized;
+}
+
 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
   Register DstReg = LoadMI.getDstReg();
@@ -3687,6 +3735,8 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
   case TargetOpcode::G_INSERT_VECTOR_ELT:
     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
+  case TargetOpcode::G_CONCAT_VECTORS:
+    return bitcastConcatVector(MI, TypeIdx, CastTy);
   default:
     return UnableToLegalize;
   }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 07a0473888ee5..8e7a684c55c34 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -984,7 +984,21 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(0, v2s64, v2s64);
 
   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
-      .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});
+      .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}})
+      .bitcastIf(
+          [=](const LegalityQuery &Query) {
+            return Query.Types[0].getSizeInBits() <= 128 &&
+                   Query.Types[1].getSizeInBits() <= 64;
+          },
+          [=](const LegalityQuery &Query) {
+            const LLT DstTy = Query.Types[0];
+            const LLT SrcTy = Query.Types[1];
+            return std::pair(
+                0, DstTy.changeElementSize(SrcTy.getSizeInBits())
+                       .changeElementCount(
+                           DstTy.getElementCount().divideCoefficientBy(
+                               SrcTy.getNumElements())));
+          });
 
   getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir
index 8ac7b114744d5..87bbbee35d6d6 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-concat-vectors.mir
@@ -81,7 +81,12 @@ body:             |
     ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[COPY]](s16), [[COPY1]](s16), [[COPY2]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16)
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[BUILD_VECTOR]](<8 x s16>)
     ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s8>), [[UV1:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[TRUNC]](<8 x s8>)
-    ; CHECK-NEXT: %f:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[UV]](<4 x s8>), [[UV]](<4 x s8>), [[UV]](<4 x s8>)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST %c(<4 x s8>)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<4 x s8>)
+    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<4 x s8>)
+    ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<4 x s8>)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[BITCAST1]](s32), [[BITCAST2]](s32), [[BITCAST3]](s32)
+    ; CHECK-NEXT: %f:_(<16 x s8>) = G_BITCAST [[BUILD_VECTOR1]](<4 x s32>)
     ; CHECK-NEXT: $q0 = COPY %f(<16 x s8>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %a:_(p0) = COPY $x0
@@ -112,7 +117,12 @@ body:             |
     ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32)
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
     ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[TRUNC]](<4 x s16>)
-    ; CHECK-NEXT: %f:_(<8 x s16>) = G_CONCAT_VECTORS %c(<2 x s16>), [[UV]](<2 x s16>), [[UV]](<2 x s16>), [[UV]](<2 x s16>)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST %c(<2 x s16>)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[BITCAST1]](s32), [[BITCAST2]](s32), [[BITCAST3]](s32)
+    ; CHECK-NEXT: %f:_(<8 x s16>) = G_BITCAST [[BUILD_VECTOR1]](<4 x s32>)
     ; CHECK-NEXT: $q0 = COPY %f(<8 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %a:_(p0) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index d71111b57efe5..54229b8306337 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -115,8 +115,8 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_CONCAT_VECTORS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. the first uncovered type index: 2, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_PTRTOINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 2, OK
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index e9e2c0d1fced0..f6eeeef4faf7e 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -2,15 +2,6 @@
 ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:         warning: Instruction selection used fallback path for concat2
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat4
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat9
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v8s16_v2s16
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v16s8_v4s8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v16s8_v4s8_load
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v16s8_v4s8_reg
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for concat_v8s16_v2s16_reg
-
 define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
 ; CHECK-SD-LABEL: concat1:
 ; CHECK-SD:       // %bb.0:
@@ -34,10 +25,33 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
 }
 
 define <8 x i8> @concat2(<4 x i8> %A, <4 x i8> %B) {
-; CHECK-LABEL: concat2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: concat2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: concat2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov h3, v0.h[1]
+; CHECK-GI-NEXT:    mov h4, v1.h[2]
+; CHECK-GI-NEXT:    mov h5, v1.h[3]
+; CHECK-GI-NEXT:    mov h6, v0.h[3]
+; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v4.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[3], v5.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v6.h[0]
+; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
    %v8i8 = shufflevector <4 x i8> %A, <4 x i8> %B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    ret <8 x i8> %v8i8
 }
@@ -54,10 +68,25 @@ define <16 x i8> @concat3(<8 x i8> %A, <8 x i8> %B) {
 }
 
 define <4 x i16> @concat4(<2 x i16> %A, <2 x i16> %B) {
-; CHECK-LABEL: concat4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: concat4:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: concat4:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s3, v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v3.s[0]
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
    %v4i16 = shufflevector <2 x i16> %A, <2 x i16> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    ret <4 x i16> %v4i16
 }
@@ -109,10 +138,18 @@ define <8 x i32> @concat8(ptr %A, ptr %B) {
 }
 
 define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) {
-; CHECK-LABEL: concat9:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: concat9:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: concat9:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
    %v4half= shufflevector <2 x half> %A, <2 x half> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    ret <4 x half> %v4half
 }
@@ -137,33 +174,68 @@ define <16 x half> @concat11(<8 x half> %A, <8 x half> %B) {
 }
 
 define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) {
-; CHECK-LABEL: concat_v8s16_v2s16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: concat_v8s16_v2s16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: concat_v8s16_v2s16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    dup v0.4s, w8
+; CHECK-GI-NEXT:    ldr h1, [x0]
+; CHECK-GI-NEXT:    ldr h2, [x0, #2]
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT:    xtn v2.4h, v0.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[3], w8
+; CHECK-GI-NEXT:    ret
     %a = load <2 x i16>, ptr %ptr
     %b = shufflevector <2 x i16> %a, <2 x i16> %a, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     ret <8 x i16> %b
 }
 
 define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) {
-; CHECK-LABEL: concat_v16s8_v4s8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: concat_v16s8_v4s8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: concat_v16s8_v4s8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    dup v0.8h, w8
+; CHECK-GI-NEXT:    xtn v1.8b, v0.8h
+; CHECK-GI-NEXT:    ldr s0, [x0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-GI-NEXT:    ret
     %a = load <4 x i8>, ptr %ptr
     %b = shufflevector <4 x i8> %a, <4 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     ret <16 x i8> %b
 }
 
 define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %ptrD) {
-; CHECK-LABEL: concat_v16s8_v4s8_load:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v0.s }[2], [x2]
-; CHECK-NEXT:    ld1 { v0.s }[3], [x3]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: concat_v16s8_v4s8_load:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    ld1 { v0.s }[1], [x1]
+; CHECK-SD-NEXT:    ld1 { v0.s }[2], [x2]
+; CHECK-SD-NEXT:    ld1 { v0.s }[3], [x3]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: concat_v16s8_v4s8_load:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr s0, [x0]
+; CHECK-GI-NEXT:    ldr s1, [x1]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    ldr s1, [x2]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    ldr s1, [x3]
+; CHECK-GI-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-GI-NEXT:    ret
     %A = load <4 x i8>, ptr %ptrA
     %B = load <4 x i8>, ptr %ptrB
     %C = load <4 x i8>, ptr %ptrC
@@ -176,16 +248,58 @@ define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %p
 
 
 define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, <4 x i8> %D) {
-; CHECK-LABEL: concat_v16s8_v4s8_reg:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: concat_v16s8_v4s8_reg:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: concat_v16s8_v4s8_reg:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h4, v1.h[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov h5, v0.h[1]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT:    mov h6, v1.h[2]
+; CHECK-GI-NEXT:    mov h7, v1.h[3]
+; CHECK-GI-NEXT:    mov h16, v2.h[1]
+; CHECK-GI-NEXT:    mov h17, v0.h[3]
+; CHECK-GI-NEXT:    mov h18, v2.h[3]
+; CHECK-GI-NEXT:    mov v1.h[1], v4.h[0]
+; CHECK-GI-NEXT:    mov h4, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.h[1], v5.h[0]
+; CHECK-GI-NEXT:    mov h5, v2.h[2]
+; CHECK-GI-NEXT:    mov v2.h[1], v16.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v6.h[0]
+; CHECK-GI-NEXT:    mov h6, v3.h[1]
+; CHECK-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NEXT:    mov v2.h[2], v5.h[0]
+; CHECK-GI-NEXT:    mov h4, v3.h[2]
+; CHECK-GI-NEXT:    mov h5, v3.h[3]
+; CHECK-GI-NEXT:    mov v1.h[3], v7.h[0]
+; CHECK-GI-NEXT:    mov v3.h[1], v6.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v17.h[0]
+; CHECK-GI-NEXT:    mov v2.h[3], v18.h[0]
+; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-NEXT:    mov v3.h[2], v4.h[0]
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
+; CHECK-GI-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    xtn v1.8b, v3.8h
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[3], w8
+; CHECK-GI-NEXT:    ret
     %b = shufflevector <4 x i8> %A, <4 x i8> %B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     %c = shufflevector <4 x i8> %C, <4 x i8> %D, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
     %d = shufflevector <16 x i8> %b, <16 x i8> %c, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -193,16 +307,42 @@ define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, <
 }
 
 define <8 x i16> @concat_v8s16_v2s16_reg(<2 x i16> %A, <2 x i16> %B, <2 x i16> %C, <2 x i16> %D) {
-; CHECK-LABEL: concat_v8s16_v2s16_reg:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI15_0
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: concat_v8s16_v2s16_reg:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI15_0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: concat_v8s16_v2s16_reg:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov s4, v1.s[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s5, v0.s[1]
+; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT:    mov v1.s[1], v4.s[0]
+; CHECK-GI-NEXT:    mov s4, v2.s[1]
+; CHECK-GI-NEXT:    mov v0.s[1], v5.s[0]
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    mov v2.s[1], v4.s[0]
+; CHECK-GI-NEXT:    mov s4, v3.s[1]
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
+; CHECK-GI-NEXT:    mov v3.s[1], v4.s[0]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    xtn v1.4h, v3.4s
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[3], w8
+; CHECK-GI-NEXT:    ret
     %b = shufflevector <2 x i16> %A, <2 x i16> %B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
     %c = shufflevector <2 x i16> %C, <2 x i16> %D, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
     %d = shufflevector <8 x i16> %b, <8 x i16> %c, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index cbf12980a801a..2ea7e0f3c44a9 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -1,14 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:         warning: Instruction selection used fallback path for interleave2_v4f16
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
-; CHECK-LABEL: interleave2_v4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: interleave2_v4f16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    zip1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: interleave2_v4f16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    dup v2.4s, w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    zip1 v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ret
   %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
   ret <4 x half> %retval
 }