[llvm] [AArch64] Handle v2i16 and v2i8 in concat load combine. (PR #86264)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 22 03:04:39 PDT 2024
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/86264
This extends the concat load patch from https://reviews.llvm.org/D121400, which was later moved to a combine, to handle v2i8 and v2i16 concat loads too.
From e07047f9937ddbda7c81a1d70448d114b4734675 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 22 Mar 2024 09:56:38 +0000
Subject: [PATCH] [AArch64] Handle v2i16 and v2i8 in concat load combine.
This extends the concat load patch from https://reviews.llvm.org/D121400, which
was later moved to a combine, to handle v2i8 and v2i16 concat loads too.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 21 +++--
llvm/test/CodeGen/AArch64/insert-subvector.ll | 85 ++++---------------
2 files changed, 28 insertions(+), 78 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7fab274ab957c8..043a617898a69e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18629,12 +18629,16 @@ static SDValue performConcatVectorsCombine(SDNode *N,
}
}
- if (N->getOperand(0).getValueType() == MVT::v4i8) {
+ if (N->getOperand(0).getValueType() == MVT::v4i8 ||
+ N->getOperand(0).getValueType() == MVT::v2i16 ||
+ N->getOperand(0).getValueType() == MVT::v2i8) {
+ EVT SrcVT = N->getOperand(0).getValueType();
// If we have a concat of v4i8 loads, convert them to a buildvector of f32
// loads to prevent having to go through the v4i8 load legalization that
// needs to extend each element into a larger type.
- if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
- if (V.getValueType() != MVT::v4i8)
+ if (N->getNumOperands() % 2 == 0 &&
+ all_of(N->op_values(), [SrcVT](SDValue V) {
+ if (V.getValueType() != SrcVT)
return false;
if (V.isUndef())
return true;
@@ -18642,19 +18646,18 @@ static SDValue performConcatVectorsCombine(SDNode *N,
return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
LD->getExtensionType() == ISD::NON_EXTLOAD;
})) {
- EVT NVT =
- EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
+ EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
SmallVector<SDValue> Ops;
for (unsigned i = 0; i < N->getNumOperands(); i++) {
SDValue V = N->getOperand(i);
if (V.isUndef())
- Ops.push_back(DAG.getUNDEF(MVT::f32));
+ Ops.push_back(DAG.getUNDEF(FVT));
else {
LoadSDNode *LD = cast<LoadSDNode>(V);
- SDValue NewLoad =
- DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
- LD->getMemOperand());
+ SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
+ LD->getBasePtr(), LD->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
Ops.push_back(NewLoad);
}
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll
index 95ad9807ed6390..6828fa9f1508c8 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll
@@ -377,12 +377,8 @@ define <16 x i8> @load_v16i8_8_2(float %tmp, <16 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_1(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v2.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[0], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -395,12 +391,9 @@ define <8 x i8> @load_v8i8_2_1(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_15(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: adrp x8, .LCPI33_0
-; CHECK-NEXT: mov v0.b[1], v0.b[4]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI33_0]
; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
@@ -414,12 +407,8 @@ define <8 x i8> @load_v8i8_2_15(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_2(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v2.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[1], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -432,12 +421,8 @@ define <8 x i8> @load_v8i8_2_2(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_3(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v2.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[2], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -450,12 +435,8 @@ define <8 x i8> @load_v8i8_2_3(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_4(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v2.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[3], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -468,11 +449,9 @@ define <8 x i8> @load_v8i8_2_4(float %tmp, <8 x i8> %b, ptr %a) {
define <4 x i8> @load_v4i8_2_1(float %tmp, <4 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v4i8_2_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -485,10 +464,8 @@ define <4 x i8> @load_v4i8_2_1(float %tmp, <4 x i8> %b, ptr %a) {
define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v4i8_2_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -504,13 +481,8 @@ define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.s[0], v2.s[0]
+; CHECK-NEXT: ld1 { v0.s }[0], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -521,13 +493,9 @@ define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: // kill: def $q1 killed $q1 def $q0_q1
-; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: adrp x8, .LCPI40_0
-; CHECK-NEXT: ld1 { v2.h }[2], [x9]
-; CHECK-NEXT: xtn v0.4h, v2.4s
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI40_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
@@ -540,13 +508,8 @@ define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-NEXT: ld1 { v0.s }[1], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -557,13 +520,8 @@ define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-NEXT: ld1 { v0.s }[2], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -574,13 +532,8 @@ define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.s[3], v2.s[0]
+; CHECK-NEXT: ld1 { v0.s }[3], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -591,11 +544,8 @@ define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v4i16_2_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: add x8, x0, #2
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -608,11 +558,8 @@ define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) {
define <4 x i16> @load_v4i16_2_2(float %tmp, <4 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v4i16_2_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr s2, [x0]
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
More information about the llvm-commits mailing list