[clang] [libcxx] [compiler-rt] [libc] [flang] [llvm] [clang-tools-extra] [libcxxabi] [openmp] [mlir] [AArch64] Add custom lowering for load <3 x i8>. (PR #78632)
Florian Hahn via cfe-commits
cfe-commits@lists.llvm.org
Thu Jan 25 09:02:59 PST 2024
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/78632
From a786cdedc2c9a9898cd0b80d84f5b11aace5da1c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 28 Nov 2023 15:44:02 +0000
Subject: [PATCH 1/6] [AArch64] Add custom lowering for load <3 x i8>.
Add a custom combine to lower load <3 x i8> as the more efficient sequence
below:
ldrb wX, [x0, #2]
ldrh wY, [x0]
orr wX, wY, wX, lsl #16
fmov s0, wX
At the moment, there are almost no cases in which such vector operations
will be generated automatically. The motivating case is non-power-of-2
SLP vectorization: https://github.com/llvm/llvm-project/pull/77790
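For illustration, a minimal IR reproducer that exercises the new lowering — a
sketch in the spirit of the load_v3i8 tests updated below (the function name
and the plain vector return are assumptions; the actual tests extend and
shuffle the loaded value):

define <3 x i8> @load_v3i8_example(ptr %src) {
  ; <3 x i8> load with alignment < 4; without the combine this goes through
  ; a stack slot (strh + ldr s0), as the old CHECK lines below show.
  %l = load <3 x i8>, ptr %src, align 1
  ret <3 x i8> %l
}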
---
.../Target/AArch64/AArch64ISelLowering.cpp | 54 ++++++++++++++++++-
.../AArch64/vec3-loads-ext-trunc-stores.ll | 44 +++++----------
2 files changed, 65 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8a6f1dc7487bae8..e1139c2fede8e41 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21095,6 +21095,50 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
return SDValue();
}
+// A custom combine to lower load <3 x i8> as the more efficient sequence
+// below:
+// ldrb wX, [x0, #2]
+// ldrh wY, [x0]
+// orr wX, wY, wX, lsl #16
+// fmov s0, wX
+//
+static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
+ EVT MemVT = LD->getMemoryVT();
+ if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
+ LD->getOriginalAlign() >= 4)
+ return SDValue();
+
+ SDLoc DL(LD);
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+
+ // Load 2 x i8, then 1 x i8.
+ SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(),
+ LD->getOriginalAlign());
+ SDValue L8 =
+ DAG.getLoad(MVT::i8, DL, Chain,
+ DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(2), DL),
+ LD->getPointerInfo(), LD->getOriginalAlign());
+
+ // Extend to i32.
+ SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
+ SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
+
+ // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
+ SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
+ DAG.getConstant(16, DL, MVT::i32));
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr);
+ SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
+
+ // Extract v3i8 again.
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
+ DAG.getConstant(0, DL, MVT::i64));
+ SDValue TokenFactor = DAG.getNode(
+ ISD::TokenFactor, DL, MVT::Other,
+ {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
+ return DAG.getMergeValues({Extract, TokenFactor}, DL);
+}
+
// Perform TBI simplification if supported by the target and try to break up
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
// load instructions can be selected.
@@ -21106,10 +21150,16 @@ static SDValue performLOADCombine(SDNode *N,
performTBISimplification(N->getOperand(1), DCI, DAG);
LoadSDNode *LD = cast<LoadSDNode>(N);
- EVT MemVT = LD->getMemoryVT();
- if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
+ if (LD->isVolatile() || !Subtarget->isLittleEndian())
+ return SDValue(N, 0);
+
+ if (SDValue Res = combineV3I8LoadExt(LD, DAG))
+ return Res;
+
+ if (!LD->isNonTemporal())
return SDValue(N, 0);
+ EVT MemVT = LD->getMemoryVT();
if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
MemVT.getSizeInBits() % 256 == 0 ||
256 % MemVT.getScalarSizeInBits() != 0)
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 9eeb194409df6fa..7cac4134f0e1598 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -5,19 +5,10 @@
define <16 x i8> @load_v3i8(ptr %src, ptr %dst) {
; CHECK-LABEL: load_v3i8:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: umov.h w8, v0[0]
-; CHECK-NEXT: umov.h w9, v0[1]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: mov.b v0[1], w9
-; CHECK-NEXT: ld1.b { v0 }[2], [x8]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8:
@@ -47,19 +38,14 @@ define <16 x i8> @load_v3i8(ptr %src, ptr %dst) {
define <4 x i32> @load_v3i8_to_4xi32(ptr %src, ptr %dst) {
; CHECK-LABEL: load_v3i8_to_4xi32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ldrsb w8, [x0, #2]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: mov.h v0[1], v0[1]
-; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32:
@@ -193,19 +179,15 @@ entry:
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_to_64bits:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ld1.b { v0 }[4], [x8]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_ext_to_64bits:
From 192233f0fda044c759054ae9d79c5b33d66fb1af Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 19 Jan 2024 16:49:34 +0000
Subject: [PATCH 2/6] !fixup adjust alignment and pointer info
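A note on the alignment fix below: the i8 load at BasePtr + 2 can only claim
commonAlignment(OriginalAlign, 2), i.e. the largest power of two dividing both
the original alignment and the offset — for example commonAlignment(Align(2), 2)
is still Align(2), while an align-4 load at offset 2 would only yield Align(2).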
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e1139c2fede8e41..95bc6b5cdff57d3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21115,10 +21115,10 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
// Load 2 x i8, then 1 x i8.
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(),
LD->getOriginalAlign());
- SDValue L8 =
- DAG.getLoad(MVT::i8, DL, Chain,
- DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(2), DL),
- LD->getPointerInfo(), LD->getOriginalAlign());
+ TypeSize Offset2 = TypeSize::getFixed(2);
+ SDValue L8 = DAG.getLoad(
+ MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
+ LD->getPointerInfo(), commonAlignment(LD->getOriginalAlign(), Offset2));
// Extend to i32.
SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
From 39d6794cceb832afbf3e3bafe2c00413ef405eb7 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 22 Jan 2024 16:11:35 +0000
Subject: [PATCH 3/6] !fixup add offset assert and update new tests.
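The assert makes a precondition explicit rather than adding new logic: only
non-indexed loads should reach this combine, and for those the LoadSDNode's
offset operand is undef.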
---
.../Target/AArch64/AArch64ISelLowering.cpp | 1 +
.../AArch64/vec3-loads-ext-trunc-stores.ll | 30 +++++++------------
2 files changed, 11 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c4e2a6f90827026..e26bb093ee5cbe0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21173,6 +21173,7 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
SDLoc DL(LD);
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
+ assert(LD->getOffset().isUndef() && "undef offset expected");
// Load 2 x i8, then 1 x i8.
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(),
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 31a3874126d4baf..7435dde4f551bf3 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -76,19 +76,14 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldurh w8, [x0, #1]
+; CHECK-NEXT: ldrb w8, [x0, #3]
+; CHECK-NEXT: ldurh w9, [x0, #1]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ldrsb w8, [x0, #3]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: mov.h v0[1], v0[1]
-; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32_const_offset_1:
@@ -120,19 +115,14 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldurh w8, [x0, #3]
+; CHECK-NEXT: ldrb w8, [x0, #5]
+; CHECK-NEXT: ldurh w9, [x0, #3]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ldrsb w8, [x0, #5]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: mov.h v0[1], v0[1]
-; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32_const_offset_3:
From e96af2fa4ca83bade36e1f0aa1ab2e2b1d6dc49e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 23 Jan 2024 14:19:32 +0000
Subject: [PATCH 4/6] !fixup update on top of new test coverage.
Update checks after adding more tests in e7b4ff8
---
.../CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index a4698c27cfd2cd3..5a253bea6f1e9fe 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -348,24 +348,20 @@ entry:
define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_add_to_64bits:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: ldrb w9, [x0, #2]
+; CHECK-NEXT: ldrh w10, [x0]
; CHECK-NEXT: Lloh2:
; CHECK-NEXT: adrp x8, lCPI9_0@PAGE
; CHECK-NEXT: Lloh3:
; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF]
; CHECK-NEXT: add x8, x1, #4
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: add x9, x0, #2
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ld1.b { v0 }[4], [x9]
+; CHECK-NEXT: orr w9, w10, w9, lsl #16
+; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: add.4h v0, v0, v1
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
;
From 7e2bf68358fc55e6e770601872b4a6ffd9349ec3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 24 Jan 2024 21:17:34 +0000
Subject: [PATCH 5/6] !fixup update tests and use MMO.
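Deriving the operands for the two narrow loads via
MF.getMachineMemOperand(MMO, Offset, Size) carries over the original memory
operand's pointer info, alignment, and AA metadata, adjusted for the offset,
instead of reconstructing them by hand.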
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 ++++++-----
.../AArch64/vec3-loads-ext-trunc-stores.ll | 15 +++++----------
2 files changed, 11 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a53c4740bd3d49f..00d62b7450f3cb8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21262,17 +21262,18 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
return SDValue();
SDLoc DL(LD);
+ MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
+ MachineMemOperand *MMO = LD->getMemOperand();
assert(LD->getOffset().isUndef() && "undef offset expected");
// Load 2 x i8, then 1 x i8.
- SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(),
- LD->getOriginalAlign());
+ SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
TypeSize Offset2 = TypeSize::getFixed(2);
- SDValue L8 = DAG.getLoad(
- MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
- LD->getPointerInfo(), commonAlignment(LD->getOriginalAlign(), Offset2));
+ SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
+ DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
+ MF.getMachineMemOperand(MMO, 2, 1));
// Extend to i32.
SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 796580f083d0dc0..275e5ac8b7062e0 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -76,19 +76,14 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_align_2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ldrsb w8, [x0, #2]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: mov.h v0[1], v0[1]
-; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32_align_2:
From 109038bab1328d667a6e2eaf01acc82c33c95431 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 19 Jan 2024 16:40:46 +0000
Subject: [PATCH 6/6] Try using LD1r.
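For context, the sequence this version aims for (taken from the updated
load_v3i8 checks below):

  ld1r.4h { v0 }, [x0], #2
  ld1.b { v0 }[2], [x0]

The post-incrementing ld1r loads the low halfword straight into the vector
register and the second ld1 lane-inserts the third byte, avoiding the GPR
round-trip of the ldrb/ldrh/orr/fmov sequence.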
---
.../Target/AArch64/AArch64ISelLowering.cpp | 68 ++++++++++++++++---
.../AArch64/vec3-loads-ext-trunc-stores.ll | 58 +++++-----------
2 files changed, 75 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 00d62b7450f3cb8..6dc56ab3347a1eb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11012,6 +11012,48 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
MaskSourceVec);
}
+// Check if Op is a BUILD_VECTOR with 2 extracts and a load, which is cheaper
+// to lower by inserting the loaded value into a vector and shuffling. This
+// improves lowering for loads of <3 x i8>.
+static SDValue shuffleWithSingleLoad(SDValue Op, SelectionDAG &DAG) {
+ if (Op.getNumOperands() != 4 || Op.getValueType() != MVT::v4i16)
+ return SDValue();
+
+ SDValue V0 = Op.getOperand(0);
+ SDValue V1 = Op.getOperand(1);
+ SDValue V2 = Op.getOperand(2);
+ SDValue V3 = Op.getOperand(3);
+ if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ V1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      V2.getOpcode() != ISD::LOAD ||
+      !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT))
+ return SDValue();
+
+ if (V0.getOperand(0) != V1.getOperand(0) ||
+      V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 ||
+      !(V3.isUndef() || V3.getConstantOperandVal(1) == 3))
+ return SDValue();
+
+ SDLoc dl(Op);
+ auto *L = cast<LoadSDNode>(Op.getOperand(2));
+ auto Vec = V0.getOperand(0);
+
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Vec.getValueType(), Vec,
+ SDValue(L, 0), DAG.getConstant(2, dl, MVT::i64));
+ Vec = DAG.getNode(ISD::BITCAST, dl, MVT::v4i16, Vec);
+
+ SDValue ShuffleOps[] = {DAG.getUNDEF(MVT::v4i16), DAG.getUNDEF(MVT::v4i16)};
+ ShuffleOps[0] = Vec;
+
+ SmallVector<int, 8> Mask(4, -1);
+ Mask[0] = 0;
+ Mask[1] = 1;
+ Mask[2] = 2;
+ if (!V3.isUndef())
+ Mask[3] = 3;
+ SDValue Shuffle =
+ DAG.getVectorShuffle(MVT::v4i16, dl, ShuffleOps[0], ShuffleOps[1], Mask);
+ return Shuffle;
+}
+
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
@@ -11022,6 +11064,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
EVT VT = Op.getValueType();
assert(!VT.isScalableVector() &&
"Scalable vectors cannot be used with ISD::BUILD_VECTOR");
+
+ if (SDValue S = shuffleWithSingleLoad(Op, DAG))
+ return S;
+
unsigned NumElts = VT.getVectorNumElements();
struct ShuffleSourceInfo {
@@ -11048,6 +11094,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
// First gather all vectors used as an immediate source for this BUILD_VECTOR
// node.
+ //
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
@@ -21269,24 +21316,23 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
assert(LD->getOffset().isUndef() && "undef offset expected");
// Load 2 x i8, then 1 x i8.
- SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
+ SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr,
+ MF.getMachineMemOperand(MMO, 0, 2));
TypeSize Offset2 = TypeSize::getFixed(2);
SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
MF.getMachineMemOperand(MMO, 2, 1));
- // Extend to i32.
- SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
- SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
+ SDValue Ins16 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::v4i16, L16);
- // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
- SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
- DAG.getConstant(16, DL, MVT::i32));
- SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr);
- SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
+ SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Ins16);
+
+ SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
+ SDValue Trunc8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Ext8);
- // Extract v3i8 again.
- SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
+ SDValue Ins8 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i8, Cast,
+ Trunc8, DAG.getConstant(2, DL, MVT::i64));
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Ins8,
DAG.getConstant(0, DL, MVT::i64));
SDValue TokenFactor = DAG.getNode(
ISD::TokenFactor, DL, MVT::Other,
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 275e5ac8b7062e0..248aa20bab63298 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -5,10 +5,8 @@
define <16 x i8> @load_v3i8(ptr %src) {
; CHECK-LABEL: load_v3i8:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrh w9, [x0]
-; CHECK-NEXT: orr w8, w9, w8, lsl #16
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
+; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8:
@@ -38,12 +36,9 @@ define <16 x i8> @load_v3i8(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: orr w8, w9, w8, lsl #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
@@ -59,7 +54,6 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #2]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
-; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -76,12 +70,9 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_align_2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: orr w8, w9, w8, lsl #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
@@ -97,7 +88,6 @@ define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #2]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
-; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -141,12 +131,11 @@ define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ldrb w8, [x0, #3]
-; CHECK-NEXT: ldurh w9, [x0, #1]
+; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: orr w8, w9, w8, lsl #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: ld1r.4h { v0 }, [x8]
+; CHECK-NEXT: add x8, x0, #3
+; CHECK-NEXT: ld1.b { v0 }[2], [x8]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
@@ -162,7 +151,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #3]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
-; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -180,12 +168,11 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ldrb w8, [x0, #5]
-; CHECK-NEXT: ldurh w9, [x0, #3]
+; CHECK-NEXT: add x8, x0, #3
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: orr w8, w9, w8, lsl #16
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: ld1r.4h { v0 }, [x8]
+; CHECK-NEXT: add x8, x0, #5
+; CHECK-NEXT: ld1.b { v0 }[2], [x8]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
@@ -201,7 +188,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #5]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
-; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -263,7 +249,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) {
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ldrsb w8, [x0, #2]
; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: mov.h v0[1], v0[1]
; CHECK-NEXT: mov.h v0[2], w8
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
@@ -281,7 +266,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #2]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
-; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
@@ -410,12 +394,9 @@ entry:
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_to_64bits:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrh w9, [x0]
-; CHECK-NEXT: orr w8, w9, w8, lsl #16
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
; CHECK-NEXT: add x8, x1, #4
-; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
@@ -507,16 +488,13 @@ entry:
define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_add_to_64bits:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: ldrb w9, [x0, #2]
-; CHECK-NEXT: ldrh w10, [x0]
+; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
; CHECK-NEXT: Lloh2:
; CHECK-NEXT: adrp x8, lCPI13_0@PAGE
; CHECK-NEXT: Lloh3:
; CHECK-NEXT: ldr d1, [x8, lCPI13_0@PAGEOFF]
; CHECK-NEXT: add x8, x1, #4
-; CHECK-NEXT: orr w9, w10, w9, lsl #16
-; CHECK-NEXT: fmov s0, w9
-; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: add.4h v0, v0, v1
; CHECK-NEXT: st1.h { v0 }[2], [x8]