[llvm] 54a5dd4 - [DAGCombiner] allow store merging non-i8 truncated ops
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 26 12:26:01 PDT 2020
Author: Sanjay Patel
Date: 2020-08-26T15:23:08-04:00
New Revision: 54a5dd485c4d04d142a58c9349ada0c897cbeae6
URL: https://github.com/llvm/llvm-project/commit/54a5dd485c4d04d142a58c9349ada0c897cbeae6
DIFF: https://github.com/llvm/llvm-project/commit/54a5dd485c4d04d142a58c9349ada0c897cbeae6.diff
LOG: [DAGCombiner] allow store merging non-i8 truncated ops
We have a gap in our store merging capabilities for shift+truncate
patterns as discussed in:
https://llvm.org/PR46662
I generalized the code/comments for this function in earlier commits,
so we only need to ease the type restriction and adjust the address/endian
checking to make this work.
AArch64 lets us switch endianness to make sure that patterns are matched
either way.
Differential Revision: https://reviews.llvm.org/D86420
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/merge-trunc-store.ll
llvm/test/CodeGen/X86/stores-merging.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 40502f0c6993..445e2bff6c05 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6869,8 +6869,9 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
SmallVector<StoreSDNode *, 8> Stores;
for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
// TODO: Allow unordered atomics when wider type is legal (see D66309)
- if (Store->getMemoryVT() != MVT::i8 || !Store->isSimple() ||
- Store->isIndexed())
+ EVT MemVT = Store->getMemoryVT();
+ if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) ||
+ !Store->isSimple() || Store->isIndexed())
return SDValue();
Stores.push_back(Store);
Chain = Store->getChain();
@@ -6959,12 +6960,6 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
assert(FirstOffset != INT64_MAX && "First byte offset must be set");
assert(FirstStore && "First store must be set");
- // Check if the bytes of the combined value we are looking at match with
- // either big or little endian value store.
- Optional<bool> IsBigEndian = isBigEndian(OffsetMap, FirstOffset);
- if (!IsBigEndian.hasValue())
- return SDValue();
-
// Check that a store of the wide type is both allowed and fast on the target
const DataLayout &Layout = DAG.getDataLayout();
bool Fast = false;
@@ -6973,6 +6968,31 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
if (!Allowed || !Fast)
return SDValue();
+ // Check if the pieces of the value are going to the expected places in memory
+ // to merge the stores.
+ auto checkOffsets = [&](bool MatchLittleEndian) {
+ if (MatchLittleEndian) {
+ for (unsigned i = 0; i != NumStores; ++i)
+ if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset)
+ return false;
+ } else { // MatchBigEndian by reversing loop counter.
+ for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j)
+ if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset)
+ return false;
+ }
+ return true;
+ };
+
+ // Check if the offsets line up for the native data layout of this target.
+ bool NeedBswap = false;
+ if (!checkOffsets(Layout.isLittleEndian())) {
+ // Special-case: check if byte offsets line up for the opposite endian.
+ // TODO: We could use rotates for 16/32-bit merge pairs.
+ if (NarrowNumBits != 8 || !checkOffsets(Layout.isBigEndian()))
+ return SDValue();
+ NeedBswap = true;
+ }
+
SDLoc DL(N);
if (WideVT != SourceValue.getValueType()) {
assert(SourceValue.getValueType().getSizeInBits() > WideNumBits &&
@@ -6983,7 +7003,6 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
// Before legalize we can introduce illegal bswaps which will be later
// converted to an explicit bswap sequence. This way we end up with a single
// store and byte shuffling instead of several stores and byte shuffling.
- bool NeedBswap = Layout.isBigEndian() != *IsBigEndian;
if (NeedBswap)
SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
index 9d9ea3ec8951..3f8fa3e9e383 100644
--- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
@@ -200,12 +200,17 @@ define void @be_i32_to_i8_order(i32 %x, i8* %p0) {
}
define void @le_i32_to_i16(i32 %x, i16* %p0) {
-; CHECK-LABEL: le_i32_to_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, #16
-; CHECK-NEXT: strh w0, [x1]
-; CHECK-NEXT: strh w8, [x1, #2]
-; CHECK-NEXT: ret
+; LE-LABEL: le_i32_to_i16:
+; LE: // %bb.0:
+; LE-NEXT: str w0, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: le_i32_to_i16:
+; BE: // %bb.0:
+; BE-NEXT: lsr w8, w0, #16
+; BE-NEXT: strh w0, [x1]
+; BE-NEXT: strh w8, [x1, #2]
+; BE-NEXT: ret
%sh1 = lshr i32 %x, 16
%t0 = trunc i32 %x to i16
%t1 = trunc i32 %sh1 to i16
@@ -216,12 +221,17 @@ define void @le_i32_to_i16(i32 %x, i16* %p0) {
}
define void @le_i32_to_i16_order(i32 %x, i16* %p0) {
-; CHECK-LABEL: le_i32_to_i16_order:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, #16
-; CHECK-NEXT: strh w8, [x1, #2]
-; CHECK-NEXT: strh w0, [x1]
-; CHECK-NEXT: ret
+; LE-LABEL: le_i32_to_i16_order:
+; LE: // %bb.0:
+; LE-NEXT: str w0, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: le_i32_to_i16_order:
+; BE: // %bb.0:
+; BE-NEXT: lsr w8, w0, #16
+; BE-NEXT: strh w8, [x1, #2]
+; BE-NEXT: strh w0, [x1]
+; BE-NEXT: ret
%sh1 = lshr i32 %x, 16
%t0 = trunc i32 %x to i16
%t1 = trunc i32 %sh1 to i16
@@ -232,12 +242,17 @@ define void @le_i32_to_i16_order(i32 %x, i16* %p0) {
}
define void @be_i32_to_i16(i32 %x, i16* %p0) {
-; CHECK-LABEL: be_i32_to_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, #16
-; CHECK-NEXT: strh w0, [x1, #2]
-; CHECK-NEXT: strh w8, [x1]
-; CHECK-NEXT: ret
+; LE-LABEL: be_i32_to_i16:
+; LE: // %bb.0:
+; LE-NEXT: lsr w8, w0, #16
+; LE-NEXT: strh w0, [x1, #2]
+; LE-NEXT: strh w8, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: be_i32_to_i16:
+; BE: // %bb.0:
+; BE-NEXT: str w0, [x1]
+; BE-NEXT: ret
%sh1 = lshr i32 %x, 16
%t0 = trunc i32 %x to i16
%t1 = trunc i32 %sh1 to i16
@@ -248,12 +263,17 @@ define void @be_i32_to_i16(i32 %x, i16* %p0) {
}
define void @be_i32_to_i16_order(i32 %x, i16* %p0) {
-; CHECK-LABEL: be_i32_to_i16_order:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, #16
-; CHECK-NEXT: strh w8, [x1]
-; CHECK-NEXT: strh w0, [x1, #2]
-; CHECK-NEXT: ret
+; LE-LABEL: be_i32_to_i16_order:
+; LE: // %bb.0:
+; LE-NEXT: lsr w8, w0, #16
+; LE-NEXT: strh w8, [x1]
+; LE-NEXT: strh w0, [x1, #2]
+; LE-NEXT: ret
+;
+; BE-LABEL: be_i32_to_i16_order:
+; BE: // %bb.0:
+; BE-NEXT: str w0, [x1]
+; BE-NEXT: ret
%sh1 = lshr i32 %x, 16
%t0 = trunc i32 %x to i16
%t1 = trunc i32 %sh1 to i16
@@ -440,16 +460,21 @@ define void @be_i64_to_i8_order(i64 %x, i8* %p0) {
}
define void @le_i64_to_i16(i64 %x, i16* %p0) {
-; CHECK-LABEL: le_i64_to_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #16
-; CHECK-NEXT: lsr x9, x0, #32
-; CHECK-NEXT: lsr x10, x0, #48
-; CHECK-NEXT: strh w0, [x1]
-; CHECK-NEXT: strh w8, [x1, #2]
-; CHECK-NEXT: strh w9, [x1, #4]
-; CHECK-NEXT: strh w10, [x1, #6]
-; CHECK-NEXT: ret
+; LE-LABEL: le_i64_to_i16:
+; LE: // %bb.0:
+; LE-NEXT: str x0, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: le_i64_to_i16:
+; BE: // %bb.0:
+; BE-NEXT: lsr x8, x0, #16
+; BE-NEXT: lsr x9, x0, #32
+; BE-NEXT: lsr x10, x0, #48
+; BE-NEXT: strh w0, [x1]
+; BE-NEXT: strh w8, [x1, #2]
+; BE-NEXT: strh w9, [x1, #4]
+; BE-NEXT: strh w10, [x1, #6]
+; BE-NEXT: ret
%sh1 = lshr i64 %x, 16
%sh2 = lshr i64 %x, 32
%sh3 = lshr i64 %x, 48
@@ -468,16 +493,21 @@ define void @le_i64_to_i16(i64 %x, i16* %p0) {
}
define void @le_i64_to_i16_order(i64 %x, i16* %p0) {
-; CHECK-LABEL: le_i64_to_i16_order:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #16
-; CHECK-NEXT: lsr x9, x0, #32
-; CHECK-NEXT: lsr x10, x0, #48
-; CHECK-NEXT: strh w0, [x1]
-; CHECK-NEXT: strh w8, [x1, #2]
-; CHECK-NEXT: strh w10, [x1, #6]
-; CHECK-NEXT: strh w9, [x1, #4]
-; CHECK-NEXT: ret
+; LE-LABEL: le_i64_to_i16_order:
+; LE: // %bb.0:
+; LE-NEXT: str x0, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: le_i64_to_i16_order:
+; BE: // %bb.0:
+; BE-NEXT: lsr x8, x0, #16
+; BE-NEXT: lsr x9, x0, #32
+; BE-NEXT: lsr x10, x0, #48
+; BE-NEXT: strh w0, [x1]
+; BE-NEXT: strh w8, [x1, #2]
+; BE-NEXT: strh w10, [x1, #6]
+; BE-NEXT: strh w9, [x1, #4]
+; BE-NEXT: ret
%sh1 = lshr i64 %x, 16
%sh2 = lshr i64 %x, 32
%sh3 = lshr i64 %x, 48
@@ -496,16 +526,21 @@ define void @le_i64_to_i16_order(i64 %x, i16* %p0) {
}
define void @be_i64_to_i16(i64 %x, i16* %p0) {
-; CHECK-LABEL: be_i64_to_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #16
-; CHECK-NEXT: lsr x9, x0, #32
-; CHECK-NEXT: lsr x10, x0, #48
-; CHECK-NEXT: strh w0, [x1, #6]
-; CHECK-NEXT: strh w8, [x1, #4]
-; CHECK-NEXT: strh w9, [x1, #2]
-; CHECK-NEXT: strh w10, [x1]
-; CHECK-NEXT: ret
+; LE-LABEL: be_i64_to_i16:
+; LE: // %bb.0:
+; LE-NEXT: lsr x8, x0, #16
+; LE-NEXT: lsr x9, x0, #32
+; LE-NEXT: lsr x10, x0, #48
+; LE-NEXT: strh w0, [x1, #6]
+; LE-NEXT: strh w8, [x1, #4]
+; LE-NEXT: strh w9, [x1, #2]
+; LE-NEXT: strh w10, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: be_i64_to_i16:
+; BE: // %bb.0:
+; BE-NEXT: str x0, [x1]
+; BE-NEXT: ret
%sh1 = lshr i64 %x, 16
%sh2 = lshr i64 %x, 32
%sh3 = lshr i64 %x, 48
@@ -524,16 +559,21 @@ define void @be_i64_to_i16(i64 %x, i16* %p0) {
}
define void @be_i64_to_i16_order(i64 %x, i16* %p0) {
-; CHECK-LABEL: be_i64_to_i16_order:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #16
-; CHECK-NEXT: lsr x9, x0, #32
-; CHECK-NEXT: lsr x10, x0, #48
-; CHECK-NEXT: strh w0, [x1, #6]
-; CHECK-NEXT: strh w10, [x1]
-; CHECK-NEXT: strh w9, [x1, #2]
-; CHECK-NEXT: strh w8, [x1, #4]
-; CHECK-NEXT: ret
+; LE-LABEL: be_i64_to_i16_order:
+; LE: // %bb.0:
+; LE-NEXT: lsr x8, x0, #16
+; LE-NEXT: lsr x9, x0, #32
+; LE-NEXT: lsr x10, x0, #48
+; LE-NEXT: strh w0, [x1, #6]
+; LE-NEXT: strh w10, [x1]
+; LE-NEXT: strh w9, [x1, #2]
+; LE-NEXT: strh w8, [x1, #4]
+; LE-NEXT: ret
+;
+; BE-LABEL: be_i64_to_i16_order:
+; BE: // %bb.0:
+; BE-NEXT: str x0, [x1]
+; BE-NEXT: ret
%sh1 = lshr i64 %x, 16
%sh2 = lshr i64 %x, 32
%sh3 = lshr i64 %x, 48
@@ -552,11 +592,16 @@ define void @be_i64_to_i16_order(i64 %x, i16* %p0) {
}
define void @le_i64_to_i32(i64 %x, i32* %p0) {
-; CHECK-LABEL: le_i64_to_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #32
-; CHECK-NEXT: stp w0, w8, [x1]
-; CHECK-NEXT: ret
+; LE-LABEL: le_i64_to_i32:
+; LE: // %bb.0:
+; LE-NEXT: str x0, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: le_i64_to_i32:
+; BE: // %bb.0:
+; BE-NEXT: lsr x8, x0, #32
+; BE-NEXT: stp w0, w8, [x1]
+; BE-NEXT: ret
%sh1 = lshr i64 %x, 32
%t0 = trunc i64 %x to i32
%t1 = trunc i64 %sh1 to i32
@@ -567,11 +612,16 @@ define void @le_i64_to_i32(i64 %x, i32* %p0) {
}
define void @le_i64_to_i32_order(i64 %x, i32* %p0) {
-; CHECK-LABEL: le_i64_to_i32_order:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #32
-; CHECK-NEXT: stp w0, w8, [x1]
-; CHECK-NEXT: ret
+; LE-LABEL: le_i64_to_i32_order:
+; LE: // %bb.0:
+; LE-NEXT: str x0, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: le_i64_to_i32_order:
+; BE: // %bb.0:
+; BE-NEXT: lsr x8, x0, #32
+; BE-NEXT: stp w0, w8, [x1]
+; BE-NEXT: ret
%sh1 = lshr i64 %x, 32
%t0 = trunc i64 %x to i32
%t1 = trunc i64 %sh1 to i32
@@ -582,11 +632,16 @@ define void @le_i64_to_i32_order(i64 %x, i32* %p0) {
}
define void @be_i64_to_i32(i64 %x, i32* %p0) {
-; CHECK-LABEL: be_i64_to_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #32
-; CHECK-NEXT: stp w8, w0, [x1]
-; CHECK-NEXT: ret
+; LE-LABEL: be_i64_to_i32:
+; LE: // %bb.0:
+; LE-NEXT: lsr x8, x0, #32
+; LE-NEXT: stp w8, w0, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: be_i64_to_i32:
+; BE: // %bb.0:
+; BE-NEXT: str x0, [x1]
+; BE-NEXT: ret
%sh1 = lshr i64 %x, 32
%t0 = trunc i64 %x to i32
%t1 = trunc i64 %sh1 to i32
@@ -597,11 +652,16 @@ define void @be_i64_to_i32(i64 %x, i32* %p0) {
}
define void @be_i64_to_i32_order(i64 %x, i32* %p0) {
-; CHECK-LABEL: be_i64_to_i32_order:
-; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, #32
-; CHECK-NEXT: stp w8, w0, [x1]
-; CHECK-NEXT: ret
+; LE-LABEL: be_i64_to_i32_order:
+; LE: // %bb.0:
+; LE-NEXT: lsr x8, x0, #32
+; LE-NEXT: stp w8, w0, [x1]
+; LE-NEXT: ret
+;
+; BE-LABEL: be_i64_to_i32_order:
+; BE: // %bb.0:
+; BE-NEXT: str x0, [x1]
+; BE-NEXT: ret
%sh1 = lshr i64 %x, 32
%t0 = trunc i64 %x to i32
%t1 = trunc i64 %sh1 to i32
@@ -611,6 +671,8 @@ define void @be_i64_to_i32_order(i64 %x, i32* %p0) {
ret void
}
+; Negative test - not consecutive addresses
+
define void @i64_to_i32_wrong_addr(i64 %x, i32* %p0) {
; CHECK-LABEL: i64_to_i32_wrong_addr:
; CHECK: // %bb.0:
@@ -627,6 +689,8 @@ define void @i64_to_i32_wrong_addr(i64 %x, i32* %p0) {
ret void
}
+; Negative test - addresses don't line up with shift amounts
+
define void @i64_to_i16_wrong_order(i64 %x, i16* %p0) {
; CHECK-LABEL: i64_to_i16_wrong_order:
; CHECK: // %bb.0:
@@ -655,6 +719,8 @@ define void @i64_to_i16_wrong_order(i64 %x, i16* %p0) {
ret void
}
+; Negative test - no store of 't1'
+
define void @i32_to_i8_incomplete(i32 %x, i8* %p0) {
; CHECK-LABEL: i32_to_i8_incomplete:
; CHECK: // %bb.0:
@@ -680,6 +746,8 @@ define void @i32_to_i8_incomplete(i32 %x, i8* %p0) {
ret void
}
+; Negative test - no store of 't3'
+
define void @i64_to_i8_incomplete(i64 %x, i8* %p0) {
; CHECK-LABEL: i64_to_i8_incomplete:
; CHECK: // %bb.0:
@@ -729,6 +797,8 @@ define void @i64_to_i8_incomplete(i64 %x, i8* %p0) {
ret void
}
+; Negative test - not consecutive addresses
+
define void @i32_to_i16_wrong_addr(i32 %x, i16* %p0) {
; CHECK-LABEL: i32_to_i16_wrong_addr:
; CHECK: // %bb.0:
@@ -745,6 +815,8 @@ define void @i32_to_i16_wrong_addr(i32 %x, i16* %p0) {
ret void
}
+; Negative test - addresses don't line up with shift amounts
+
define void @i32_to_i8_wrong_order(i32 %x, i8* %p0) {
; CHECK-LABEL: i32_to_i8_wrong_order:
; CHECK: // %bb.0:
diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll
index 4467fec9f2b4..85a086503410 100644
--- a/llvm/test/CodeGen/X86/stores-merging.ll
+++ b/llvm/test/CodeGen/X86/stores-merging.ll
@@ -468,9 +468,7 @@ define void @trunc_i32_to_i8(i32 %x, i8* %p) {
define void @trunc_i32_to_i16(i32 %x, i16* %p) {
; CHECK-LABEL: trunc_i32_to_i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: movw %di, (%rsi)
-; CHECK-NEXT: shrl $16, %edi
-; CHECK-NEXT: movw %di, 2(%rsi)
+; CHECK-NEXT: movl %edi, (%rsi)
; CHECK-NEXT: retq
%t1 = trunc i32 %x to i16
%sh = lshr i32 %x, 16
@@ -522,15 +520,7 @@ define void @trunc_i64_to_i8(i64 %x, i8* %p) {
define void @trunc_i64_to_i16(i64 %x, i16* %p) {
; CHECK-LABEL: trunc_i64_to_i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: movw %di, (%rsi)
-; CHECK-NEXT: shrq $16, %rdi
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: shrq $48, %rcx
-; CHECK-NEXT: movw %di, 2(%rsi)
-; CHECK-NEXT: movw %ax, 4(%rsi)
-; CHECK-NEXT: movw %cx, 6(%rsi)
+; CHECK-NEXT: movq %rdi, (%rsi)
; CHECK-NEXT: retq
%t1 = trunc i64 %x to i16
%sh1 = lshr i64 %x, 16
@@ -552,9 +542,7 @@ define void @trunc_i64_to_i16(i64 %x, i16* %p) {
define void @trunc_i64_to_i32(i64 %x, i32* %p) {
; CHECK-LABEL: trunc_i64_to_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, (%rsi)
-; CHECK-NEXT: shrq $32, %rdi
-; CHECK-NEXT: movl %edi, 4(%rsi)
+; CHECK-NEXT: movq %rdi, (%rsi)
; CHECK-NEXT: retq
%t1 = trunc i64 %x to i32
%sh = lshr i64 %x, 32
More information about the llvm-commits
mailing list