[llvm] df672f6 - [DAG] scalarizeExtractedVectorLoad - replace getABITypeAlign with allowsMemoryAccess (PR45116)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 1 13:08:17 PDT 2021
Author: Simon Pilgrim
Date: 2021-10-01T21:07:34+01:00
New Revision: df672f66b669ca6f347858d26af1cae0aeddf8c2
URL: https://github.com/llvm/llvm-project/commit/df672f66b669ca6f347858d26af1cae0aeddf8c2
DIFF: https://github.com/llvm/llvm-project/commit/df672f66b669ca6f347858d26af1cae0aeddf8c2.diff
LOG: [DAG] scalarizeExtractedVectorLoad - replace getABITypeAlign with allowsMemoryAccess (PR45116)
One of the cases identified in PR45116 - we don't need to limit extracted loads to ABI alignment; we can use allowsMemoryAccess instead, which still tests against getABITypeAlign but falls back to allowsMisalignedMemoryAccesses to check whether the target permits (fast) misaligned memory loads.
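For reference, the combined legality/alignment check now boils down to the following caller-side pattern (a condensed restatement of the DAGCombiner hunk below; IsFast reports whether the target considers a misaligned access fast):

bool IsFast = false;
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
                            OriginalLoad->getAddressSpace(), Alignment,
                            OriginalLoad->getMemOperand()->getFlags(),
                            &IsFast) ||
    !IsFast)
  return SDValue(); // reject: the access is illegal or known to be slow on this target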
I've also cleaned up the alignment calculation code - if we have a constant extraction index then the alignment can be based on the byte offset from the original vector load alignment, but for non-constant indices we should assume the worst case (single-element alignment only).
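As a standalone illustration of the alignment math (a minimal sketch using llvm::commonAlignment from llvm/Support/Alignment.h; the concrete numbers are hypothetical examples, not taken from the patch):

#include "llvm/Support/Alignment.h"
#include <cassert>
using namespace llvm;

int main() {
  // Constant index: combine the vector load's alignment with the byte offset
  // of the extracted element. An align-16 load with the element at byte
  // offset 4 only guarantees align 4 for the scalar load.
  assert(commonAlignment(Align(16), /*Offset=*/4) == Align(4));
  // A weakly aligned vector load (align 2, as in the <3 x i32> AArch64 test
  // below) stays at align 2 even for a nicely aligned constant offset.
  assert(commonAlignment(Align(2), /*Offset=*/8) == Align(2));
  // Variable index: fold in the element size (4 bytes for i32) as the
  // worst-case granularity, i.e. assume single-element alignment only.
  assert(commonAlignment(Align(2), /*EltSize=*/4) == Align(2));
  return 0;
}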
Differential Revision: https://reviews.llvm.org/D110486
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 22fea302410eb..61d74e76f96f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18662,32 +18662,35 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
if (!VecEltVT.isByteSized())
return SDValue();
- Align Alignment = OriginalLoad->getAlign();
- Align NewAlign = DAG.getDataLayout().getABITypeAlign(
- VecEltVT.getTypeForEVT(*DAG.getContext()));
-
- if (NewAlign > Alignment ||
- !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
+ ISD::LoadExtType ExtTy =
+ ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
+ if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
+ !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
return SDValue();
- ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
- ISD::NON_EXTLOAD : ISD::EXTLOAD;
- if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
- return SDValue();
-
- Alignment = NewAlign;
-
+ Align Alignment = OriginalLoad->getAlign();
MachinePointerInfo MPI;
SDLoc DL(EVE);
if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
int Elt = ConstEltNo->getZExtValue();
unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
+ Alignment = commonAlignment(Alignment, PtrOff);
} else {
// Discard the pointer info except the address space because the memory
// operand can't represent this new access since the offset is variable.
MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
+ Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
}
+
+ bool IsFast = false;
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
+ OriginalLoad->getAddressSpace(), Alignment,
+ OriginalLoad->getMemOperand()->getFlags(),
+ &IsFast) ||
+ !IsFast)
+ return SDValue();
+
SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(),
InVecVT, EltNo);
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index d4aaa9c1eecaf..076466dda5b11 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -9143,18 +9143,12 @@ define i32 @load_single_extract_variable_index_i32(<4 x i32>* %A, i32 %idx) {
define i32 @load_single_extract_variable_index_v3i32_small_align(<3 x i32>* %A, i32 %idx) {
; CHECK-LABEL: load_single_extract_variable_index_v3i32_small_align:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ld1.s { v0 }[2], [x8]
-; CHECK-NEXT: and x8, x1, #0x3
-; CHECK-NEXT: bfi x9, x8, #2, #2
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: ldr w0, [x9]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: cmp x8, #2
+; CHECK-NEXT: mov w9, #2
+; CHECK-NEXT: csel x8, x8, x9, lo
+; CHECK-NEXT: ldr w0, [x0, x8, lsl #2]
; CHECK-NEXT: ret
%lv = load <3 x i32>, <3 x i32>* %A, align 2
%e = extractelement <3 x i32> %lv, i32 %idx
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
index a4fa485f35b2a..2883beb6b01df 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -332,14 +332,14 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
; X86-SSE4A: # %bb.0:
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE4A-NEXT: movups (%ecx), %xmm0
-; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
+; X86-SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE4A-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
+; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
-; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
+; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax)
; X86-SSE4A-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
@@ -360,14 +360,14 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE4A: # %bb.0:
-; X64-SSE4A-NEXT: movups (%rdi), %xmm0
-; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
+; X64-SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE4A-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
-; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
+; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
-; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
@@ -445,14 +445,14 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE4A: # %bb.0:
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE4A-NEXT: movups (%ecx), %xmm0
-; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
+; X86-SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE4A-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
+; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
-; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
+; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax)
; X86-SSE4A-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1:
@@ -473,14 +473,14 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1:
; X64-SSE4A: # %bb.0:
-; X64-SSE4A-NEXT: movups (%rdi), %xmm0
-; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
+; X64-SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE4A-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
-; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
+; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
-; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1: