[llvm-branch-commits] [llvm] [SelectionDAG][X86] Widen <2 x T> vector types for atomic load (PR #120598)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jan 22 10:20:02 PST 2025
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120598
>From 91864936c2f03d83f25b3274a66fc838a54f861c Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Thu, 19 Dec 2024 11:19:39 -0500
Subject: [PATCH] [SelectionDAG][X86] Widen <2 x T> vector types for atomic
load
Vector types of 2 elements must be widened. This change does this
for vector types of atomic load in SelectionDAG
so that it can translate aligned vectors of >1 size. Also,
it also adds Pats to remove an extra MOV.
commit-id:2894ccd1
---
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 79 ++++++++++++++++++
llvm/lib/Target/X86/X86InstrCompiler.td | 7 ++
llvm/test/CodeGen/X86/atomic-load-store.ll | 81 +++++++++++++++++++
llvm/test/CodeGen/X86/atomic-unordered.ll | 3 +-
5 files changed, 169 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 893fca233d191b..593d511f50f5c3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1048,6 +1048,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N);
SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
+ SDValue WidenVecRes_ATOMIC_LOAD(AtomicSDNode *N);
SDValue WidenVecRes_LOAD(SDNode* N);
SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N);
SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 980c157fe6f3f1..db2641b207733f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4521,6 +4521,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
break;
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::ATOMIC_LOAD:
+ Res = WidenVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N));
+ break;
case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
case ISD::STEP_VECTOR:
case ISD::SPLAT_VECTOR:
@@ -5907,6 +5910,82 @@ SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
N->getOperand(1), N->getOperand(2));
}
+static std::optional<EVT> findMemType(SelectionDAG &DAG,
+ const TargetLowering &TLI, unsigned Width,
+ EVT WidenVT, unsigned Align,
+ unsigned WidenEx);
+
+SDValue DAGTypeLegalizer::WidenVecRes_ATOMIC_LOAD(AtomicSDNode *LD) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+ EVT LdVT = LD->getMemoryVT();
+ SDLoc dl(LD);
+ assert(LdVT.isVector() && WidenVT.isVector());
+ assert(LdVT.isScalableVector() == WidenVT.isScalableVector());
+ assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
+
+ // Load information
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ TypeSize LdWidth = LdVT.getSizeInBits();
+ TypeSize WidenWidth = WidenVT.getSizeInBits();
+ TypeSize WidthDiff = WidenWidth - LdWidth;
+ // Allow wider loads if they are sufficiently aligned to avoid memory faults
+ // and if the original load is simple.
+ unsigned LdAlign =
+ (!LD->isSimple() || LdVT.isScalableVector()) ? 0 : LD->getAlign().value();
+
+ // Find the vector type that can load from.
+ std::optional<EVT> FirstVT =
+ findMemType(DAG, TLI, LdWidth.getKnownMinValue(), WidenVT, LdAlign,
+ WidthDiff.getKnownMinValue());
+
+ if (!FirstVT)
+ return SDValue();
+
+ SmallVector<EVT, 8> MemVTs;
+ TypeSize FirstVTWidth = FirstVT->getSizeInBits();
+
+ SDValue LdOp = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, *FirstVT, *FirstVT, Chain,
+ BasePtr, LD->getMemOperand());
+
+ // Load the element with one instruction.
+ SDValue Result;
+ assert(TypeSize::isKnownLE(LdWidth, FirstVTWidth));
+ if (!FirstVT->isVector()) {
+ unsigned NumElts =
+ WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue();
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), *FirstVT, NumElts);
+ SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+ Result = DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
+ }
+ else if (FirstVT == WidenVT)
+ Result = LdOp;
+ else {
+ // TODO: We don't currently have any tests that exercise this code path.
+ assert(WidenWidth.getFixedValue() % FirstVTWidth.getFixedValue() == 0);
+ unsigned NumConcat =
+ WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue();
+ SmallVector<SDValue, 16> ConcatOps(NumConcat);
+ SDValue UndefVal = DAG.getUNDEF(*FirstVT);
+ ConcatOps[0] = LdOp;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ ConcatOps[i] = UndefVal;
+ Result = DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
+ }
+
+ if (Result) {
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(LD, 1), LdOp.getValue(1));
+ return Result;
+ }
+
+ report_fatal_error("Unable to widen atomic vector load");
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 7d4c5c0e10e492..7d8845b91fd1ed 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1198,6 +1198,13 @@ def : Pat<(i16 (atomic_load_16 addr:$src)), (MOV16rm addr:$src)>;
def : Pat<(i32 (atomic_load_32 addr:$src)), (MOV32rm addr:$src)>;
def : Pat<(i64 (atomic_load_64 addr:$src)), (MOV64rm addr:$src)>;
+def : Pat<(v4i32 (scalar_to_vector (i32 (anyext (i16 (atomic_load_16 addr:$src)))))),
+ (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i8>
+def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i16>
+def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
+ (MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float>
+
// Floating point loads/stores.
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
(MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 39e9fdfa5e62b0..935d058a52f8fe 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -146,6 +146,64 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
ret <1 x i64> %ret
}
+define <2 x i8> @atomic_vec2_i8(ptr %x) {
+; CHECK3-LABEL: atomic_vec2_i8:
+; CHECK3: ## %bb.0:
+; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK3-NEXT: retq
+;
+; CHECK0-LABEL: atomic_vec2_i8:
+; CHECK0: ## %bb.0:
+; CHECK0-NEXT: movw (%rdi), %cx
+; CHECK0-NEXT: ## implicit-def: $eax
+; CHECK0-NEXT: movw %cx, %ax
+; CHECK0-NEXT: movd %eax, %xmm0
+; CHECK0-NEXT: retq
+ %ret = load atomic <2 x i8>, ptr %x acquire, align 4
+ ret <2 x i8> %ret
+}
+
+define <2 x i16> @atomic_vec2_i16(ptr %x) {
+; CHECK3-LABEL: atomic_vec2_i16:
+; CHECK3: ## %bb.0:
+; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK3-NEXT: retq
+;
+; CHECK0-LABEL: atomic_vec2_i16:
+; CHECK0: ## %bb.0:
+; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK0-NEXT: retq
+ %ret = load atomic <2 x i16>, ptr %x acquire, align 4
+ ret <2 x i16> %ret
+}
+
+define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr %x) {
+; CHECK-LABEL: atomic_vec2_ptr270:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
+ ret <2 x ptr addrspace(270)> %ret
+}
+
+define <2 x i32> @atomic_vec2_i32_align(ptr %x) {
+; CHECK-LABEL: atomic_vec2_i32_align:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %ret = load atomic <2 x i32>, ptr %x acquire, align 8
+ ret <2 x i32> %ret
+}
+
+define <2 x float> @atomic_vec2_float_align(ptr %x) {
+; CHECK-LABEL: atomic_vec2_float_align:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %ret = load atomic <2 x float>, ptr %x acquire, align 8
+ ret <2 x float> %ret
+}
+
define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
; CHECK3-LABEL: atomic_vec1_ptr:
; CHECK3: ## %bb.0:
@@ -295,6 +353,29 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
ret <2 x i32> %ret
}
+define <4 x i8> @atomic_vec4_i8(ptr %x) nounwind {
+; CHECK3-LABEL: atomic_vec4_i8:
+; CHECK3: ## %bb.0:
+; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK3-NEXT: retq
+;
+; CHECK0-LABEL: atomic_vec4_i8:
+; CHECK0: ## %bb.0:
+; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK0-NEXT: retq
+ %ret = load atomic <4 x i8>, ptr %x acquire, align 4
+ ret <4 x i8> %ret
+}
+
+define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
+; CHECK-LABEL: atomic_vec4_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %ret = load atomic <4 x i16>, ptr %x acquire, align 8
+ ret <4 x i16> %ret
+}
+
define <4 x float> @atomic_vec4_float_align(ptr %x) nounwind {
; CHECK-LABEL: atomic_vec4_float_align:
; CHECK: ## %bb.0:
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a3..e2803d206c6c57 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2275,8 +2275,7 @@ define i64 @load_i16_anyext_i64(ptr %ptr) {
;
; CHECK-O3-LABEL: load_i16_anyext_i64:
; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-O3-NEXT: vmovd %eax, %xmm0
+; CHECK-O3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-O3-NEXT: vmovq %xmm0, %rax
; CHECK-O3-NEXT: retq
%v = load atomic i16, ptr %ptr unordered, align 8
More information about the llvm-branch-commits
mailing list