[llvm-branch-commits] [llvm] [SelectionDAG] Split vector types for atomic load (PR #165818)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Oct 31 09:05:34 PDT 2025
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/165818
>From 5c2428c58f77157dc79aa76478beb03e5cf45f1f Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Thu, 30 Oct 2025 12:19:59 -0400
Subject: [PATCH] [SelectionDAG] Split vector types for atomic load
Vector types that aren't widened are split
so that a single ATOMIC_LOAD is issued for the entire vector at once.
This change utilizes the load vectorization infrastructure in
SelectionDAG in order to group the vectors. This enables SelectionDAG
to translate vectors with bfloat and half element types.
---
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 37 ++
llvm/test/CodeGen/X86/atomic-load-store.ll | 352 +++++++++++++++++-
3 files changed, 386 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22f9fd548f52b..e34b9fa8e787c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
SplitVecRes_STEP_VECTOR(N, Lo, Hi);
break;
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+ case ISD::ATOMIC_LOAD:
+ SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+ break;
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
}
+/// Split the vector result of an ATOMIC_LOAD into \p Lo and \p Hi without
+/// splitting the memory access itself.
+///
+/// The whole vector is loaded with a single integer ATOMIC_LOAD of the same
+/// bit width, the two parts are carved out of that integer with SRL/TRUNCATE,
+/// and each part is bitcast to its split vector type. Users of the original
+/// chain result are redirected to the chain of the new load.
+///
+/// \param LD  The non-extending atomic load whose vector result is split.
+/// \param Lo  Receives the low part (type taken from GetSplitDestVTs).
+/// \param Hi  Receives the high part (type taken from GetSplitDestVTs).
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+                                               SDValue &Hi) {
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Extended load during type legalization!");
+  SDLoc dl(LD);
+  EVT VT = LD->getValueType(0);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  // Load the entire vector as one integer so the access stays a single
+  // atomic operation.
+  LLVMContext &Ctx = *DAG.getContext();
+  EVT IntVT = EVT::getIntegerVT(Ctx, VT.getSizeInBits());
+  EVT MemIntVT = EVT::getIntegerVT(Ctx, LD->getMemoryVT().getSizeInBits());
+  SDValue ALD =
+      DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, LD->getChain(),
+                        LD->getBasePtr(), LD->getMemOperand());
+
+  // Returns the integer of PartVT's width found at bit Offset of the load.
+  // The shift amount is built with the shift-amount type for IntVT rather
+  // than the pointer type.
+  auto ExtractBits = [&](EVT PartVT, uint64_t Offset) {
+    SDValue Bits = ALD;
+    if (Offset != 0)
+      Bits = DAG.getNode(ISD::SRL, dl, IntVT, Bits,
+                         DAG.getShiftAmountConstant(Offset, IntVT, dl));
+    EVT PartIntVT = EVT::getIntegerVT(Ctx, PartVT.getSizeInBits());
+    return DAG.getNode(ISD::TRUNCATE, dl, PartIntVT, Bits);
+  };
+
+  // Use the real widths of the two parts instead of assuming equal halves.
+  // On big-endian targets the leading vector elements (Lo) sit in the most
+  // significant bits of the loaded integer, so the bit offsets are mirrored.
+  const uint64_t LoBits = LoVT.getFixedSizeInBits();
+  const uint64_t HiBits = HiVT.getFixedSizeInBits();
+  const bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+  SDValue ExtractLo = ExtractBits(LoVT, IsBigEndian ? HiBits : 0);
+  SDValue ExtractHi = ExtractBits(HiVT, IsBigEndian ? 0 : LoBits);
+
+  Lo = DAG.getBitcast(LoVT, ExtractLo);
+  Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 7e15b9303887f..928dfef3143da 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3,CHECK-SSE2-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3,CHECK-SSE4-O3
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX2-O3
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX512-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0,CHECK-SSE2-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0,CHECK-SSE4-O0
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX2-O0
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX512-O0
@@ -295,6 +295,96 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
ret <2 x float> %ret
}
+define <2 x half> @atomic_vec2_half(ptr %x) {
+; CHECK-SSE-O3-LABEL: atomic_vec2_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: shrl $16, %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE-O0-NEXT: movw %cx, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %dx, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT: movw %ax, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <2 x half>, ptr %x acquire, align 4
+ ret <2 x half> %ret
+}
+define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
+; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: shrl $16, %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: shrl $16, %eax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE-O0-NEXT: # kill: def $cx killed $cx killed $ecx
+; CHECK-SSE-O0-NEXT: movw %ax, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %dx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movl (%rdi), %eax
+; CHECK-AVX-O0-NEXT: movw %ax, %cx
+; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: shrl $16, %eax
+; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
+ ret <2 x bfloat> %ret
+}
define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
; CHECK-O3-LABEL: atomic_vec1_ptr:
; CHECK-O3: # %bb.0:
@@ -585,6 +675,260 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
ret <4 x i16> %ret
}
+define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
+; CHECK-SSE2-O3-LABEL: atomic_vec4_half:
+; CHECK-SSE2-O3: # %bb.0:
+; CHECK-SSE2-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE2-O3-NEXT: movl %eax, %ecx
+; CHECK-SSE2-O3-NEXT: shrl $16, %ecx
+; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE2-O3-NEXT: movq %rax, %rcx
+; CHECK-SSE2-O3-NEXT: shrq $32, %rcx
+; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE2-O3-NEXT: shrq $48, %rax
+; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE2-O3-NEXT: retq
+;
+; CHECK-SSE4-O3-LABEL: atomic_vec4_half:
+; CHECK-SSE4-O3: # %bb.0:
+; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE4-O3-NEXT: movl %eax, %ecx
+; CHECK-SSE4-O3-NEXT: shrl $16, %ecx
+; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE4-O3-NEXT: movq %rax, %rcx
+; CHECK-SSE4-O3-NEXT: shrq $32, %rcx
+; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE4-O3-NEXT: shrq $48, %rax
+; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE4-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE2-O0-LABEL: atomic_vec4_half:
+; CHECK-SSE2-O0: # %bb.0:
+; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax
+; CHECK-SSE2-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE2-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE2-O0-NEXT: movw %cx, %dx
+; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE2-O0-NEXT: movw %dx, %cx
+; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2
+; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE2-O0-NEXT: movw %ax, %dx
+; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE2-O0-NEXT: movw %dx, %cx
+; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm0
+; CHECK-SSE2-O0-NEXT: movq %rax, %rcx
+; CHECK-SSE2-O0-NEXT: shrq $32, %rcx
+; CHECK-SSE2-O0-NEXT: movw %cx, %dx
+; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE2-O0-NEXT: movw %dx, %cx
+; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE2-O0-NEXT: shrq $48, %rax
+; CHECK-SSE2-O0-NEXT: movw %ax, %cx
+; CHECK-SSE2-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE2-O0-NEXT: movw %cx, %ax
+; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm3
+; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-O0-NEXT: retq
+;
+; CHECK-SSE4-O0-LABEL: atomic_vec4_half:
+; CHECK-SSE4-O0: # %bb.0:
+; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax
+; CHECK-SSE4-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE4-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE4-O0-NEXT: movw %cx, %dx
+; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE4-O0-NEXT: movw %dx, %cx
+; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2
+; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE4-O0-NEXT: movw %ax, %dx
+; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE4-O0-NEXT: movw %dx, %cx
+; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm0
+; CHECK-SSE4-O0-NEXT: movq %rax, %rcx
+; CHECK-SSE4-O0-NEXT: shrq $32, %rcx
+; CHECK-SSE4-O0-NEXT: movw %cx, %dx
+; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE4-O0-NEXT: movw %dx, %cx
+; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE4-O0-NEXT: shrq $48, %rax
+; CHECK-SSE4-O0-NEXT: movw %ax, %cx
+; CHECK-SSE4-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE4-O0-NEXT: movw %cx, %ax
+; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm3
+; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-SSE4-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <4 x half>, ptr %x acquire, align 8
+ ret <4 x half> %ret
+}
+define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
+; CHECK-SSE2-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-SSE2-O3: # %bb.0:
+; CHECK-SSE2-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE2-O3-NEXT: movq %rax, %rcx
+; CHECK-SSE2-O3-NEXT: movq %rax, %rdx
+; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE2-O3-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-SSE2-O3-NEXT: shrl $16, %eax
+; CHECK-SSE2-O3-NEXT: shrq $32, %rcx
+; CHECK-SSE2-O3-NEXT: shrq $48, %rdx
+; CHECK-SSE2-O3-NEXT: pinsrw $0, %edx, %xmm1
+; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-SSE2-O3-NEXT: retq
+;
+; CHECK-SSE4-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-SSE4-O3: # %bb.0:
+; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE4-O3-NEXT: movq %rax, %rcx
+; CHECK-SSE4-O3-NEXT: movq %rax, %rdx
+; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE4-O3-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-SSE4-O3-NEXT: shrl $16, %eax
+; CHECK-SSE4-O3-NEXT: shrq $32, %rcx
+; CHECK-SSE4-O3-NEXT: shrq $48, %rdx
+; CHECK-SSE4-O3-NEXT: pinsrw $0, %edx, %xmm1
+; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE4-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movq (%rdi), %rax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX-O3-NEXT: shrq $48, %rcx
+; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX-O3-NEXT: shrq $32, %rcx
+; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: shrl $16, %eax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE2-O0-LABEL: atomic_vec4_bfloat:
+; CHECK-SSE2-O0: # %bb.0:
+; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax
+; CHECK-SSE2-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE2-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE2-O0-NEXT: # kill: def $cx killed $cx killed $ecx
+; CHECK-SSE2-O0-NEXT: movw %ax, %dx
+; CHECK-SSE2-O0-NEXT: movq %rax, %rsi
+; CHECK-SSE2-O0-NEXT: shrq $32, %rsi
+; CHECK-SSE2-O0-NEXT: # kill: def $si killed $si killed $rsi
+; CHECK-SSE2-O0-NEXT: shrq $48, %rax
+; CHECK-SSE2-O0-NEXT: movw %ax, %di
+; CHECK-SSE2-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE2-O0-NEXT: movw %di, %ax
+; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE2-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE2-O0-NEXT: movw %si, %ax
+; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE2-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE2-O0-NEXT: movw %dx, %ax
+; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE2-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE2-O0-NEXT: movw %cx, %ax
+; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2
+; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm2
+; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-O0-NEXT: retq
+;
+; CHECK-SSE4-O0-LABEL: atomic_vec4_bfloat:
+; CHECK-SSE4-O0: # %bb.0:
+; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax
+; CHECK-SSE4-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE4-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE4-O0-NEXT: # kill: def $cx killed $cx killed $ecx
+; CHECK-SSE4-O0-NEXT: movw %ax, %dx
+; CHECK-SSE4-O0-NEXT: movq %rax, %rsi
+; CHECK-SSE4-O0-NEXT: shrq $32, %rsi
+; CHECK-SSE4-O0-NEXT: # kill: def $si killed $si killed $rsi
+; CHECK-SSE4-O0-NEXT: shrq $48, %rax
+; CHECK-SSE4-O0-NEXT: movw %ax, %di
+; CHECK-SSE4-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE4-O0-NEXT: movw %di, %ax
+; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE4-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE4-O0-NEXT: movw %si, %ax
+; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE4-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE4-O0-NEXT: movw %dx, %ax
+; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE4-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE4-O0-NEXT: movw %cx, %ax
+; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2
+; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm2
+; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-SSE4-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movq (%rdi), %rax
+; CHECK-AVX-O0-NEXT: movq %rax, %rcx
+; CHECK-AVX-O0-NEXT: shrq $48, %rcx
+; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx
+; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: movq %rax, %rcx
+; CHECK-AVX-O0-NEXT: shrq $32, %rcx
+; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx
+; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: movw %ax, %cx
+; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-AVX-O0-NEXT: shrl $16, %eax
+; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
+ ret <4 x bfloat> %ret
+}
+
define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
; CHECK-SSE-O3-LABEL: atomic_vec4_float:
; CHECK-SSE-O3: # %bb.0:
More information about the llvm-branch-commits
mailing list