[llvm-branch-commits] [llvm] [SelectionDAG] Keep split vector atomic store value in a vector register (PR #201566)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jun 4 05:25:57 PDT 2026
https://github.com/jofrn created https://github.com/llvm/llvm-project/pull/201566
When the value of an ATOMIC_STORE has a vector type whose legalization
action is split (e.g. <4 x half>/<4 x bfloat> on X86 without F16C),
SplitVecOp_ATOMIC_STORE previously bitcast the value straight to a scalar integer
spanning the memory width. For a split vector that bitcast is expanded
element by element, reassembling the value in GPRs (a long pextrw/shl/or
sequence) before the store.
Instead, keep the value in a vector register when a legal vector form
exists: reinterpret it as a same-shaped integer-element vector (an FP
element type may have no legal vector form, e.g. bfloat on SSE2, while
the integer-of-element-size form does), widen that to a legal vector,
and extract the low integer element of the memory width. This issues the
store directly from a vector register (a single MOVQ/MOVD on X86),
matching the widen-path codegen already produced on AVX targets. Falls
back to the scalar bitcast when no suitable legal vector type exists.
From 6c022f5972acc87806d2bc7118d82b80c08cba71 Mon Sep 17 00:00:00 2001
From: jofrn <165626406+jofrn@users.noreply.github.com>
Date: Wed, 3 Jun 2026 05:46:34 -0700
Subject: [PATCH] [SelectionDAG] Keep split vector atomic store value in a
vector register
When the value of an ATOMIC_STORE has a vector type whose legalization
action is split (e.g. <4 x half>/<4 x bfloat> on X86 without F16C),
SplitVecOp_ATOMIC_STORE bitcast the value straight to a scalar integer
spanning the memory width. For a split vector that bitcast is expanded
element by element, reassembling the value in GPRs (a long pextrw/shl/or
sequence) before the store.
Instead, keep the value in a vector register when a legal vector form
exists: reinterpret it as a same-shaped integer-element vector (an FP
element type may have no legal vector form, e.g. bfloat on SSE2, while
the integer-of-element-size form does), widen that to a legal vector,
and extract the low integer element of the memory width. This issues the
store directly from a vector register (a single MOVQ/MOVD on X86),
matching the widen-path codegen already produced on AVX targets. Falls
back to the scalar bitcast when no suitable legal vector type exists.
---
.../SelectionDAG/LegalizeVectorTypes.cpp | 43 +-
llvm/test/CodeGen/X86/atomic-load-store.ll | 506 +++++++-----------
2 files changed, 227 insertions(+), 322 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5350be412176d..73cc5dc76c1eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4742,16 +4742,47 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
SDValue DAGTypeLegalizer::SplitVecOp_ATOMIC_STORE(AtomicSDNode *N) {
SDLoc DL(N);
+ LLVMContext &Ctx = *DAG.getContext();
SDValue StVal = N->getVal();
EVT VT = StVal.getValueType();
+ EVT MemIntVT = EVT::getIntegerVT(Ctx, N->getMemoryVT().getSizeInBits());
+
+ // The store needs a single value spanning the full memory width. If the
+ // value can be held in a legal vector register, keep it there and extract
+ // the low integer element of the memory width. This lets the store be issued
+ // directly from a vector register (e.g. a single MOVQ/MOVD) instead of
+ // bitcasting the split vector straight to a scalar integer, which would
+ // reassemble the value element by element in GPRs.
+ //
+ // Reinterpret the value as a same-shaped integer vector first: an FP element
+ // type may not have a legal vector form (e.g. bfloat on SSE2) while the
+ // integer-of-element-size form does.
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT IntEltVT = EVT::getIntegerVT(Ctx, VT.getScalarSizeInBits());
+ EVT IntVecVT = EVT::getVectorVT(Ctx, IntEltVT, NumElts);
+ if (DAG.getDataLayout().isLittleEndian() && TLI.isTypeLegal(MemIntVT) &&
+ IntEltVT.getSizeInBits() <= MemIntVT.getSizeInBits()) {
+ EVT WideVT = IntVecVT;
+ while (!TLI.isTypeLegal(WideVT) && WideVT.getSizeInBits() < 512)
+ WideVT =
+ EVT::getVectorVT(Ctx, IntEltVT, WideVT.getVectorNumElements() * 2);
+ if (TLI.isTypeLegal(WideVT) &&
+ WideVT.getSizeInBits() % MemIntVT.getSizeInBits() == 0) {
+ SDValue Wide = ModifyToType(DAG.getBitcast(IntVecVT, StVal), WideVT);
+ unsigned NumMemElts = WideVT.getSizeInBits() / MemIntVT.getSizeInBits();
+ EVT MemVecVT = EVT::getVectorVT(Ctx, MemIntVT, NumMemElts);
+ SDValue Elt = DAG.getExtractVectorElt(DL, MemIntVT,
+ DAG.getBitcast(MemVecVT, Wide), 0);
+ return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MemIntVT, N->getChain(), Elt,
+ N->getBasePtr(), N->getMemOperand());
+ }
+ }
- // Issue a single atomic store of an integer that spans the full memory
- // width. Bitcasting the (illegal) vector value to that integer lets the
- // type legalizer further legalize the BITCAST input as needed, while the
+ // Otherwise issue a single atomic store of an integer that spans the full
+ // memory width. Bitcasting the (illegal) vector value to that integer lets
+ // the type legalizer further legalize the BITCAST input as needed, while the
// ATOMIC_STORE itself uses only the legal integer type.
- EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
- EVT MemIntVT =
- EVT::getIntegerVT(*DAG.getContext(), N->getMemoryVT().getSizeInBits());
+ EVT IntVT = EVT::getIntegerVT(Ctx, VT.getSizeInBits());
SDValue AsInt = DAG.getBitcast(IntVT, StVal);
return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MemIntVT, N->getChain(), AsInt,
N->getBasePtr(), N->getMemOperand());
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 5be8d4d47fa9e..1ac3066a393a8 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -754,13 +754,7 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) {
; CHECK-SSE-O3-LABEL: store_atomic_vec2_half:
; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-O3-NEXT: psrld $16, %xmm0
-; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %ecx
-; CHECK-SSE-O3-NEXT: shll $16, %ecx
-; CHECK-SSE-O3-NEXT: movzwl %ax, %eax
-; CHECK-SSE-O3-NEXT: orl %ecx, %eax
-; CHECK-SSE-O3-NEXT: movl %eax, (%rdi)
+; CHECK-SSE-O3-NEXT: movss %xmm0, (%rdi)
; CHECK-SSE-O3-NEXT: retq
;
; CHECK-AVX-O3-LABEL: store_atomic_vec2_half:
@@ -773,16 +767,8 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) {
; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
; CHECK-SSE-O0-NEXT: movaps %xmm1, %xmm0
; CHECK-SSE-O0-NEXT: psrld $16, %xmm1
-; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
-; CHECK-SSE-O0-NEXT: movw %ax, %cx
-; CHECK-SSE-O0-NEXT: shll $16, %ecx
-; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-SSE-O0-NEXT: movzwl %ax, %eax
-; CHECK-SSE-O0-NEXT: orl %ecx, %eax
-; CHECK-SSE-O0-NEXT: movl %eax, (%rdi)
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT: movd %xmm0, (%rdi)
; CHECK-SSE-O0-NEXT: retq
;
; CHECK-AVX-O0-LABEL: store_atomic_vec2_half:
@@ -796,13 +782,7 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) {
define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) nounwind {
; CHECK-SSE-O3-LABEL: store_atomic_vec2_bfloat:
; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-O3-NEXT: psrld $16, %xmm0
-; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %ecx
-; CHECK-SSE-O3-NEXT: shll $16, %ecx
-; CHECK-SSE-O3-NEXT: movzwl %ax, %eax
-; CHECK-SSE-O3-NEXT: orl %ecx, %eax
-; CHECK-SSE-O3-NEXT: movl %eax, (%rdi)
+; CHECK-SSE-O3-NEXT: movss %xmm0, (%rdi)
; CHECK-SSE-O3-NEXT: retq
;
; CHECK-AVX-O3-LABEL: store_atomic_vec2_bfloat:
@@ -810,68 +790,78 @@ define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) nounwind {
; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: subq $24, %rsp
-; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
-; CHECK-SSE-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-SSE-O0-NEXT: pextrw $1, %xmm1, %eax
-; CHECK-SSE-O0-NEXT: shll $16, %eax
-; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
-; CHECK-SSE-O0-NEXT: movd %xmm1, %eax
-; CHECK-SSE-O0-NEXT: shll $16, %eax
-; CHECK-SSE-O0-NEXT: movd %eax, %xmm1
-; CHECK-SSE-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
-; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
-; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE-O0-NEXT: movw %ax, %cx
-; CHECK-SSE-O0-NEXT: # implicit-def: $eax
-; CHECK-SSE-O0-NEXT: movw %cx, %ax
-; CHECK-SSE-O0-NEXT: shll $16, %eax
-; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
-; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-SSE-O0-NEXT: movzwl %ax, %eax
-; CHECK-SSE-O0-NEXT: orl %ecx, %eax
-; CHECK-SSE-O0-NEXT: movl %eax, (%rdi)
-; CHECK-SSE-O0-NEXT: addq $24, %rsp
-; CHECK-SSE-O0-NEXT: retq
+; CHECK-SSE2-O0-LABEL: store_atomic_vec2_bfloat:
+; CHECK-SSE2-O0: # %bb.0:
+; CHECK-SSE2-O0-NEXT: subq $24, %rsp
+; CHECK-SSE2-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE2-O0-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-SSE2-O0-NEXT: shll $16, %eax
+; CHECK-SSE2-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE2-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE2-O0-NEXT: movd %xmm0, %eax
+; CHECK-SSE2-O0-NEXT: shll $16, %eax
+; CHECK-SSE2-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE2-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE2-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE2-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; CHECK-SSE2-O0-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE2-O0-NEXT: movl %eax, (%rdi)
+; CHECK-SSE2-O0-NEXT: addq $24, %rsp
+; CHECK-SSE2-O0-NEXT: retq
+;
+; CHECK-SSE4-O0-LABEL: store_atomic_vec2_bfloat:
+; CHECK-SSE4-O0: # %bb.0:
+; CHECK-SSE4-O0-NEXT: subq $24, %rsp
+; CHECK-SSE4-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-SSE4-O0-NEXT: shll $16, %eax
+; CHECK-SSE4-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE4-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE4-O0-NEXT: movd %xmm0, %eax
+; CHECK-SSE4-O0-NEXT: shll $16, %eax
+; CHECK-SSE4-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE4-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE4-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, {{[0-9]+}}(%rsp)
+; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-SSE4-O0-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE4-O0-NEXT: movl %eax, (%rdi)
+; CHECK-SSE4-O0-NEXT: addq $24, %rsp
+; CHECK-SSE4-O0-NEXT: retq
;
; CHECK-AVX-O0-LABEL: store_atomic_vec2_bfloat:
; CHECK-AVX-O0: # %bb.0:
; CHECK-AVX-O0-NEXT: subq $24, %rsp
-; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1
; CHECK-AVX-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm1, %eax
-; CHECK-AVX-O0-NEXT: shll $16, %eax
-; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
-; CHECK-AVX-O0-NEXT: vmovd %xmm1, %eax
+; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-AVX-O0-NEXT: shll $16, %eax
; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1
; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: vmovd %xmm0, %eax
+; CHECK-AVX-O0-NEXT: shll $16, %eax
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT
; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1
; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax
-; CHECK-AVX-O0-NEXT: movw %ax, %cx
-; CHECK-AVX-O0-NEXT: # implicit-def: $eax
-; CHECK-AVX-O0-NEXT: movw %cx, %ax
-; CHECK-AVX-O0-NEXT: shll $16, %eax
-; CHECK-AVX-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, {{[0-9]+}}(%rsp)
; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT
-; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-AVX-O0-NEXT: movzwl %ax, %eax
-; CHECK-AVX-O0-NEXT: orl %ecx, %eax
+; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: movl {{[0-9]+}}(%rsp), %eax
; CHECK-AVX-O0-NEXT: movl %eax, (%rdi)
; CHECK-AVX-O0-NEXT: addq $24, %rsp
; CHECK-AVX-O0-NEXT: retq
@@ -880,48 +870,10 @@ define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) nounwind {
}
define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind {
-; CHECK-SSE2-O3-LABEL: store_atomic_vec4_half:
-; CHECK-SSE2-O3: # %bb.0:
-; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-O3-NEXT: psrld $16, %xmm1
-; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %ecx
-; CHECK-SSE2-O3-NEXT: shll $16, %eax
-; CHECK-SSE2-O3-NEXT: movzwl %cx, %ecx
-; CHECK-SSE2-O3-NEXT: orl %eax, %ecx
-; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-O3-NEXT: psrlq $48, %xmm1
-; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE2-O3-NEXT: shll $16, %eax
-; CHECK-SSE2-O3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %edx
-; CHECK-SSE2-O3-NEXT: movzwl %dx, %edx
-; CHECK-SSE2-O3-NEXT: orl %eax, %edx
-; CHECK-SSE2-O3-NEXT: shlq $32, %rdx
-; CHECK-SSE2-O3-NEXT: orq %rcx, %rdx
-; CHECK-SSE2-O3-NEXT: movq %rdx, (%rdi)
-; CHECK-SSE2-O3-NEXT: retq
-;
-; CHECK-SSE4-O3-LABEL: store_atomic_vec4_half:
-; CHECK-SSE4-O3: # %bb.0:
-; CHECK-SSE4-O3-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE4-O3-NEXT: psrld $16, %xmm1
-; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE4-O3-NEXT: shll $16, %eax
-; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %ecx
-; CHECK-SSE4-O3-NEXT: movzwl %cx, %ecx
-; CHECK-SSE4-O3-NEXT: orl %eax, %ecx
-; CHECK-SSE4-O3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE4-O3-NEXT: psrlq $48, %xmm0
-; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE4-O3-NEXT: shll $16, %eax
-; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %edx
-; CHECK-SSE4-O3-NEXT: movzwl %dx, %edx
-; CHECK-SSE4-O3-NEXT: orl %eax, %edx
-; CHECK-SSE4-O3-NEXT: shlq $32, %rdx
-; CHECK-SSE4-O3-NEXT: orq %rcx, %rdx
-; CHECK-SSE4-O3-NEXT: movq %rdx, (%rdi)
-; CHECK-SSE4-O3-NEXT: retq
+; CHECK-SSE-O3-LABEL: store_atomic_vec4_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movlps %xmm0, (%rdi)
+; CHECK-SSE-O3-NEXT: retq
;
; CHECK-AVX-O3-LABEL: store_atomic_vec4_half:
; CHECK-AVX-O3: # %bb.0:
@@ -931,70 +883,30 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind {
; CHECK-SSE2-O0-LABEL: store_atomic_vec4_half:
; CHECK-SSE2-O0: # %bb.0:
; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm3
+; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm0
; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm2
+; CHECK-SSE2-O0-NEXT: psrlq $48, %xmm2
; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm1
-; CHECK-SSE2-O0-NEXT: psrlq $48, %xmm1
-; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm0
-; CHECK-SSE2-O0-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE2-O0-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; CHECK-SSE2-O0-NEXT: psrld $16, %xmm3
-; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm3, %eax
-; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx
-; CHECK-SSE2-O0-NEXT: movw %ax, %cx
-; CHECK-SSE2-O0-NEXT: shll $16, %ecx
-; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm2, %eax
-; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-SSE2-O0-NEXT: movzwl %ax, %eax
-; CHECK-SSE2-O0-NEXT: orl %ecx, %eax
-; CHECK-SSE2-O0-NEXT: # kill: def $rax killed $eax
-; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %ecx
-; CHECK-SSE2-O0-NEXT: movw %cx, %dx
-; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx
-; CHECK-SSE2-O0-NEXT: movw %dx, %cx
-; CHECK-SSE2-O0-NEXT: shll $16, %ecx
-; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm0, %edx
-; CHECK-SSE2-O0-NEXT: # kill: def $dx killed $dx killed $edx
-; CHECK-SSE2-O0-NEXT: movzwl %dx, %edx
-; CHECK-SSE2-O0-NEXT: orl %ecx, %edx
-; CHECK-SSE2-O0-NEXT: # implicit-def: $rcx
-; CHECK-SSE2-O0-NEXT: movl %edx, %ecx
-; CHECK-SSE2-O0-NEXT: shlq $32, %rcx
-; CHECK-SSE2-O0-NEXT: orq %rcx, %rax
-; CHECK-SSE2-O0-NEXT: movq %rax, (%rdi)
+; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-O0-NEXT: movq %xmm0, (%rdi)
; CHECK-SSE2-O0-NEXT: retq
;
; CHECK-SSE4-O0-LABEL: store_atomic_vec4_half:
; CHECK-SSE4-O0: # %bb.0:
; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm3
+; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm0
; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm2
-; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm1
-; CHECK-SSE4-O0-NEXT: psrlq $48, %xmm1
-; CHECK-SSE4-O0-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; CHECK-SSE4-O0-NEXT: psrlq $48, %xmm2
+; CHECK-SSE4-O0-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; CHECK-SSE4-O0-NEXT: psrld $16, %xmm3
-; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm3, %eax
-; CHECK-SSE4-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx
-; CHECK-SSE4-O0-NEXT: movw %ax, %cx
-; CHECK-SSE4-O0-NEXT: shll $16, %ecx
-; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm2, %eax
-; CHECK-SSE4-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-SSE4-O0-NEXT: movzwl %ax, %eax
-; CHECK-SSE4-O0-NEXT: orl %ecx, %eax
-; CHECK-SSE4-O0-NEXT: # kill: def $rax killed $eax
-; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, %ecx
-; CHECK-SSE4-O0-NEXT: movw %cx, %dx
-; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx
-; CHECK-SSE4-O0-NEXT: movw %dx, %cx
-; CHECK-SSE4-O0-NEXT: shll $16, %ecx
-; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm0, %edx
-; CHECK-SSE4-O0-NEXT: # kill: def $dx killed $dx killed $edx
-; CHECK-SSE4-O0-NEXT: movzwl %dx, %edx
-; CHECK-SSE4-O0-NEXT: orl %ecx, %edx
-; CHECK-SSE4-O0-NEXT: # implicit-def: $rcx
-; CHECK-SSE4-O0-NEXT: movl %edx, %ecx
-; CHECK-SSE4-O0-NEXT: shlq $32, %rcx
-; CHECK-SSE4-O0-NEXT: orq %rcx, %rax
-; CHECK-SSE4-O0-NEXT: movq %rax, (%rdi)
+; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-SSE4-O0-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE4-O0-NEXT: movq %xmm0, (%rdi)
; CHECK-SSE4-O0-NEXT: retq
;
; CHECK-AVX-O0-LABEL: store_atomic_vec4_half:
@@ -1006,184 +918,146 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind {
}
define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind {
-; CHECK-SSE2-O3-LABEL: store_atomic_vec4_bfloat:
-; CHECK-SSE2-O3: # %bb.0:
-; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-O3-NEXT: psrld $16, %xmm1
-; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %ecx
-; CHECK-SSE2-O3-NEXT: shll $16, %eax
-; CHECK-SSE2-O3-NEXT: movzwl %cx, %ecx
-; CHECK-SSE2-O3-NEXT: orl %eax, %ecx
-; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-O3-NEXT: psrlq $48, %xmm1
-; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE2-O3-NEXT: shll $16, %eax
-; CHECK-SSE2-O3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %edx
-; CHECK-SSE2-O3-NEXT: movzwl %dx, %edx
-; CHECK-SSE2-O3-NEXT: orl %eax, %edx
-; CHECK-SSE2-O3-NEXT: shlq $32, %rdx
-; CHECK-SSE2-O3-NEXT: orq %rcx, %rdx
-; CHECK-SSE2-O3-NEXT: movq %rdx, (%rdi)
-; CHECK-SSE2-O3-NEXT: retq
-;
-; CHECK-SSE4-O3-LABEL: store_atomic_vec4_bfloat:
-; CHECK-SSE4-O3: # %bb.0:
-; CHECK-SSE4-O3-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE4-O3-NEXT: psrld $16, %xmm1
-; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE4-O3-NEXT: shll $16, %eax
-; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %ecx
-; CHECK-SSE4-O3-NEXT: movzwl %cx, %ecx
-; CHECK-SSE4-O3-NEXT: orl %eax, %ecx
-; CHECK-SSE4-O3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE4-O3-NEXT: psrlq $48, %xmm0
-; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE4-O3-NEXT: shll $16, %eax
-; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %edx
-; CHECK-SSE4-O3-NEXT: movzwl %dx, %edx
-; CHECK-SSE4-O3-NEXT: orl %eax, %edx
-; CHECK-SSE4-O3-NEXT: shlq $32, %rdx
-; CHECK-SSE4-O3-NEXT: orq %rcx, %rdx
-; CHECK-SSE4-O3-NEXT: movq %rdx, (%rdi)
-; CHECK-SSE4-O3-NEXT: retq
+; CHECK-SSE-O3-LABEL: store_atomic_vec4_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movlps %xmm0, (%rdi)
+; CHECK-SSE-O3-NEXT: retq
;
; CHECK-AVX-O3-LABEL: store_atomic_vec4_bfloat:
; CHECK-AVX-O3: # %bb.0:
; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-SSE-O0-LABEL: store_atomic_vec4_bfloat:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: subq $40, %rsp
-; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
-; CHECK-SSE-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-SSE-O0-NEXT: pextrw $3, %xmm1, %eax
-; CHECK-SSE-O0-NEXT: shll $16, %eax
-; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
-; CHECK-SSE-O0-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-O0-NEXT: pextrw $2, %xmm1, %eax
-; CHECK-SSE-O0-NEXT: shll $16, %eax
-; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
-; CHECK-SSE-O0-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-O0-NEXT: pextrw $1, %xmm1, %eax
-; CHECK-SSE-O0-NEXT: shll $16, %eax
-; CHECK-SSE-O0-NEXT: movd %eax, %xmm0
-; CHECK-SSE-O0-NEXT: movd %xmm1, %eax
-; CHECK-SSE-O0-NEXT: shll $16, %eax
-; CHECK-SSE-O0-NEXT: movd %eax, %xmm1
-; CHECK-SSE-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
-; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
-; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE-O0-NEXT: movw %ax, %cx
-; CHECK-SSE-O0-NEXT: # implicit-def: $eax
-; CHECK-SSE-O0-NEXT: movw %cx, %ax
-; CHECK-SSE-O0-NEXT: shll $16, %eax
-; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
-; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
-; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-SSE-O0-NEXT: movzwl %ax, %eax
-; CHECK-SSE-O0-NEXT: orl %ecx, %eax
-; CHECK-SSE-O0-NEXT: # kill: def $rax killed $eax
-; CHECK-SSE-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
-; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1
-; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax
-; CHECK-SSE-O0-NEXT: movw %ax, %cx
-; CHECK-SSE-O0-NEXT: # implicit-def: $eax
-; CHECK-SSE-O0-NEXT: movw %cx, %ax
-; CHECK-SSE-O0-NEXT: shll $16, %eax
-; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT
-; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %edx
-; CHECK-SSE-O0-NEXT: # kill: def $dx killed $dx killed $edx
-; CHECK-SSE-O0-NEXT: movzwl %dx, %edx
-; CHECK-SSE-O0-NEXT: orl %ecx, %edx
-; CHECK-SSE-O0-NEXT: # implicit-def: $rcx
-; CHECK-SSE-O0-NEXT: movl %edx, %ecx
-; CHECK-SSE-O0-NEXT: shlq $32, %rcx
-; CHECK-SSE-O0-NEXT: orq %rcx, %rax
-; CHECK-SSE-O0-NEXT: movq %rax, (%rdi)
-; CHECK-SSE-O0-NEXT: addq $40, %rsp
-; CHECK-SSE-O0-NEXT: retq
+; CHECK-SSE2-O0-LABEL: store_atomic_vec4_bfloat:
+; CHECK-SSE2-O0: # %bb.0:
+; CHECK-SSE2-O0-NEXT: subq $40, %rsp
+; CHECK-SSE2-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE2-O0-NEXT: pextrw $3, %xmm0, %eax
+; CHECK-SSE2-O0-NEXT: shll $16, %eax
+; CHECK-SSE2-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE2-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE2-O0-NEXT: pextrw $2, %xmm0, %eax
+; CHECK-SSE2-O0-NEXT: shll $16, %eax
+; CHECK-SSE2-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE2-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE2-O0-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-SSE2-O0-NEXT: shll $16, %eax
+; CHECK-SSE2-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE2-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE2-O0-NEXT: movd %xmm0, %eax
+; CHECK-SSE2-O0-NEXT: shll $16, %eax
+; CHECK-SSE2-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE2-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE2-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE2-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE2-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE2-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE2-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %eax
+; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; CHECK-SSE2-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE2-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-SSE2-O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; CHECK-SSE2-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-SSE2-O0-NEXT: movq %rax, (%rdi)
+; CHECK-SSE2-O0-NEXT: addq $40, %rsp
+; CHECK-SSE2-O0-NEXT: retq
+;
+; CHECK-SSE4-O0-LABEL: store_atomic_vec4_bfloat:
+; CHECK-SSE4-O0: # %bb.0:
+; CHECK-SSE4-O0-NEXT: subq $40, %rsp
+; CHECK-SSE4-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: pextrw $3, %xmm0, %eax
+; CHECK-SSE4-O0-NEXT: shll $16, %eax
+; CHECK-SSE4-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE4-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE4-O0-NEXT: pextrw $2, %xmm0, %eax
+; CHECK-SSE4-O0-NEXT: shll $16, %eax
+; CHECK-SSE4-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE4-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE4-O0-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-SSE4-O0-NEXT: shll $16, %eax
+; CHECK-SSE4-O0-NEXT: movd %eax, %xmm1
+; CHECK-SSE4-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE4-O0-NEXT: movd %xmm0, %eax
+; CHECK-SSE4-O0-NEXT: shll $16, %eax
+; CHECK-SSE4-O0-NEXT: movd %eax, %xmm0
+; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE4-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE4-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, {{[0-9]+}}(%rsp)
+; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE4-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE4-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, {{[0-9]+}}(%rsp)
+; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE4-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-SSE4-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, {{[0-9]+}}(%rsp)
+; CHECK-SSE4-O0-NEXT: callq __truncsfbf2@PLT
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-SSE4-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-SSE4-O0-NEXT: movq %rax, (%rdi)
+; CHECK-SSE4-O0-NEXT: addq $40, %rsp
+; CHECK-SSE4-O0-NEXT: retq
;
; CHECK-AVX-O0-LABEL: store_atomic_vec4_bfloat:
; CHECK-AVX-O0: # %bb.0:
; CHECK-AVX-O0-NEXT: subq $40, %rsp
-; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1
; CHECK-AVX-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-AVX-O0-NEXT: vpextrw $3, %xmm1, %eax
-; CHECK-AVX-O0-NEXT: shll $16, %eax
-; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
-; CHECK-AVX-O0-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX-O0-NEXT: vpextrw $2, %xmm1, %eax
+; CHECK-AVX-O0-NEXT: vpextrw $3, %xmm0, %eax
; CHECK-AVX-O0-NEXT: shll $16, %eax
-; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
-; CHECK-AVX-O0-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm1, %eax
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-AVX-O0-NEXT: shll $16, %eax
-; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
-; CHECK-AVX-O0-NEXT: vmovd %xmm1, %eax
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-AVX-O0-NEXT: shll $16, %eax
; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1
; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: vmovd %xmm0, %eax
+; CHECK-AVX-O0-NEXT: shll $16, %eax
+; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0
; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT
; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1
; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax
-; CHECK-AVX-O0-NEXT: movw %ax, %cx
-; CHECK-AVX-O0-NEXT: # implicit-def: $eax
-; CHECK-AVX-O0-NEXT: movw %cx, %ax
-; CHECK-AVX-O0-NEXT: shll $16, %eax
-; CHECK-AVX-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, {{[0-9]+}}(%rsp)
; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT
-; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1
; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax
-; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-AVX-O0-NEXT: movzwl %ax, %eax
-; CHECK-AVX-O0-NEXT: orl %ecx, %eax
-; CHECK-AVX-O0-NEXT: # kill: def $rax killed $eax
-; CHECK-AVX-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, {{[0-9]+}}(%rsp)
; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT
; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1
; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax
-; CHECK-AVX-O0-NEXT: movw %ax, %cx
-; CHECK-AVX-O0-NEXT: # implicit-def: $eax
-; CHECK-AVX-O0-NEXT: movw %cx, %ax
-; CHECK-AVX-O0-NEXT: shll $16, %eax
-; CHECK-AVX-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, {{[0-9]+}}(%rsp)
; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT
-; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, %edx
-; CHECK-AVX-O0-NEXT: # kill: def $dx killed $dx killed $edx
-; CHECK-AVX-O0-NEXT: movzwl %dx, %edx
-; CHECK-AVX-O0-NEXT: orl %ecx, %edx
-; CHECK-AVX-O0-NEXT: # implicit-def: $rcx
-; CHECK-AVX-O0-NEXT: movl %edx, %ecx
-; CHECK-AVX-O0-NEXT: shlq $32, %rcx
-; CHECK-AVX-O0-NEXT: orq %rcx, %rax
+; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-AVX-O0-NEXT: movq %rax, (%rdi)
; CHECK-AVX-O0-NEXT: addq $40, %rsp
; CHECK-AVX-O0-NEXT: retq
More information about the llvm-branch-commits
mailing list