[llvm] f11526b - [X86][BF16] Do not scalarize masked load for BF16 when we have AVX512BF16
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 22 03:21:00 PDT 2023
Author: Phoebe Wang
Date: 2023-07-22T18:16:49+08:00
New Revision: f11526b091c489ed0b96538bc91a2e4dcfd9ed4f
URL: https://github.com/llvm/llvm-project/commit/f11526b091c489ed0b96538bc91a2e4dcfd9ed4f
DIFF: https://github.com/llvm/llvm-project/commit/f11526b091c489ed0b96538bc91a2e4dcfd9ed4f.diff
LOG: [X86][BF16] Do not scalarize masked load for BF16 when we have AVX512BF16
Fixes #63017
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D155952
Added:
Modified:
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/test/CodeGen/X86/bfloat.ll
Removed:
################################################################################
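The functional change is in X86TTIImpl::isLegalMaskedLoad below. When that hook returns false for a type, the ScalarizeMaskedMemIntrin pass expands llvm.masked.load into a per-element branch/load/insert sequence, which is what the long SSE2 block in the new test shows; reporting bfloat vectors as legal under AVX512BF16 keeps the load as a single masked vector instruction. As a rough sketch of how a client consults this hook (the helper function here is illustrative, not from the patch; it assumes the TargetTransformInfo API as of this commit):

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Illustrative helper, not part of the patch: decide whether a masked
// vector load of DataTy can stay a single instruction on this target.
static bool canUseMaskedLoad(const TargetTransformInfo &TTI, Type *DataTy,
                             Align Alignment) {
  // After this change, the X86 implementation returns true for bfloat
  // vectors when the subtarget has AVX512BF16.
  return TTI.isLegalMaskedLoad(DataTy, Alignment);
}
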
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 8bfbd27a5b900a..7dccb9161d5a77 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5839,6 +5839,9 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
   if (ScalarTy->isHalfTy() && ST->hasBWI())
     return true;
 
+  if (ScalarTy->isBFloatTy() && ST->hasBF16())
+    return true;
+
   if (!ScalarTy->isIntegerTy())
     return false;
@@ -6294,16 +6297,18 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
     bool UseMaskForCond, bool UseMaskForGaps) {
   auto *VecTy = cast<FixedVectorType>(BaseTy);
 
-  auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
+  auto isSupportedOnAVX512 = [&](Type *VecTy) {
     Type *EltTy = cast<VectorType>(VecTy)->getElementType();
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
       return true;
     if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
-      return HasBW;
+      return ST->hasBWI();
+    if (EltTy->isBFloatTy())
+      return ST->hasBF16();
     return false;
   };
-  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
+  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
     return getInterleavedMemoryOpCostAVX512(
         Opcode, VecTy, Factor, Indices, Alignment,
         AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 4caeaf381c874e..43213761bb5cd8 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -581,3 +581,581 @@ define <32 x bfloat> @pr63017() {
 ; BF16-NEXT:    retq
   ret <32 x bfloat> zeroinitializer
 }
+
+define <32 x bfloat> @pr63017_2() nounwind {
+; SSE2-LABEL: pr63017_2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %r14
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: subq $200, %rsp
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_1
+; SSE2-NEXT: # %bb.2: # %cond.load
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: jmp .LBB12_3
+; SSE2-NEXT: .LBB12_1:
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: .LBB12_3: # %else
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_5
+; SSE2-NEXT: # %bb.4: # %cond.load1
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: .LBB12_5: # %else2
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_6
+; SSE2-NEXT: # %bb.7: # %cond.load4
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm14
+; SSE2-NEXT: movdqa %xmm1, %xmm15
+; SSE2-NEXT: movdqa %xmm1, %xmm12
+; SSE2-NEXT: movdqa %xmm1, %xmm13
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: jmp .LBB12_8
+; SSE2-NEXT: .LBB12_6:
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm14
+; SSE2-NEXT: movdqa %xmm1, %xmm15
+; SSE2-NEXT: movdqa %xmm1, %xmm12
+; SSE2-NEXT: movdqa %xmm1, %xmm13
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: .LBB12_8: # %else5
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_10
+; SSE2-NEXT: # %bb.9: # %cond.load7
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_10: # %else8
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_12
+; SSE2-NEXT: # %bb.11: # %cond.load10
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_12: # %else11
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_14
+; SSE2-NEXT: # %bb.13: # %cond.load13
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_14: # %else14
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_16
+; SSE2-NEXT: # %bb.15: # %cond.load16
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_16: # %else17
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_18
+; SSE2-NEXT: # %bb.17: # %cond.load19
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_18: # %else20
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_20
+; SSE2-NEXT: # %bb.19: # %cond.load22
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_20: # %else23
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_22
+; SSE2-NEXT: # %bb.21: # %cond.load25
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_22: # %else26
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_24
+; SSE2-NEXT: # %bb.23: # %cond.load28
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_24: # %else29
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_26
+; SSE2-NEXT: # %bb.25: # %cond.load31
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_26: # %else32
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_28
+; SSE2-NEXT: # %bb.27: # %cond.load34
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_28: # %else35
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_30
+; SSE2-NEXT: # %bb.29: # %cond.load37
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_30: # %else38
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_32
+; SSE2-NEXT: # %bb.31: # %cond.load40
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_32: # %else41
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_34
+; SSE2-NEXT: # %bb.33: # %cond.load43
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_34: # %else44
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_36
+; SSE2-NEXT: # %bb.35: # %cond.load46
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_36: # %else47
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_38
+; SSE2-NEXT: # %bb.37: # %cond.load49
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_38: # %else50
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_40
+; SSE2-NEXT: # %bb.39: # %cond.load52
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: .LBB12_40: # %else53
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_42
+; SSE2-NEXT: # %bb.41: # %cond.load55
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm15
+; SSE2-NEXT: .LBB12_42: # %else56
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_44
+; SSE2-NEXT: # %bb.43: # %cond.load58
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: .LBB12_44: # %else59
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_46
+; SSE2-NEXT: # %bb.45: # %cond.load61
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm13
+; SSE2-NEXT: .LBB12_46: # %else62
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_48
+; SSE2-NEXT: # %bb.47: # %cond.load64
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: .LBB12_48: # %else65
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_50
+; SSE2-NEXT: # %bb.49: # %cond.load67
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm11
+; SSE2-NEXT: .LBB12_50: # %else68
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_52
+; SSE2-NEXT: # %bb.51: # %cond.load70
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: .LBB12_52: # %else71
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_54
+; SSE2-NEXT: # %bb.53: # %cond.load73
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: .LBB12_54: # %else74
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_56
+; SSE2-NEXT: # %bb.55: # %cond.load76
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: .LBB12_56: # %else77
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_58
+; SSE2-NEXT: # %bb.57: # %cond.load79
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm7
+; SSE2-NEXT: .LBB12_58: # %else80
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_60
+; SSE2-NEXT: # %bb.59: # %cond.load82
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: .LBB12_60: # %else83
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_62
+; SSE2-NEXT: # %bb.61: # %cond.load85
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: .LBB12_62: # %else86
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_64
+; SSE2-NEXT: # %bb.63: # %cond.load88
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: .LBB12_64: # %else89
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: jne .LBB12_65
+; SSE2-NEXT: # %bb.66: # %cond.load91
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: jmp .LBB12_67
+; SSE2-NEXT: .LBB12_65:
+; SSE2-NEXT: movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: .LBB12_67: # %else92
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2 at PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT: addq $200, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %r14
+; SSE2-NEXT: retq
+;
+; BF16-LABEL: pr63017_2:
+; BF16: # %bb.0:
+; BF16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; BF16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
+; BF16-NEXT: retq
+ %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
+ ret <32 x bfloat> %1
+}
+
+declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)
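
In the BF16 check lines above, the broadcast constant 49024 is 0xBF80, the bit pattern of the bfloat -1.0 passthru (0xRBF80 in the IR), so the whole passthru vector is materialized by one vpbroadcastw and the load lowers to a single masked vmovdqu16 rather than 32 conditional scalar loads. For reference, here is a sketch of constructing the same intrinsic call programmatically; it assumes IRBuilder's masked-load helper, and the function and variable names are illustrative:

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Illustrative only: emit a <32 x bfloat> masked load like the one in the
// test, with caller-provided pointer, mask, and passthru values.
static Value *emitMaskedLoadBF16(IRBuilder<> &B, Value *Ptr, Value *Mask,
                                 Value *PassThru) {
  auto *VecTy = FixedVectorType::get(B.getBFloatTy(), 32);
  // Align(2) corresponds to the "i32 2" alignment operand in the test.
  return B.CreateMaskedLoad(VecTy, Ptr, Align(2), Mask, PassThru);
}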