[llvm] ca1c052 - [X86][BF16] Do not scalarize masked load for BF16 when we have BWI
Phoebe Wang via llvm-commits
llvm-commits@lists.llvm.org
Fri Jul 21 08:19:10 PDT 2023
Author: Phoebe Wang
Date: 2023-07-21T23:18:54+08:00
New Revision: ca1c05208ed35ba72869c65ad773b2cca4bbd360
URL: https://github.com/llvm/llvm-project/commit/ca1c05208ed35ba72869c65ad773b2cca4bbd360
DIFF: https://github.com/llvm/llvm-project/commit/ca1c05208ed35ba72869c65ad773b2cca4bbd360.diff
LOG: [X86][BF16] Do not scalarize masked load for BF16 when we have BWI
Fixes #63017
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D155952
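
Context for the change: with AVX512BW (BWI), masked loads of 16-bit elements can be selected as a single vmovdqu16, so X86TTIImpl::isLegalMaskedLoad no longer needs to report bf16 as illegal and force scalarization. A minimal IR sketch of the affected pattern (illustrative only; the function name, pointer, and mask below are assumptions, not the reduced test from #63017):

; Sketch: a masked load of <32 x bfloat>. With -mattr=+avx512bw this is
; expected to lower to one masked vmovdqu16 rather than 32 conditional
; scalar loads.
define <32 x bfloat> @masked_load_v32bf16(ptr %p, <32 x i1> %m, <32 x bfloat> %passthru) {
  %v = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr %p, i32 2, <32 x i1> %m, <32 x bfloat> %passthru)
  ret <32 x bfloat> %v
}
declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)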
Added:
Modified:
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/test/CodeGen/X86/bfloat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 8bfbd27a5b900a..a7d25acd457245 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5836,7 +5836,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
return true;
- if (ScalarTy->isHalfTy() && ST->hasBWI())
+ if (ScalarTy->is16bitFPTy() && ST->hasBWI())
return true;
if (!ScalarTy->isIntegerTy())
@@ -6299,7 +6299,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
EltTy->isIntegerTy(32) || EltTy->isPointerTy())
return true;
- if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->is16bitFPTy())
return HasBW;
return false;
};
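
For reference, Type::is16bitFPTy() is true for both IEEE half and bfloat, which is what lets bf16 share the existing f16/BWI path in both hunks above. A minimal C++ sketch of the predicate's semantics (assumed to mirror llvm::Type::is16bitFPTy, not a copy of the in-tree definition):

#include "llvm/IR/Type.h"

// True for the two 16-bit floating-point types; with BWI either one can
// use the 16-bit-element masked-move instructions.
static bool is16bitFP(const llvm::Type *Ty) {
  return Ty->isHalfTy() || Ty->isBFloatTy();
}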
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 4caeaf381c874e..43213761bb5cd8 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -581,3 +581,581 @@ define <32 x bfloat> @pr63017() {
; BF16-NEXT: retq
ret <32 x bfloat> zeroinitializer
}
+
+define <32 x bfloat> @pr63017_2() nounwind {
+; SSE2-LABEL: pr63017_2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %r14
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: subq $200, %rsp
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_1
+; SSE2-NEXT: # %bb.2: # %cond.load
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: jmp .LBB12_3
+; SSE2-NEXT: .LBB12_1:
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: .LBB12_3: # %else
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_5
+; SSE2-NEXT: # %bb.4: # %cond.load1
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: .LBB12_5: # %else2
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_6
+; SSE2-NEXT: # %bb.7: # %cond.load4
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm14
+; SSE2-NEXT: movdqa %xmm1, %xmm15
+; SSE2-NEXT: movdqa %xmm1, %xmm12
+; SSE2-NEXT: movdqa %xmm1, %xmm13
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: jmp .LBB12_8
+; SSE2-NEXT: .LBB12_6:
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm14
+; SSE2-NEXT: movdqa %xmm1, %xmm15
+; SSE2-NEXT: movdqa %xmm1, %xmm12
+; SSE2-NEXT: movdqa %xmm1, %xmm13
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: .LBB12_8: # %else5
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_10
+; SSE2-NEXT: # %bb.9: # %cond.load7
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_10: # %else8
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_12
+; SSE2-NEXT: # %bb.11: # %cond.load10
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_12: # %else11
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_14
+; SSE2-NEXT: # %bb.13: # %cond.load13
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_14: # %else14
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_16
+; SSE2-NEXT: # %bb.15: # %cond.load16
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_16: # %else17
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_18
+; SSE2-NEXT: # %bb.17: # %cond.load19
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_18: # %else20
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_20
+; SSE2-NEXT: # %bb.19: # %cond.load22
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_20: # %else23
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_22
+; SSE2-NEXT: # %bb.21: # %cond.load25
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_22: # %else26
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_24
+; SSE2-NEXT: # %bb.23: # %cond.load28
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_24: # %else29
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_26
+; SSE2-NEXT: # %bb.25: # %cond.load31
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_26: # %else32
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_28
+; SSE2-NEXT: # %bb.27: # %cond.load34
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_28: # %else35
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_30
+; SSE2-NEXT: # %bb.29: # %cond.load37
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_30: # %else38
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_32
+; SSE2-NEXT: # %bb.31: # %cond.load40
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_32: # %else41
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_34
+; SSE2-NEXT: # %bb.33: # %cond.load43
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_34: # %else44
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_36
+; SSE2-NEXT: # %bb.35: # %cond.load46
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_36: # %else47
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_38
+; SSE2-NEXT: # %bb.37: # %cond.load49
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: .LBB12_38: # %else50
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_40
+; SSE2-NEXT: # %bb.39: # %cond.load52
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: .LBB12_40: # %else53
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_42
+; SSE2-NEXT: # %bb.41: # %cond.load55
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm15
+; SSE2-NEXT: .LBB12_42: # %else56
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_44
+; SSE2-NEXT: # %bb.43: # %cond.load58
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: .LBB12_44: # %else59
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_46
+; SSE2-NEXT: # %bb.45: # %cond.load61
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm13
+; SSE2-NEXT: .LBB12_46: # %else62
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_48
+; SSE2-NEXT: # %bb.47: # %cond.load64
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: .LBB12_48: # %else65
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_50
+; SSE2-NEXT: # %bb.49: # %cond.load67
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm11
+; SSE2-NEXT: .LBB12_50: # %else68
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_52
+; SSE2-NEXT: # %bb.51: # %cond.load70
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: .LBB12_52: # %else71
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_54
+; SSE2-NEXT: # %bb.53: # %cond.load73
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: .LBB12_54: # %else74
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_56
+; SSE2-NEXT: # %bb.55: # %cond.load76
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: .LBB12_56: # %else77
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_58
+; SSE2-NEXT: # %bb.57: # %cond.load79
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm7
+; SSE2-NEXT: .LBB12_58: # %else80
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_60
+; SSE2-NEXT: # %bb.59: # %cond.load82
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: .LBB12_60: # %else83
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_62
+; SSE2-NEXT: # %bb.61: # %cond.load85
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: .LBB12_62: # %else86
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: jne .LBB12_64
+; SSE2-NEXT: # %bb.63: # %cond.load88
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: .LBB12_64: # %else89
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: movd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: jne .LBB12_65
+; SSE2-NEXT: # %bb.66: # %cond.load91
+; SSE2-NEXT: movzwl (%rax), %eax
+; SSE2-NEXT: shll $16, %eax
+; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: jmp .LBB12_67
+; SSE2-NEXT: .LBB12_65:
+; SSE2-NEXT: movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT: .LBB12_67: # %else92
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %r14d
+; SSE2-NEXT: orl %ebx, %r14d
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %ebx
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfbf2@PLT
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: orl %ebx, %eax
+; SSE2-NEXT: shlq $32, %rax
+; SSE2-NEXT: orq %r14, %rax
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT: addq $200, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %r14
+; SSE2-NEXT: retq
+;
+; BF16-LABEL: pr63017_2:
+; BF16: # %bb.0:
+; BF16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; BF16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
+; BF16-NEXT: retq
+ %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
+ ret <32 x bfloat> %1
+}
+
+declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)