[llvm] f11526b - [X86][BF16] Do not scalarize masked load for BF16 when we have AVX512BF16

Phoebe Wang via llvm-commits llvm-commits@lists.llvm.org
Sat Jul 22 03:21:00 PDT 2023


Author: Phoebe Wang
Date: 2023-07-22T18:16:49+08:00
New Revision: f11526b091c489ed0b96538bc91a2e4dcfd9ed4f

URL: https://github.com/llvm/llvm-project/commit/f11526b091c489ed0b96538bc91a2e4dcfd9ed4f
DIFF: https://github.com/llvm/llvm-project/commit/f11526b091c489ed0b96538bc91a2e4dcfd9ed4f.diff

LOG: [X86][BF16] Do not scalarize masked load for BF16 when we have AVX512BF16

Fixes #63017

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D155952
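
For context, a minimal IR sketch (hypothetical, not taken from this commit) of the kind of
masked bf16 load the change affects: once isLegalMaskedLoad returns true for bfloat on
AVX512BF16 targets (e.g. llc -mattr=+avx512bf16), ScalarizeMaskedMemIntrin keeps the
intrinsic as a single masked vector load, which can lower to a masked vmovdqu16 instead of
a per-element branch-and-insert sequence.

; Hypothetical example; the function name and mask operand are illustrative only.
define <32 x bfloat> @masked_load_bf16(ptr %p, <32 x i1> %mask, <32 x bfloat> %passthru) {
  %v = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr %p, i32 2, <32 x i1> %mask, <32 x bfloat> %passthru)
  ret <32 x bfloat> %v
}
declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)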

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/CodeGen/X86/bfloat.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 8bfbd27a5b900a..7dccb9161d5a77 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5839,6 +5839,9 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
   if (ScalarTy->isHalfTy() && ST->hasBWI())
     return true;
 
+  if (ScalarTy->isBFloatTy() && ST->hasBF16())
+    return true;
+
   if (!ScalarTy->isIntegerTy())
     return false;
 
@@ -6294,16 +6297,18 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
     bool UseMaskForCond, bool UseMaskForGaps) {
   auto *VecTy = cast<FixedVectorType>(BaseTy);
 
-  auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
+  auto isSupportedOnAVX512 = [&](Type *VecTy) {
     Type *EltTy = cast<VectorType>(VecTy)->getElementType();
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
       return true;
     if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
-      return HasBW;
+      return ST->hasBWI();
+    if (EltTy->isBFloatTy())
+      return ST->hasBF16();
     return false;
   };
-  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
+  if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
     return getInterleavedMemoryOpCostAVX512(
         Opcode, VecTy, Factor, Indices, Alignment,
         AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);

diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 4caeaf381c874e..43213761bb5cd8 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -581,3 +581,581 @@ define <32 x bfloat> @pr63017() {
 ; BF16-NEXT:    retq
   ret <32 x bfloat> zeroinitializer
 }
+
+define <32 x bfloat> @pr63017_2() nounwind {
+; SSE2-LABEL: pr63017_2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    subq $200, %rsp
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_1
+; SSE2-NEXT:  # %bb.2: # %cond.load
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    jmp .LBB12_3
+; SSE2-NEXT:  .LBB12_1:
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:  .LBB12_3: # %else
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_5
+; SSE2-NEXT:  # %bb.4: # %cond.load1
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:  .LBB12_5: # %else2
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_6
+; SSE2-NEXT:  # %bb.7: # %cond.load4
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movdqa %xmm1, %xmm14
+; SSE2-NEXT:    movdqa %xmm1, %xmm15
+; SSE2-NEXT:    movdqa %xmm1, %xmm12
+; SSE2-NEXT:    movdqa %xmm1, %xmm13
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    movdqa %xmm1, %xmm11
+; SSE2-NEXT:    movdqa %xmm1, %xmm8
+; SSE2-NEXT:    movdqa %xmm1, %xmm9
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    movdqa %xmm1, %xmm7
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    jmp .LBB12_8
+; SSE2-NEXT:  .LBB12_6:
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movdqa %xmm1, %xmm14
+; SSE2-NEXT:    movdqa %xmm1, %xmm15
+; SSE2-NEXT:    movdqa %xmm1, %xmm12
+; SSE2-NEXT:    movdqa %xmm1, %xmm13
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    movdqa %xmm1, %xmm11
+; SSE2-NEXT:    movdqa %xmm1, %xmm8
+; SSE2-NEXT:    movdqa %xmm1, %xmm9
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    movdqa %xmm1, %xmm7
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:  .LBB12_8: # %else5
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_10
+; SSE2-NEXT:  # %bb.9: # %cond.load7
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_10: # %else8
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_12
+; SSE2-NEXT:  # %bb.11: # %cond.load10
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_12: # %else11
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_14
+; SSE2-NEXT:  # %bb.13: # %cond.load13
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_14: # %else14
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_16
+; SSE2-NEXT:  # %bb.15: # %cond.load16
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_16: # %else17
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_18
+; SSE2-NEXT:  # %bb.17: # %cond.load19
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_18: # %else20
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_20
+; SSE2-NEXT:  # %bb.19: # %cond.load22
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_20: # %else23
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_22
+; SSE2-NEXT:  # %bb.21: # %cond.load25
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_22: # %else26
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_24
+; SSE2-NEXT:  # %bb.23: # %cond.load28
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_24: # %else29
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_26
+; SSE2-NEXT:  # %bb.25: # %cond.load31
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_26: # %else32
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_28
+; SSE2-NEXT:  # %bb.27: # %cond.load34
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_28: # %else35
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_30
+; SSE2-NEXT:  # %bb.29: # %cond.load37
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_30: # %else38
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_32
+; SSE2-NEXT:  # %bb.31: # %cond.load40
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_32: # %else41
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_34
+; SSE2-NEXT:  # %bb.33: # %cond.load43
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_34: # %else44
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_36
+; SSE2-NEXT:  # %bb.35: # %cond.load46
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_36: # %else47
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_38
+; SSE2-NEXT:  # %bb.37: # %cond.load49
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:  .LBB12_38: # %else50
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_40
+; SSE2-NEXT:  # %bb.39: # %cond.load52
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm14
+; SSE2-NEXT:  .LBB12_40: # %else53
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_42
+; SSE2-NEXT:  # %bb.41: # %cond.load55
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm15
+; SSE2-NEXT:  .LBB12_42: # %else56
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_44
+; SSE2-NEXT:  # %bb.43: # %cond.load58
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm12
+; SSE2-NEXT:  .LBB12_44: # %else59
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_46
+; SSE2-NEXT:  # %bb.45: # %cond.load61
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm13
+; SSE2-NEXT:  .LBB12_46: # %else62
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_48
+; SSE2-NEXT:  # %bb.47: # %cond.load64
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm10
+; SSE2-NEXT:  .LBB12_48: # %else65
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_50
+; SSE2-NEXT:  # %bb.49: # %cond.load67
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm11
+; SSE2-NEXT:  .LBB12_50: # %else68
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_52
+; SSE2-NEXT:  # %bb.51: # %cond.load70
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm8
+; SSE2-NEXT:  .LBB12_52: # %else71
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_54
+; SSE2-NEXT:  # %bb.53: # %cond.load73
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm9
+; SSE2-NEXT:  .LBB12_54: # %else74
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_56
+; SSE2-NEXT:  # %bb.55: # %cond.load76
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm6
+; SSE2-NEXT:  .LBB12_56: # %else77
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_58
+; SSE2-NEXT:  # %bb.57: # %cond.load79
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm7
+; SSE2-NEXT:  .LBB12_58: # %else80
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_60
+; SSE2-NEXT:  # %bb.59: # %cond.load82
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:  .LBB12_60: # %else83
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_62
+; SSE2-NEXT:  # %bb.61: # %cond.load85
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:  .LBB12_62: # %else86
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    jne .LBB12_64
+; SSE2-NEXT:  # %bb.63: # %cond.load88
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:  .LBB12_64: # %else89
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    jne .LBB12_65
+; SSE2-NEXT:  # %bb.66: # %cond.load91
+; SSE2-NEXT:    movzwl (%rax), %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    jmp .LBB12_67
+; SSE2-NEXT:  .LBB12_65:
+; SSE2-NEXT:    movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:  .LBB12_67: # %else92
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebx, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebx, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebx, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebx, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebx, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebx, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebx, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %r14d
+; SSE2-NEXT:    orl %ebx, %r14d
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %ebx
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    orl %ebx, %eax
+; SSE2-NEXT:    shlq $32, %rax
+; SSE2-NEXT:    orq %r14, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    addq $200, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    retq
+;
+; BF16-LABEL: pr63017_2:
+; BF16:       # %bb.0:
+; BF16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; BF16-NEXT:    vmovdqu16 (%rax), %zmm0 {%k1}
+; BF16-NEXT:    retq
+  %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
+  ret <32 x bfloat> %1
+}
+
+declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)


        

