[llvm] [ScalarizeMaskedMemIntr] Optimize splat non-constant masks (PR #104537)
Krzysztof Drewniak via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 16 08:42:41 PDT 2024
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/104537
>From 04ae0cb77ccbaea3ab49098e09f61321ee698ac7 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 16 Aug 2024 01:38:18 +0000
Subject: [PATCH 1/2] [ScalarizeMaskedMemIntr] Optimize splat non-constant
masks
In cases (like the ones added in the tests) where the mask of a
masked load or store is a splat of a non-constant value (that is, where a
masked operation is being used to implement patterns like "load if the
current lane is in bounds, otherwise return 0"), optimize the
'scalarized' code into a single aligned vector load/store guarded by
whether the splatted value is true.
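For illustration, a minimal sketch of the rewrite, distilled from the
scalarize_v2i64_splat_mask tests updated below (value and block names are
illustrative):

  ; Before: the mask is the splat of a non-constant i1.
  %mask.vec = insertelement <2 x i1> poison, i1 %mask, i32 0
  %mask.splat = shufflevector <2 x i1> %mask.vec, <2 x i1> poison, <2 x i32> zeroinitializer
  %ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> %mask.splat, <2 x i64> %passthru)

  ; After: a single branch guarding an ordinary aligned vector load,
  ; with no per-lane bit tests.
  %mask.splat.first = extractelement <2 x i1> %mask.splat, i64 0
  br i1 %mask.splat.first, label %cond.load, label %join
  cond.load:
    %ret.cond.load = load <2 x i64>, ptr %p, align 8
    br label %join
  join:  ; %entry is the block containing the branch above
    %ret = phi <2 x i64> [ %ret.cond.load, %cond.load ], [ %passthru, %entry ]

The masked store case is handled the same way, with a conditional store
and no phi.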
While here, additionally take a few steps to preserve aliasing metadata
and value names when nothing is scalarized.
As motivation, some LLVM IR frontends will generate masked loads/stores
for cases that map to this kind of predicated operation (where either the
whole vector is loaded/stored or it isn't) in order to take advantage of
hardware primitives. On AMDGPU, however, where we don't have a masked load
or store instruction, this pass would scalarize a load or store that was
intended to be - and can be - vectorized, while also introducing expensive branches.
Fixes #104520
Pre-commit tests at #104527
---
.../Scalar/ScalarizeMaskedMemIntrin.cpp | 64 ++++++++++++++++++-
.../X86/expand-masked-load.ll | 34 +++-------
.../X86/expand-masked-store.ll | 25 ++------
3 files changed, 75 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 8eadf8900020d9..9cb7bad94c20bc 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -161,7 +162,9 @@ static void scalarizeMaskedLoad(const DataLayout &DL, CallInst *CI,
// Short-cut if the mask is all-true.
if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {
- Value *NewI = Builder.CreateAlignedLoad(VecType, Ptr, AlignVal);
+ LoadInst *NewI = Builder.CreateAlignedLoad(VecType, Ptr, AlignVal);
+ NewI->copyMetadata(*CI);
+ NewI->takeName(CI);
CI->replaceAllUsesWith(NewI);
CI->eraseFromParent();
return;
@@ -188,8 +191,39 @@ static void scalarizeMaskedLoad(const DataLayout &DL, CallInst *CI,
return;
}
+ // Optimize the case where the "masked load" is a predicated load - that is,
+ // where the mask is the splat of a non-constant scalar boolean. In that case,
+ // use that splatted value as the guard on a conditional vector load.
+ if (isSplatValue(Mask, /*Index=*/0)) {
+ Value *Predicate = Builder.CreateExtractElement(Mask, uint64_t(0ull),
+ Mask->getName() + ".first");
+ Instruction *ThenTerm =
+ SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
+ /*BranchWeights=*/nullptr, DTU);
+
+ BasicBlock *CondBlock = ThenTerm->getParent();
+ CondBlock->setName("cond.load");
+ Builder.SetInsertPoint(CondBlock->getTerminator());
+ LoadInst *Load = Builder.CreateAlignedLoad(VecType, Ptr, AlignVal,
+ CI->getName() + ".cond.load");
+ Load->copyMetadata(*CI);
+
+ BasicBlock *PostLoad = ThenTerm->getSuccessor(0);
+ Builder.SetInsertPoint(PostLoad, PostLoad->begin());
+ PHINode *Phi = Builder.CreatePHI(VecType, /*NumReservedValues=*/2);
+ Phi->addIncoming(Load, CondBlock);
+ Phi->addIncoming(Src0, IfBlock);
+ Phi->takeName(CI);
+
+ CI->replaceAllUsesWith(Phi);
+ CI->eraseFromParent();
+ ModifiedDT = true;
+ return;
+ }
// If the mask is not v1i1, use scalar bit test operations. This generates
// better results on X86 at least.
+ // Note: this produces worse code on AMDGPU, where the "i1" is implicitly SIMD
+ // - what's a good way to detect this?
Value *SclrMask;
if (VectorWidth != 1) {
Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
@@ -297,7 +331,9 @@ static void scalarizeMaskedStore(const DataLayout &DL, CallInst *CI,
// Short-cut if the mask is all-true.
if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {
- Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+ StoreInst *Store = Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+ Store->takeName(CI);
+ Store->copyMetadata(*CI);
CI->eraseFromParent();
return;
}
@@ -319,8 +355,31 @@ static void scalarizeMaskedStore(const DataLayout &DL, CallInst *CI,
return;
}
+ // Optimize the case where the "masked store" is a predicated store - that is,
+ // when the mask is the splat of a non-constant scalar boolean. In that case,
+ // optimize to a conditional store.
+ if (isSplatValue(Mask, /*Index=*/0)) {
+ Value *Predicate = Builder.CreateExtractElement(Mask, uint64_t(0ull),
+ Mask->getName() + ".first");
+ Instruction *ThenTerm =
+ SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
+ /*BranchWeights=*/nullptr, DTU);
+ BasicBlock *CondBlock = ThenTerm->getParent();
+ CondBlock->setName("cond.store");
+ Builder.SetInsertPoint(CondBlock->getTerminator());
+
+ StoreInst *Store = Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+ Store->takeName(CI);
+ Store->copyMetadata(*CI);
+
+ CI->eraseFromParent();
+ ModifiedDT = true;
+ return;
+ }
+
// If the mask is not v1i1, use scalar bit test operations. This generates
// better results on X86 at least.
+
Value *SclrMask;
if (VectorWidth != 1) {
Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
@@ -997,7 +1056,6 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
any_of(II->args(),
[](Value *V) { return isa<ScalableVectorType>(V->getType()); }))
return false;
-
switch (II->getIntrinsicID()) {
default:
break;
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll
index 9b1c59829b9ffb..fffb5f021e52d4 100644
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll
@@ -32,8 +32,8 @@ define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
define <2 x i64> @scalarize_v2i64_ones_mask(ptr %p, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64_ones_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
-; CHECK-NEXT: ret <2 x i64> [[TMP1]]
+; CHECK-NEXT: [[RET:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
+; CHECK-NEXT: ret <2 x i64> [[RET]]
;
%ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
ret <2 x i64> %ret
@@ -58,34 +58,18 @@ define <2 x i64> @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %passthru) {
ret <2 x i64> %ret
}
-; To be fixed: If the mask is the splat/broadcast of a non-constant value, use a
-; vector load
define <2 x i64> @scalarize_v2i64_splat_mask(ptr %p, i1 %mask, <2 x i64> %passthrough) {
; CHECK-LABEL: @scalarize_v2i64_splat_mask(
; CHECK-NEXT: [[MASK_VEC:%.*]] = insertelement <2 x i1> poison, i1 [[MASK:%.*]], i32 0
; CHECK-NEXT: [[MASK_SPLAT:%.*]] = shufflevector <2 x i1> [[MASK_VEC]], <2 x i1> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK_SPLAT]] to i2
-; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT: [[MASK_SPLAT_FIRST:%.*]] = extractelement <2 x i1> [[MASK_SPLAT]], i64 0
+; CHECK-NEXT: br i1 [[MASK_SPLAT_FIRST]], label [[COND_LOAD:%.*]], label [[TMP1:%.*]]
; CHECK: cond.load:
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[PASSTHROUGH:%.*]], i64 [[TMP4]], i64 0
-; CHECK-NEXT: br label [[ELSE]]
-; CHECK: else:
-; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHROUGH]], [[TMP0:%.*]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
-; CHECK-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
-; CHECK: cond.load1:
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[P]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP9]], i64 1
-; CHECK-NEXT: br label [[ELSE2]]
-; CHECK: else2:
-; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
-; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
+; CHECK-NEXT: [[RET_COND_LOAD:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
+; CHECK-NEXT: br label [[TMP1]]
+; CHECK: 1:
+; CHECK-NEXT: [[RET:%.*]] = phi <2 x i64> [ [[RET_COND_LOAD]], [[COND_LOAD]] ], [ [[PASSTHROUGH:%.*]], [[TMP0:%.*]] ]
+; CHECK-NEXT: ret <2 x i64> [[RET]]
;
%mask.vec = insertelement <2 x i1> poison, i1 %mask, i32 0
%mask.splat = shufflevector <2 x i1> %mask.vec, <2 x i1> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll
index cd2815e67e6720..4e3679dc5da99e 100644
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll
@@ -56,31 +56,16 @@ define void @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %data) {
ret void
}
-; To be fixed: If the mask is the splat/broadcast of a non-constant value, use a
-; vector store
define void @scalarize_v2i64_splat_mask(ptr %p, <2 x i64> %data, i1 %mask) {
; CHECK-LABEL: @scalarize_v2i64_splat_mask(
; CHECK-NEXT: [[MASK_VEC:%.*]] = insertelement <2 x i1> poison, i1 [[MASK:%.*]], i32 0
; CHECK-NEXT: [[MASK_SPLAT:%.*]] = shufflevector <2 x i1> [[MASK_VEC]], <2 x i1> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK_SPLAT]] to i2
-; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[TMP2]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT: [[MASK_SPLAT_FIRST:%.*]] = extractelement <2 x i1> [[MASK_SPLAT]], i64 0
+; CHECK-NEXT: br i1 [[MASK_SPLAT_FIRST]], label [[COND_STORE:%.*]], label [[TMP1:%.*]]
; CHECK: cond.store:
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 0
-; CHECK-NEXT: store i64 [[TMP3]], ptr [[TMP4]], align 8
-; CHECK-NEXT: br label [[ELSE]]
-; CHECK: else:
-; CHECK-NEXT: [[TMP5:%.*]] = and i2 [[SCALAR_MASK]], -2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i2 [[TMP5]], 0
-; CHECK-NEXT: br i1 [[TMP6]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
-; CHECK: cond.store1:
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[P]], i32 1
-; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP8]], align 8
-; CHECK-NEXT: br label [[ELSE2]]
-; CHECK: else2:
+; CHECK-NEXT: store <2 x i64> [[DATA:%.*]], ptr [[P:%.*]], align 8
+; CHECK-NEXT: br label [[TMP1]]
+; CHECK: 1:
; CHECK-NEXT: ret void
;
%mask.vec = insertelement <2 x i1> poison, i1 %mask, i32 0
>From 41cec411062d9542f4fdf5009351c062b2225991 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 16 Aug 2024 15:42:23 +0000
Subject: [PATCH 2/2] Update x86 tests that happen to use masked loads
---
llvm/test/CodeGen/X86/bfloat.ll | 586 +-------------------------
llvm/test/CodeGen/X86/shuffle-half.ll | 298 +------------
2 files changed, 15 insertions(+), 869 deletions(-)
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index ec76e8b05678b0..3759909a2ccc8e 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -759,347 +759,21 @@ define <32 x bfloat> @pr63017_2() nounwind {
;
; SSE2-LABEL: pr63017_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: subq $200, %rsp
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: testb %al, %al
; SSE2-NEXT: jne .LBB12_1
; SSE2-NEXT: # %bb.2: # %cond.load
; SSE2-NEXT: movzwl (%rax), %eax
; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movdqa %xmm0, %xmm15
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movdqa %xmm0, %xmm13
-; SSE2-NEXT: movdqa %xmm0, %xmm14
-; SSE2-NEXT: movdqa %xmm0, %xmm11
-; SSE2-NEXT: movdqa %xmm0, %xmm12
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: movdqa %xmm0, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: jmp .LBB12_3
; SSE2-NEXT: .LBB12_1:
-; SSE2-NEXT: movd {{.*#+}} xmm2 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movdqa %xmm2, %xmm15
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movdqa %xmm2, %xmm13
-; SSE2-NEXT: movdqa %xmm2, %xmm14
-; SSE2-NEXT: movdqa %xmm2, %xmm11
-; SSE2-NEXT: movdqa %xmm2, %xmm12
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: .LBB12_3: # %else
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_5
-; SSE2-NEXT: # %bb.4: # %cond.load1
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: .LBB12_5: # %else2
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_7
-; SSE2-NEXT: # %bb.6: # %cond.load4
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_7: # %else5
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_9
-; SSE2-NEXT: # %bb.8: # %cond.load7
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_9: # %else8
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_11
-; SSE2-NEXT: # %bb.10: # %cond.load10
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_11: # %else11
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_13
-; SSE2-NEXT: # %bb.12: # %cond.load13
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_13: # %else14
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_15
-; SSE2-NEXT: # %bb.14: # %cond.load16
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_15: # %else17
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_17
-; SSE2-NEXT: # %bb.16: # %cond.load19
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_17: # %else20
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_19
-; SSE2-NEXT: # %bb.18: # %cond.load22
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_19: # %else23
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_21
-; SSE2-NEXT: # %bb.20: # %cond.load25
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_21: # %else26
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_23
-; SSE2-NEXT: # %bb.22: # %cond.load28
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_23: # %else29
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_25
-; SSE2-NEXT: # %bb.24: # %cond.load31
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_25: # %else32
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_27
-; SSE2-NEXT: # %bb.26: # %cond.load34
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_27: # %else35
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_29
-; SSE2-NEXT: # %bb.28: # %cond.load37
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_29: # %else38
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_31
-; SSE2-NEXT: # %bb.30: # %cond.load40
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_31: # %else41
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_33
-; SSE2-NEXT: # %bb.32: # %cond.load43
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_33: # %else44
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_35
-; SSE2-NEXT: # %bb.34: # %cond.load46
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm15
-; SSE2-NEXT: .LBB12_35: # %else47
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_37
-; SSE2-NEXT: # %bb.36: # %cond.load49
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: .LBB12_37: # %else50
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_39
-; SSE2-NEXT: # %bb.38: # %cond.load52
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm13
-; SSE2-NEXT: .LBB12_39: # %else53
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_41
-; SSE2-NEXT: # %bb.40: # %cond.load55
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm14
-; SSE2-NEXT: .LBB12_41: # %else56
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_43
-; SSE2-NEXT: # %bb.42: # %cond.load58
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm11
-; SSE2-NEXT: .LBB12_43: # %else59
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_45
-; SSE2-NEXT: # %bb.44: # %cond.load61
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm12
-; SSE2-NEXT: .LBB12_45: # %else62
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_47
-; SSE2-NEXT: # %bb.46: # %cond.load64
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm9
-; SSE2-NEXT: .LBB12_47: # %else65
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_49
-; SSE2-NEXT: # %bb.48: # %cond.load67
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm10
-; SSE2-NEXT: .LBB12_49: # %else68
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_51
-; SSE2-NEXT: # %bb.50: # %cond.load70
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm7
-; SSE2-NEXT: .LBB12_51: # %else71
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_53
-; SSE2-NEXT: # %bb.52: # %cond.load73
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm8
-; SSE2-NEXT: .LBB12_53: # %else74
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_55
-; SSE2-NEXT: # %bb.54: # %cond.load76
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: .LBB12_55: # %else77
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_57
-; SSE2-NEXT: # %bb.56: # %cond.load79
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm6
-; SSE2-NEXT: .LBB12_57: # %else80
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_59
-; SSE2-NEXT: # %bb.58: # %cond.load82
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: .LBB12_59: # %else83
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_61
-; SSE2-NEXT: # %bb.60: # %cond.load85
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: .LBB12_61: # %else86
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: jne .LBB12_63
-; SSE2-NEXT: # %bb.62: # %cond.load88
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: .LBB12_63: # %else89
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: testb %al, %al
-; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: jne .LBB12_64
-; SSE2-NEXT: # %bb.65: # %cond.load91
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: jmp .LBB12_66
-; SSE2-NEXT: .LBB12_64:
-; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: .LBB12_66: # %else92
+; SSE2-NEXT: movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: .LBB12_3:
+; SSE2-NEXT: pushq %r14
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: subq $88, %rsp
+; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: callq __truncsfbf2 at PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
@@ -1316,7 +990,7 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NEXT: addq $200, %rsp
+; SSE2-NEXT: addq $88, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: retq
@@ -1329,250 +1003,14 @@ define <32 x bfloat> @pr63017_2() nounwind {
;
; AVXNC-LABEL: pr63017_2:
; AVXNC: # %bb.0:
-; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; AVXNC-NEXT: vbroadcastss {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
; AVXNC-NEXT: xorl %eax, %eax
; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: vmovdqa %ymm0, %ymm1
; AVXNC-NEXT: jne .LBB12_2
; AVXNC-NEXT: # %bb.1: # %cond.load
-; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm1 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
-; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
-; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT: .LBB12_2: # %else
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_4
-; AVXNC-NEXT: # %bb.3: # %cond.load1
-; AVXNC-NEXT: vpinsrw $1, (%rax), %xmm0, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT: .LBB12_4: # %else2
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_6
-; AVXNC-NEXT: # %bb.5: # %cond.load4
-; AVXNC-NEXT: vpinsrw $2, (%rax), %xmm0, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT: .LBB12_6: # %else5
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_8
-; AVXNC-NEXT: # %bb.7: # %cond.load7
-; AVXNC-NEXT: vpinsrw $3, (%rax), %xmm0, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT: .LBB12_8: # %else8
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_10
-; AVXNC-NEXT: # %bb.9: # %cond.load10
-; AVXNC-NEXT: vpinsrw $4, (%rax), %xmm0, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT: .LBB12_10: # %else11
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_12
-; AVXNC-NEXT: # %bb.11: # %cond.load13
-; AVXNC-NEXT: vpinsrw $5, (%rax), %xmm0, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT: .LBB12_12: # %else14
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_14
-; AVXNC-NEXT: # %bb.13: # %cond.load16
-; AVXNC-NEXT: vpinsrw $6, (%rax), %xmm0, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT: .LBB12_14: # %else17
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_16
-; AVXNC-NEXT: # %bb.15: # %cond.load19
-; AVXNC-NEXT: vpinsrw $7, (%rax), %xmm0, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVXNC-NEXT: .LBB12_16: # %else20
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_18
-; AVXNC-NEXT: # %bb.17: # %cond.load22
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_18: # %else23
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_20
-; AVXNC-NEXT: # %bb.19: # %cond.load25
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_20: # %else26
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_22
-; AVXNC-NEXT: # %bb.21: # %cond.load28
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4,5,6,7,8,9],ymm2[10],ymm0[11,12,13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_22: # %else29
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_24
-; AVXNC-NEXT: # %bb.23: # %cond.load31
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_24: # %else32
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_26
-; AVXNC-NEXT: # %bb.25: # %cond.load34
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7,8,9,10,11],ymm2[12],ymm0[13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_26: # %else35
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_28
-; AVXNC-NEXT: # %bb.27: # %cond.load37
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7,8,9,10,11,12],ymm2[13],ymm0[14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_28: # %else38
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_30
-; AVXNC-NEXT: # %bb.29: # %cond.load40
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_30: # %else41
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_32
-; AVXNC-NEXT: # %bb.31: # %cond.load43
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_32: # %else44
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_34
-; AVXNC-NEXT: # %bb.33: # %cond.load46
-; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm1, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT: .LBB12_34: # %else47
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_36
-; AVXNC-NEXT: # %bb.35: # %cond.load49
-; AVXNC-NEXT: vpinsrw $1, (%rax), %xmm1, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT: .LBB12_36: # %else50
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_38
-; AVXNC-NEXT: # %bb.37: # %cond.load52
-; AVXNC-NEXT: vpinsrw $2, (%rax), %xmm1, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT: .LBB12_38: # %else53
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_40
-; AVXNC-NEXT: # %bb.39: # %cond.load55
-; AVXNC-NEXT: vpinsrw $3, (%rax), %xmm1, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT: .LBB12_40: # %else56
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_42
-; AVXNC-NEXT: # %bb.41: # %cond.load58
-; AVXNC-NEXT: vpinsrw $4, (%rax), %xmm1, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT: .LBB12_42: # %else59
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_44
-; AVXNC-NEXT: # %bb.43: # %cond.load61
-; AVXNC-NEXT: vpinsrw $5, (%rax), %xmm1, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT: .LBB12_44: # %else62
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_46
-; AVXNC-NEXT: # %bb.45: # %cond.load64
-; AVXNC-NEXT: vpinsrw $6, (%rax), %xmm1, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT: .LBB12_46: # %else65
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_48
-; AVXNC-NEXT: # %bb.47: # %cond.load67
-; AVXNC-NEXT: vpinsrw $7, (%rax), %xmm1, %xmm2
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVXNC-NEXT: .LBB12_48: # %else68
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_50
-; AVXNC-NEXT: # %bb.49: # %cond.load70
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_50: # %else71
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_52
-; AVXNC-NEXT: # %bb.51: # %cond.load73
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_52: # %else74
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_54
-; AVXNC-NEXT: # %bb.53: # %cond.load76
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_54: # %else77
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_56
-; AVXNC-NEXT: # %bb.55: # %cond.load79
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_56: # %else80
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_58
-; AVXNC-NEXT: # %bb.57: # %cond.load82
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_58: # %else83
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_60
-; AVXNC-NEXT: # %bb.59: # %cond.load85
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7,8,9,10,11,12],ymm2[13],ymm1[14,15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_60: # %else86
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_62
-; AVXNC-NEXT: # %bb.61: # %cond.load88
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_62: # %else89
-; AVXNC-NEXT: xorl %eax, %eax
-; AVXNC-NEXT: testb %al, %al
-; AVXNC-NEXT: jne .LBB12_64
-; AVXNC-NEXT: # %bb.63: # %cond.load91
-; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
-; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15]
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVXNC-NEXT: .LBB12_64: # %else92
+; AVXNC-NEXT: vmovups (%rax), %ymm0
+; AVXNC-NEXT: .LBB12_2:
+; AVXNC-NEXT: vmovaps %ymm0, %ymm1
; AVXNC-NEXT: retq
%1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
ret <32 x bfloat> %1
diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll
index 291fe841043ed4..001db2c7cecae1 100644
--- a/llvm/test/CodeGen/X86/shuffle-half.ll
+++ b/llvm/test/CodeGen/X86/shuffle-half.ll
@@ -4,305 +4,13 @@
define <32 x half> @dump_vec() {
; CHECK-LABEL: dump_vec:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.1: # %cond.load
-; CHECK-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0
-; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; CHECK-NEXT: .LBB0_2: # %else
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_4
-; CHECK-NEXT: # %bb.3: # %cond.load1
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_4: # %else2
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_6
-; CHECK-NEXT: # %bb.5: # %cond.load4
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_6: # %else5
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_8
-; CHECK-NEXT: # %bb.7: # %cond.load7
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_8: # %else8
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_10
-; CHECK-NEXT: # %bb.9: # %cond.load10
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_10: # %else11
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_12
-; CHECK-NEXT: # %bb.11: # %cond.load13
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_12: # %else14
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_14
-; CHECK-NEXT: # %bb.13: # %cond.load16
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_14: # %else17
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_16
-; CHECK-NEXT: # %bb.15: # %cond.load19
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_16: # %else20
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_18
-; CHECK-NEXT: # %bb.17: # %cond.load22
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; CHECK-NEXT: .LBB0_18: # %else23
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_20
-; CHECK-NEXT: # %bb.19: # %cond.load25
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; CHECK-NEXT: .LBB0_20: # %else26
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_22
-; CHECK-NEXT: # %bb.21: # %cond.load28
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; CHECK-NEXT: .LBB0_22: # %else29
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_24
-; CHECK-NEXT: # %bb.23: # %cond.load31
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; CHECK-NEXT: .LBB0_24: # %else32
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_26
-; CHECK-NEXT: # %bb.25: # %cond.load34
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; CHECK-NEXT: .LBB0_26: # %else35
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_28
-; CHECK-NEXT: # %bb.27: # %cond.load37
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; CHECK-NEXT: .LBB0_28: # %else38
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_30
-; CHECK-NEXT: # %bb.29: # %cond.load40
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; CHECK-NEXT: .LBB0_30: # %else41
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_32
-; CHECK-NEXT: # %bb.31: # %cond.load43
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; CHECK-NEXT: .LBB0_32: # %else44
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_34
-; CHECK-NEXT: # %bb.33: # %cond.load46
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_34: # %else47
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_36
-; CHECK-NEXT: # %bb.35: # %cond.load49
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_36: # %else50
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_38
-; CHECK-NEXT: # %bb.37: # %cond.load52
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_38: # %else53
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_40
-; CHECK-NEXT: # %bb.39: # %cond.load55
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_40: # %else56
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_42
-; CHECK-NEXT: # %bb.41: # %cond.load58
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_42: # %else59
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_44
-; CHECK-NEXT: # %bb.43: # %cond.load61
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_44: # %else62
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_46
-; CHECK-NEXT: # %bb.45: # %cond.load64
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_46: # %else65
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_48
-; CHECK-NEXT: # %bb.47: # %cond.load67
-; CHECK-NEXT: vpbroadcastw (%rax), %xmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_48: # %else68
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_50
-; CHECK-NEXT: # %bb.49: # %cond.load70
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_50: # %else71
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_52
-; CHECK-NEXT: # %bb.51: # %cond.load73
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7,8],ymm1[9],ymm2[10,11,12,13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_52: # %else74
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_54
-; CHECK-NEXT: # %bb.53: # %cond.load76
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7,8,9],ymm1[10],ymm2[11,12,13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_54: # %else77
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_56
-; CHECK-NEXT: # %bb.55: # %cond.load79
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7,8,9,10],ymm1[11],ymm2[12,13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_56: # %else80
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_58
-; CHECK-NEXT: # %bb.57: # %cond.load82
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_58: # %else83
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_60
-; CHECK-NEXT: # %bb.59: # %cond.load85
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7,8,9,10,11,12],ymm1[13],ymm2[14,15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_60: # %else86
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_62
-; CHECK-NEXT: # %bb.61: # %cond.load88
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_62: # %else89
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_64
-; CHECK-NEXT: # %bb.63: # %cond.load91
-; CHECK-NEXT: vpbroadcastw (%rax), %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: .LBB0_64: # %else92
+; CHECK-NEXT: vmovups (%rax), %zmm0
+; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: retq
%1 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x half> <half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0, half 0.0>)
ret <32 x half> %1