[llvm] [X86][DAGCombiner][SelectionDAG] - Fold Zext Build Vector to Bitcast of widen Build Vector (PR #135010)
Rohit Aggarwal via llvm-commits
llvm-commits at lists.llvm.org
Mon May 5 23:03:12 PDT 2025
https://github.com/rohitaggarwal007 updated https://github.com/llvm/llvm-project/pull/135010
>From e5c19148b23a2e666c79f2f1bd21661b7dbeeb4e Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Wed, 9 Apr 2025 19:10:28 +0530
Subject: [PATCH 1/6] [X86][DAGCombiner][SelectionDAG] - Fold Zext Build Vector
to Bitcast of widen Build Vector
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 58 ++++
llvm/test/CodeGen/X86/WidenBuildVector.ll | 258 ++++++++++++++++++
2 files changed, 316 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/WidenBuildVector.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 38376de5783ae..77c659aad0ed2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14195,6 +14195,61 @@ static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) {
return DAG.getZExtOrTrunc(NewAbs, SDLoc(Extend), VT);
}
+// Try to widen a BUILD_VECTOR that feeds a ZERO_EXTEND and bitcast the widened
+// vector to the zext's type, replacing the zext (the stated intent is to avoid
+// extra work the zext causes during lowering). Only applies when the widened
+// type is a 128-bit vector as large as the zext result; else returns SDValue().
+static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
+
+ assert(Extend->getOpcode() == ISD::ZERO_EXTEND && "Expected zero extend.");
+
+ EVT ExtendVT = Extend->getValueType(0); // Result type of the zext.
+
+ SDValue BV = Extend->getOperand(0); // Value being zero-extended.
+ if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
+ return SDValue(); // Only a single-use BUILD_VECTOR operand is handled.
+
+ SDLoc dl(BV);
+ EVT VT = BV.getValueType(); // Source (narrow) vector type.
+ EVT EltVT = BV.getOperand(0).getValueType(); // Operand type, reused for the zero padding.
+ unsigned NumElts = VT.getVectorNumElements();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end()); // Start from the original operands.
+ assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
+ // Pad with zero constants up to the widened element count.
+ NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
+ // Compute the step to place the elements in the right place and control the
+ // iteration.
+ unsigned step = WidenNumElts / NumElts;
+ if (WidenVT.is128BitVector()) {
+ if (Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
+ for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
+ i--, j -= step) {
+ SDValue temp = NewOps[i];
+ NewOps[i] = NewOps[j]; // Swap slots i and j, walking down from the last element, so the
+ NewOps[j] = temp; // original operands end up `step` slots apart with zeros between.
+ }
+ // Build the widened vector from the rearranged operands.
+ SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
+ // Bitcast the widened build vector to the zext's type and redirect every
+ // use of the zext to that bitcast.
+ SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV); // NOTE(review): this layout looks little-endian specific - confirm for big-endian targets.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
+ LLVM_DEBUG(
+ dbgs() << DAG.getMachineFunction().getFunction().getName()
+ << " - Widening buildvector and replace zext with bitcast\n";
+ BV.dump(); Extend->dump(); dbgs() << " to \n";
+ NewBV.getNode()->dump(); NewBVBitcast->dump(););
+ return NewBV; // NOTE(review): returns the WidenVT node, not the ExtendVT bitcast - confirm this is intended.
+ }
+ }
+ return SDValue(); // No transformation performed.
+}
+
SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -14521,6 +14576,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
return SDValue(CSENode, 0);
}
+ if (SDValue V = widenBuildVec(N, DAG))
+ return V;
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/X86/WidenBuildVector.ll b/llvm/test/CodeGen/X86/WidenBuildVector.ll
new file mode 100644
index 0000000000000..d2924d016a1bf
--- /dev/null
+++ b/llvm/test/CodeGen/X86/WidenBuildVector.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mcpu=znver5 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov8i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx
+; CHECK-NEXT: leaq (%rsi,%rsi,4), %r8
+; CHECK-NEXT: leaq (,%rsi,8), %r9
+; CHECK-NEXT: subq %rsi, %r9
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0
+; CHECK-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: retq
+entry:
+ %var0 = load i8, ptr %a, align 1
+ %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+ %var1 = load i8, ptr %arrayidx.1, align 1
+ %mul.2 = shl nsw i64 %a_stride, 1
+ %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
+ %var2 = load i8, ptr %arrayidx.2, align 1
+ %mul.3 = mul nsw i64 %a_stride, 3
+ %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
+ %var3 = load i8, ptr %arrayidx.3, align 1
+ %mul.4 = shl nsw i64 %a_stride, 2
+ %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 %mul.4
+ %var4 = load i8, ptr %arrayidx.4, align 1
+ %mul.5 = mul nsw i64 %a_stride, 5
+ %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 %mul.5
+ %var5 = load i8, ptr %arrayidx.5, align 1
+ %mul.6 = mul nsw i64 %a_stride, 6
+ %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 %mul.6
+ %var6 = load i8, ptr %arrayidx.6, align 1
+ %mul.7 = mul nsw i64 %a_stride, 7
+ %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 %mul.7
+ %var7 = load i8, ptr %arrayidx.7, align 1
+ %var8 = insertelement <8 x i8> poison, i8 %var0, i64 0
+ %var9 = insertelement <8 x i8> %var8, i8 %var1, i64 1
+ %var10 = insertelement <8 x i8> %var9, i8 %var2, i64 2
+ %var11 = insertelement <8 x i8> %var10, i8 %var3, i64 3
+ %var12 = insertelement <8 x i8> %var11, i8 %var4, i64 4
+ %var13 = insertelement <8 x i8> %var12, i8 %var5, i64 5
+ %var14 = insertelement <8 x i8> %var13, i8 %var6, i64 6
+ %var15 = insertelement <8 x i8> %var14, i8 %var7, i64 7
+ %var16 = zext <8 x i8> %var15 to <8 x i32>
+ %var17 = load <8 x i16>, ptr %b, align 2
+ %var18 = sext <8 x i16> %var17 to <8 x i32>
+ %var19 = mul nsw <8 x i32> %var18, %var16
+ %var20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %var19)
+ ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov4i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0
+; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: retq
+entry:
+ %var0 = load i8, ptr %a, align 1
+ %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+ %var1 = load i8, ptr %arrayidx.1, align 1
+ %mul.2 = shl nsw i64 %a_stride, 1
+ %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
+ %var2 = load i8, ptr %arrayidx.2, align 1
+ %mul.3 = mul nsw i64 %a_stride, 3
+ %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
+ %var3 = load i8, ptr %arrayidx.3, align 1
+ %var8 = insertelement <4 x i8> poison, i8 %var0, i64 0
+ %var9 = insertelement <4 x i8> %var8, i8 %var1, i64 1
+ %var10 = insertelement <4 x i8> %var9, i8 %var2, i64 2
+ %var11 = insertelement <4 x i8> %var10, i8 %var3, i64 3
+ %var16 = zext <4 x i8> %var11 to <4 x i32>
+ %var17 = load <4 x i16>, ptr %b, align 2
+ %var18 = sext <4 x i16> %var17 to <4 x i32>
+ %var19 = mul nsw <4 x i32> %var18, %var16
+ %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
+ ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov2i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: retq
+ %var0 = load i8, ptr %a, align 1
+ %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+ %var1 = load i8, ptr %arrayidx.1, align 1
+ %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
+ %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
+ %var16 = zext <2 x i8> %var9 to <2 x i32>
+ %var17 = load <2 x i16>, ptr %b, align 2
+ %var18 = sext <2 x i16> %var17 to <2 x i32>
+ %var19 = mul nsw <2 x i32> %var18, %var16
+ %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
+ ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i64 @foov2i8_v2i64(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov2i8_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: vpmovsxbq (%rdx), %xmm1
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vpmuldq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: retq
+ %var0 = load i8, ptr %a, align 1
+ %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+ %var1 = load i8, ptr %arrayidx.1, align 1
+ %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
+ %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
+ %var16 = zext <2 x i8> %var9 to <2 x i64>
+ %var17 = load <2 x i8>, ptr %b, align 2
+ %var18 = sext <2 x i8> %var17 to <2 x i64>
+ %var19 = mul nsw <2 x i64> %var18, %var16
+ %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
+ ret i64 %var20
+}
+
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov4i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov4i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx
+; CHECK-NEXT: vpmovsxwd (%rdx), %xmm1
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0
+; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: retq
+entry:
+ %var0 = load i16, ptr %a, align 1
+ %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+ %var1 = load i16, ptr %arrayidx.1, align 1
+ %mul.2 = shl nsw i64 %a_stride, 1
+ %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
+ %var2 = load i16, ptr %arrayidx.2, align 1
+ %mul.3 = mul nsw i64 %a_stride, 3
+ %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
+ %var3 = load i16, ptr %arrayidx.3, align 1
+ %var8 = insertelement <4 x i16> poison, i16 %var0, i64 0
+ %var9 = insertelement <4 x i16> %var8, i16 %var1, i64 1
+ %var10 = insertelement <4 x i16> %var9, i16 %var2, i64 2
+ %var11 = insertelement <4 x i16> %var10, i16 %var3, i64 3
+ %var16 = zext <4 x i16> %var11 to <4 x i32>
+ %var17 = load <4 x i16>, ptr %b, align 2
+ %var18 = sext <4 x i16> %var17 to <4 x i32>
+ %var19 = mul nsw <4 x i32> %var18, %var16
+ %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
+ ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i32 @foov2i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxwd %xmm1, %xmm1
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: retq
+ %var0 = load i16, ptr %a, align 1
+ %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+ %var1 = load i16, ptr %arrayidx.1, align 1
+ %var8 = insertelement <2 x i16> poison, i16 %var0, i64 0
+ %var9 = insertelement <2 x i16> %var8, i16 %var1, i64 1
+ %var16 = zext <2 x i16> %var9 to <2 x i32>
+ %var17 = load <2 x i16>, ptr %b, align 2
+ %var18 = sext <2 x i16> %var17 to <2 x i32>
+ %var19 = mul nsw <2 x i32> %var18, %var16
+ %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
+ ret i32 %var20
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
+define dso_local i64 @foov2i32(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
+; CHECK-LABEL: foov2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vpmovsxdq (%rdx), %xmm1
+; CHECK-NEXT: vpmullq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: retq
+ %var0 = load i32, ptr %a, align 1
+ %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
+ %var1 = load i32, ptr %arrayidx.1, align 1
+ %var8 = insertelement <2 x i32> poison, i32 %var0, i64 0
+ %var9 = insertelement <2 x i32> %var8, i32 %var1, i64 1
+ %var16 = zext <2 x i32> %var9 to <2 x i64>
+ %var17 = load <2 x i32>, ptr %b, align 2
+ %var18 = sext <2 x i32> %var17 to <2 x i64>
+ %var19 = mul nsw <2 x i64> %var18, %var16
+ %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
+ ret i64 %var20
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #1
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) #1
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) #1
>From 483c27322e50799ff99c4142f1e1f087b7fb810f Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Wed, 16 Apr 2025 19:17:52 +0530
Subject: [PATCH 2/6] Fix for test cases failure
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 +-
llvm/test/CodeGen/PowerPC/custom-stov.ll | 16 +--
llvm/test/CodeGen/PowerPC/pre-inc-disable.ll | 104 ++++++++++--------
llvm/test/CodeGen/SystemZ/vec-mul-07.ll | 30 ++++-
llvm/test/CodeGen/SystemZ/vec-mul-09.ll | 30 ++++-
llvm/test/CodeGen/WebAssembly/interleave.ll | 63 ++++++-----
llvm/test/CodeGen/X86/buildvec-insertvec.ll | 8 +-
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 53 ++-------
8 files changed, 166 insertions(+), 145 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 77c659aad0ed2..96b5f666ba9e5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14215,6 +14215,11 @@ static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
unsigned NumElts = VT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+ TargetLowering::TypeWidenVector)
+ return SDValue();
+
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
@@ -14226,7 +14231,7 @@ static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
// iteration.
unsigned step = WidenNumElts / NumElts;
if (WidenVT.is128BitVector()) {
- if (Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
+ if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
i--, j -= step) {
SDValue temp = NewOps[i];
diff --git a/llvm/test/CodeGen/PowerPC/custom-stov.ll b/llvm/test/CodeGen/PowerPC/custom-stov.ll
index 0642fa900b0e5..d1bcc73fd212a 100644
--- a/llvm/test/CodeGen/PowerPC/custom-stov.ll
+++ b/llvm/test/CodeGen/PowerPC/custom-stov.ll
@@ -15,18 +15,18 @@ define void @_blah() {
; CHECK-NEXT: vperm v2, v4, v3, v2
; CHECK-NEXT: lwz r4, 16(0)
; CHECK-NEXT: stvx v2, 0, r5
-; CHECK-NEXT: lhz r5, -64(r1)
-; CHECK-NEXT: lhz r6, -58(r1)
-; CHECK-NEXT: lhz r7, -52(r1)
-; CHECK-NEXT: sth r4, -34(r1)
-; CHECK-NEXT: sth r3, -36(r1)
+; CHECK-NEXT: sth r3, -34(r1)
+; CHECK-NEXT: sth r3, -38(r1)
+; CHECK-NEXT: sth r3, -42(r1)
+; CHECK-NEXT: sth r3, -46(r1)
+; CHECK-NEXT: lhz r3, -52(r1)
; CHECK-NEXT: sth r3, -40(r1)
+; CHECK-NEXT: lhz r3, -58(r1)
; CHECK-NEXT: sth r3, -44(r1)
+; CHECK-NEXT: lhz r3, -64(r1)
+; CHECK-NEXT: sth r4, -36(r1)
; CHECK-NEXT: sth r3, -48(r1)
; CHECK-NEXT: addi r3, r1, -48
-; CHECK-NEXT: sth r7, -38(r1)
-; CHECK-NEXT: sth r6, -42(r1)
-; CHECK-NEXT: sth r5, -46(r1)
; CHECK-NEXT: lvx v2, 0, r3
; CHECK-NEXT: addi r3, r1, -32
; CHECK-NEXT: vsldoi v3, v2, v2, 8
diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
index 4435484ae0b94..d668868d41aa0 100644
--- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
+++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
@@ -327,9 +327,9 @@ define void @test16(ptr nocapture readonly %sums, i32 signext %delta, i32 signex
; P9BE-AIX32-NEXT: sth 3, -32(1)
; P9BE-AIX32-NEXT: lwz 3, L..C3(2) # %const.0
; P9BE-AIX32-NEXT: lxv 3, -32(1)
-; P9BE-AIX32-NEXT: vmrghh 4, 2, 4
+; P9BE-AIX32-NEXT: vmrghh 4, 4, 2
; P9BE-AIX32-NEXT: lxv 0, 0(3)
-; P9BE-AIX32-NEXT: vmrghh 3, 2, 3
+; P9BE-AIX32-NEXT: vmrghh 3, 3, 2
; P9BE-AIX32-NEXT: vsplth 2, 2, 0
; P9BE-AIX32-NEXT: xxmrghw 2, 2, 4
; P9BE-AIX32-NEXT: xxperm 3, 2, 0
@@ -403,25 +403,29 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext
; P9BE-LABEL: test8:
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: add 6, 3, 4
-; P9BE-NEXT: li 7, 8
-; P9BE-NEXT: lxsibzx 3, 3, 4
+; P9BE-NEXT: lxsibzx 2, 3, 4
+; P9BE-NEXT: addis 3, 2, .LCPI3_0 at toc@ha
+; P9BE-NEXT: addi 3, 3, .LCPI3_0 at toc@l
+; P9BE-NEXT: lxv 0, 0(3)
+; P9BE-NEXT: li 3, 0
+; P9BE-NEXT: mtvsrwz 3, 3
+; P9BE-NEXT: li 3, 8
+; P9BE-NEXT: vspltb 4, 3, 7
+; P9BE-NEXT: xxperm 2, 3, 0
+; P9BE-NEXT: lxsibzx 0, 6, 3
; P9BE-NEXT: addis 3, 2, .LCPI3_1 at toc@ha
-; P9BE-NEXT: lxsibzx 0, 6, 7
-; P9BE-NEXT: addis 6, 2, .LCPI3_0 at toc@ha
; P9BE-NEXT: addi 3, 3, .LCPI3_1 at toc@l
-; P9BE-NEXT: addi 6, 6, .LCPI3_0 at toc@l
-; P9BE-NEXT: lxv 1, 0(6)
-; P9BE-NEXT: li 6, 0
-; P9BE-NEXT: mtvsrwz 2, 6
-; P9BE-NEXT: xxperm 0, 2, 1
-; P9BE-NEXT: xxperm 3, 2, 1
-; P9BE-NEXT: vspltb 2, 2, 7
-; P9BE-NEXT: vmrghh 3, 3, 2
-; P9BE-NEXT: xxspltw 1, 2, 0
-; P9BE-NEXT: xxmrghw 3, 3, 0
+; P9BE-NEXT: vmrghh 2, 4, 2
+; P9BE-NEXT: lxv 1, 0(3)
+; P9BE-NEXT: addis 3, 2, .LCPI3_2 at toc@ha
+; P9BE-NEXT: addi 3, 3, .LCPI3_2 at toc@l
+; P9BE-NEXT: xxmrghw 2, 4, 2
+; P9BE-NEXT: xxperm 3, 0, 1
; P9BE-NEXT: lxv 0, 0(3)
; P9BE-NEXT: li 3, 0
-; P9BE-NEXT: xxperm 3, 1, 0
+; P9BE-NEXT: vmrghh 3, 4, 3
+; P9BE-NEXT: xxmrghw 3, 3, 4
+; P9BE-NEXT: xxperm 3, 2, 0
; P9BE-NEXT: xxspltw 2, 3, 1
; P9BE-NEXT: vadduwm 2, 3, 2
; P9BE-NEXT: vextuwlx 3, 3, 2
@@ -432,23 +436,26 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext
; P9BE-AIX-LABEL: test8:
; P9BE-AIX: # %bb.0: # %entry
; P9BE-AIX-NEXT: add 6, 3, 4
-; P9BE-AIX-NEXT: li 7, 8
-; P9BE-AIX-NEXT: lxsibzx 3, 3, 4
-; P9BE-AIX-NEXT: ld 3, L..C5(2) # %const.1
-; P9BE-AIX-NEXT: lxsibzx 0, 6, 7
-; P9BE-AIX-NEXT: ld 6, L..C6(2) # %const.0
-; P9BE-AIX-NEXT: lxv 1, 0(6)
-; P9BE-AIX-NEXT: li 6, 0
-; P9BE-AIX-NEXT: mtvsrwz 2, 6
-; P9BE-AIX-NEXT: xxperm 0, 2, 1
-; P9BE-AIX-NEXT: xxperm 3, 2, 1
-; P9BE-AIX-NEXT: vspltb 2, 2, 7
-; P9BE-AIX-NEXT: vmrghh 3, 3, 2
-; P9BE-AIX-NEXT: xxspltw 1, 2, 0
-; P9BE-AIX-NEXT: xxmrghw 3, 3, 0
+; P9BE-AIX-NEXT: lxsibzx 2, 3, 4
+; P9BE-AIX-NEXT: ld 3, L..C5(2) # %const.0
+; P9BE-AIX-NEXT: lxv 0, 0(3)
+; P9BE-AIX-NEXT: li 3, 0
+; P9BE-AIX-NEXT: mtvsrwz 3, 3
+; P9BE-AIX-NEXT: li 3, 8
+; P9BE-AIX-NEXT: vspltb 4, 3, 7
+; P9BE-AIX-NEXT: xxperm 2, 3, 0
+; P9BE-AIX-NEXT: lxsibzx 0, 6, 3
+; P9BE-AIX-NEXT: ld 3, L..C6(2) # %const.1
+; P9BE-AIX-NEXT: vmrghh 2, 4, 2
+; P9BE-AIX-NEXT: lxv 1, 0(3)
+; P9BE-AIX-NEXT: ld 3, L..C7(2) # %const.2
+; P9BE-AIX-NEXT: xxmrghw 2, 4, 2
+; P9BE-AIX-NEXT: xxperm 3, 0, 1
; P9BE-AIX-NEXT: lxv 0, 0(3)
; P9BE-AIX-NEXT: li 3, 0
-; P9BE-AIX-NEXT: xxperm 3, 1, 0
+; P9BE-AIX-NEXT: vmrghh 3, 4, 3
+; P9BE-AIX-NEXT: xxmrghw 3, 3, 4
+; P9BE-AIX-NEXT: xxperm 3, 2, 0
; P9BE-AIX-NEXT: xxspltw 2, 3, 1
; P9BE-AIX-NEXT: vadduwm 2, 3, 2
; P9BE-AIX-NEXT: vextuwlx 3, 3, 2
@@ -459,22 +466,25 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext
; P9BE-AIX32-LABEL: test8:
; P9BE-AIX32: # %bb.0: # %entry
; P9BE-AIX32-NEXT: add 6, 3, 4
-; P9BE-AIX32-NEXT: li 7, 8
-; P9BE-AIX32-NEXT: lxsibzx 3, 3, 4
-; P9BE-AIX32-NEXT: lwz 3, L..C4(2) # %const.1
-; P9BE-AIX32-NEXT: lxsibzx 0, 6, 7
-; P9BE-AIX32-NEXT: lwz 6, L..C5(2) # %const.0
-; P9BE-AIX32-NEXT: lxv 1, 0(6)
-; P9BE-AIX32-NEXT: li 6, 0
-; P9BE-AIX32-NEXT: mtvsrwz 2, 6
-; P9BE-AIX32-NEXT: xxperm 0, 2, 1
-; P9BE-AIX32-NEXT: xxperm 3, 2, 1
-; P9BE-AIX32-NEXT: vspltb 2, 2, 7
-; P9BE-AIX32-NEXT: vmrghh 3, 3, 2
-; P9BE-AIX32-NEXT: xxspltw 1, 2, 0
-; P9BE-AIX32-NEXT: xxmrghw 3, 3, 0
+; P9BE-AIX32-NEXT: lxsibzx 2, 3, 4
+; P9BE-AIX32-NEXT: lwz 3, L..C4(2) # %const.0
; P9BE-AIX32-NEXT: lxv 0, 0(3)
-; P9BE-AIX32-NEXT: xxperm 3, 1, 0
+; P9BE-AIX32-NEXT: li 3, 0
+; P9BE-AIX32-NEXT: mtvsrwz 3, 3
+; P9BE-AIX32-NEXT: li 3, 8
+; P9BE-AIX32-NEXT: vspltb 4, 3, 7
+; P9BE-AIX32-NEXT: xxperm 2, 3, 0
+; P9BE-AIX32-NEXT: lxsibzx 0, 6, 3
+; P9BE-AIX32-NEXT: lwz 3, L..C5(2) # %const.1
+; P9BE-AIX32-NEXT: vmrghh 2, 4, 2
+; P9BE-AIX32-NEXT: lxv 1, 0(3)
+; P9BE-AIX32-NEXT: lwz 3, L..C6(2) # %const.2
+; P9BE-AIX32-NEXT: xxmrghw 2, 4, 2
+; P9BE-AIX32-NEXT: xxperm 3, 0, 1
+; P9BE-AIX32-NEXT: lxv 0, 0(3)
+; P9BE-AIX32-NEXT: vmrghh 3, 4, 3
+; P9BE-AIX32-NEXT: xxmrghw 3, 3, 4
+; P9BE-AIX32-NEXT: xxperm 3, 2, 0
; P9BE-AIX32-NEXT: xxspltw 2, 3, 1
; P9BE-AIX32-NEXT: vadduwm 2, 3, 2
; P9BE-AIX32-NEXT: stxv 2, -16(1)
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-07.ll b/llvm/test/CodeGen/SystemZ/vec-mul-07.ll
index 73c7a8dec5dfc..ca9e8412d95bd 100644
--- a/llvm/test/CodeGen/SystemZ/vec-mul-07.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-07.ll
@@ -7,7 +7,11 @@
define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmleb %v24, %v24, %v26
+; CHECK-NEXT: larl %r1, .LCPI0_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v1, %v24, %v0, %v0
+; CHECK-NEXT: vperm %v0, %v26, %v0, %v0
+; CHECK-NEXT: vmlhw %v24, %v1, %v0
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%zext1 = zext <8 x i8> %shuf1 to <8 x i16>
@@ -21,7 +25,12 @@ define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2) {
define <8 x i16> @f2(<16 x i8> %val1, <16 x i8> %val2) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmlob %v24, %v24, %v26
+; CHECK-NEXT: larl %r1, .LCPI1_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v2, %v24, %v1, %v0
+; CHECK-NEXT: vperm %v0, %v26, %v1, %v0
+; CHECK-NEXT: vmlhw %v24, %v2, %v0
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%zext1 = zext <8 x i8> %shuf1 to <8 x i16>
@@ -63,7 +72,11 @@ define <8 x i16> @f4(<16 x i8> %val1, <16 x i8> %val2) {
define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmleh %v24, %v24, %v26
+; CHECK-NEXT: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v1, %v24, %v0, %v0
+; CHECK-NEXT: vperm %v0, %v26, %v0, %v0
+; CHECK-NEXT: vmlf %v24, %v1, %v0
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%zext1 = zext <4 x i16> %shuf1 to <4 x i32>
@@ -77,7 +90,12 @@ define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2) {
define <4 x i32> @f6(<8 x i16> %val1, <8 x i16> %val2) {
; CHECK-LABEL: f6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmloh %v24, %v24, %v26
+; CHECK-NEXT: larl %r1, .LCPI5_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v2, %v24, %v1, %v0
+; CHECK-NEXT: vperm %v0, %v26, %v1, %v0
+; CHECK-NEXT: vmlf %v24, %v2, %v0
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%zext1 = zext <4 x i16> %shuf1 to <4 x i32>
@@ -119,7 +137,7 @@ define <4 x i32> @f8(<8 x i16> %val1, <8 x i16> %val2) {
define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmlef %v24, %v24, %v26
+; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
%zext1 = zext <2 x i32> %shuf1 to <2 x i64>
@@ -133,7 +151,7 @@ define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2) {
define <2 x i64> @f10(<4 x i32> %val1, <4 x i32> %val2) {
; CHECK-LABEL: f10:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmlof %v24, %v24, %v26
+; CHECK-NEXT: vgbm %v24, 0
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
%zext1 = zext <2 x i32> %shuf1 to <2 x i64>
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-09.ll b/llvm/test/CodeGen/SystemZ/vec-mul-09.ll
index def57ca03bb0c..e1e52bed7a143 100644
--- a/llvm/test/CodeGen/SystemZ/vec-mul-09.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-09.ll
@@ -7,7 +7,11 @@
define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmaleb %v24, %v24, %v26, %v28
+; CHECK-NEXT: larl %r1, .LCPI0_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v1, %v24, %v0, %v0
+; CHECK-NEXT: vperm %v0, %v26, %v0, %v0
+; CHECK-NEXT: vmalhw %v24, %v1, %v0, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%zext1 = zext <8 x i8> %shuf1 to <8 x i16>
@@ -22,7 +26,12 @@ define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
define <8 x i16> @f2(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmalob %v24, %v24, %v26, %v28
+; CHECK-NEXT: larl %r1, .LCPI1_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v2, %v24, %v1, %v0
+; CHECK-NEXT: vperm %v0, %v26, %v1, %v0
+; CHECK-NEXT: vmalhw %v24, %v2, %v0, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%zext1 = zext <8 x i8> %shuf1 to <8 x i16>
@@ -67,7 +76,11 @@ define <8 x i16> @f4(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmaleh %v24, %v24, %v26, %v28
+; CHECK-NEXT: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v1, %v24, %v0, %v0
+; CHECK-NEXT: vperm %v0, %v26, %v0, %v0
+; CHECK-NEXT: vmalf %v24, %v1, %v0, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%zext1 = zext <4 x i16> %shuf1 to <4 x i32>
@@ -82,7 +95,12 @@ define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
define <4 x i32> @f6(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
; CHECK-LABEL: f6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmaloh %v24, %v24, %v26, %v28
+; CHECK-NEXT: larl %r1, .LCPI5_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vgbm %v1, 0
+; CHECK-NEXT: vperm %v2, %v24, %v1, %v0
+; CHECK-NEXT: vperm %v0, %v26, %v1, %v0
+; CHECK-NEXT: vmalf %v24, %v2, %v0, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%zext1 = zext <4 x i16> %shuf1 to <4 x i32>
@@ -127,7 +145,7 @@ define <4 x i32> @f8(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmalef %v24, %v24, %v26, %v28
+; CHECK-NEXT: vlr %v24, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
%zext1 = zext <2 x i32> %shuf1 to <2 x i64>
@@ -142,7 +160,7 @@ define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
define <2 x i64> @f10(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
; CHECK-LABEL: f10:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmalof %v24, %v24, %v26, %v28
+; CHECK-NEXT: vlr %v24, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
%zext1 = zext <2 x i32> %shuf1 to <2 x i64>
diff --git a/llvm/test/CodeGen/WebAssembly/interleave.ll b/llvm/test/CodeGen/WebAssembly/interleave.ll
index c20b5e42c4850..eada6cc8c6813 100644
--- a/llvm/test/CodeGen/WebAssembly/interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/interleave.ll
@@ -17,13 +17,12 @@ define hidden void @accumulate8x2(ptr dead_on_unwind noalias writable sret(%stru
; CHECK-LABEL: accumulate8x2:
; CHECK: loop
; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: local.tee 10
+; CHECK: i8x16.shuffle 1, 17, 18, 19, 3, 21, 22, 23, 5, 25, 26, 27, 7, 29, 30, 31
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: local.get 10
+; CHECK: i8x16.shuffle 0, 17, 18, 19, 2, 21, 22, 23, 4, 25, 26, 27, 6, 29, 30, 31
; CHECK: i32x4.add
%4 = load i32, ptr %0, align 4
%5 = icmp eq i32 %2, 0
@@ -65,21 +64,18 @@ define hidden void @accumulate8x4(ptr dead_on_unwind noalias writable sret(%stru
; CHECK-LABEL: accumulate8x4
; CHECK: loop
; CHECK: v128.load
-; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: local.tee 14
+; CHECK: i8x16.shuffle 3, 17, 18, 19, 7, 21, 22, 23, 11, 25, 26, 27, 15, 29, 30, 31
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: local.get 14
+; CHECK: i8x16.shuffle 2, 17, 18, 19, 6, 21, 22, 23, 10, 25, 26, 27, 14, 29, 30, 31
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: local.get 14
+; CHECK: i8x16.shuffle 1, 17, 18, 19, 5, 21, 22, 23, 9, 25, 26, 27, 13, 29, 30, 31
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: local.get 14
+; CHECK: i8x16.shuffle 0, 17, 18, 19, 4, 21, 22, 23, 8, 25, 26, 27, 12, 29, 30, 31
; CHECK: i32x4.add
%4 = load i32, ptr %0, align 4
%5 = icmp eq i32 %2, 0
@@ -137,11 +133,12 @@ define hidden void @accumulate16x2(ptr dead_on_unwind noalias writable sret(%str
; CHECK-LABEL: accumulate16x2
; CHECK: loop
; CHECK: v128.load
-; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: local.tee 10
+; CHECK: i8x16.shuffle 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: local.get 10
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
; CHECK: i32x4.add
%4 = load i32, ptr %0, align 4
%5 = icmp eq i32 %2, 0
@@ -184,17 +181,23 @@ define hidden void @accumulate16x4(ptr dead_on_unwind noalias writable sret(%str
; CHECK: loop
; CHECK: v128.load 0:p2align=1
; CHECK: v128.load 16:p2align=1
-; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i8x16.shuffle 6, 7, 0, 1, 14, 15, 0, 1, 22, 23, 0, 1, 30, 31, 0, 1
+; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: local.tee 15
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i8x16.shuffle 4, 5, 0, 1, 12, 13, 0, 1, 20, 21, 0, 1, 28, 29, 0, 1
+; CHECK: local.get 15
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i8x16.shuffle 2, 3, 0, 1, 10, 11, 0, 1, 18, 19, 0, 1, 26, 27, 0, 1
+; CHECK: local.get 15
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 8, 9, 0, 1, 16, 17, 0, 1, 24, 25, 0, 1
+; CHECK: local.get 15
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
; CHECK: i32x4.add
%4 = load i32, ptr %0, align 4
%5 = icmp eq i32 %2, 0
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 4b0e5441b4abf..ea037c1173ae3 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -799,7 +799,9 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) {
;
; SSE41-LABEL: PR46586:
; SSE41: # %bb.0:
-; SSE41-NEXT: movzbl 3(%rdi), %eax
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm1
+; SSE41-NEXT: pextrd $3, %xmm1, %eax
; SSE41-NEXT: extractps $3, %xmm0, %ecx
; SSE41-NEXT: xorl %edx, %edx
; SSE41-NEXT: divl %ecx
@@ -808,7 +810,9 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) {
;
; AVX-LABEL: PR46586:
; AVX: # %bb.0:
-; AVX-NEXT: movzbl 3(%rdi), %eax
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $12, 3(%rdi), %xmm1, %xmm1
+; AVX-NEXT: vpextrd $3, %xmm1, %eax
; AVX-NEXT: vextractps $3, %xmm0, %ecx
; AVX-NEXT: xorl %edx, %edx
; AVX-NEXT: divl %ecx
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index ec442c185706c..b0b148b0cd50a 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -382,51 +382,14 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
}
define <4 x double> @PR34175(ptr %p) {
-; AVX512F-LABEL: PR34175:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: PR34175:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: PR34175:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: PR34175:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512BWVL-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: PR34175:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512VBMI-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512VBMI-NEXT: retq
+; AVX512-LABEL: PR34175:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
+; AVX512-NEXT: vpermd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512-NEXT: retq
%v = load <32 x i16>, ptr %p, align 2
%shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
%tofp = uitofp <4 x i16> %shuf to <4 x double>
>From ff669c677e307ca6a57066008bbb073211aced5c Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Fri, 25 Apr 2025 15:54:47 +0530
Subject: [PATCH 3/6] Fix for the test case failure
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 +-
llvm/test/CodeGen/PowerPC/pre-inc-disable.ll | 104 ++++---
llvm/test/CodeGen/X86/WidenBuildVector.ll | 258 ------------------
llvm/test/CodeGen/X86/buildvec-insertvec.ll | 8 +-
.../CodeGen/X86/buildvec-widen-dotproduct.ll | 231 ++++++----------
5 files changed, 149 insertions(+), 467 deletions(-)
delete mode 100644 llvm/test/CodeGen/X86/WidenBuildVector.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b2ca646679838..b90182c6fdc3e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14299,6 +14299,9 @@ static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) {
// This is a special case for the 128-bit vector types. Intention is to remove
// the zext and replace it with a bitcast the wider type. While lowering
// the bitcast is removed and extra commutation due to zext is avoided.
+// For example:
+// zext v4i16 ( v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 ( v8i8
+// build_vector (x, 0, y, 0, z, 0, w, 0))
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
assert(Extend->getOpcode() == ISD::ZERO_EXTEND && "Expected zero extend.");
@@ -14309,6 +14312,13 @@ static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
return SDValue();
+ if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
+ // If the build vector has undef elements, we cannot widen it.
+ // The widening would create a vector with more undef elements, which
+ // is not valid.
+ return SDValue();
+ }
+
SDLoc dl(BV);
EVT VT = BV.getValueType();
EVT EltVT = BV.getOperand(0).getValueType();
@@ -14344,11 +14354,6 @@ static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
// new build vector to the type of the zext.
SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
- LLVM_DEBUG(
- dbgs() << DAG.getMachineFunction().getFunction().getName()
- << " - Widening buildvector and replace zext with bitcast\n";
- BV.dump(); Extend->dump(); dbgs() << " to \n";
- NewBV.getNode()->dump(); NewBVBitcast->dump(););
return NewBV;
}
}
diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
index d668868d41aa0..4435484ae0b94 100644
--- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
+++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
@@ -327,9 +327,9 @@ define void @test16(ptr nocapture readonly %sums, i32 signext %delta, i32 signex
; P9BE-AIX32-NEXT: sth 3, -32(1)
; P9BE-AIX32-NEXT: lwz 3, L..C3(2) # %const.0
; P9BE-AIX32-NEXT: lxv 3, -32(1)
-; P9BE-AIX32-NEXT: vmrghh 4, 4, 2
+; P9BE-AIX32-NEXT: vmrghh 4, 2, 4
; P9BE-AIX32-NEXT: lxv 0, 0(3)
-; P9BE-AIX32-NEXT: vmrghh 3, 3, 2
+; P9BE-AIX32-NEXT: vmrghh 3, 2, 3
; P9BE-AIX32-NEXT: vsplth 2, 2, 0
; P9BE-AIX32-NEXT: xxmrghw 2, 2, 4
; P9BE-AIX32-NEXT: xxperm 3, 2, 0
@@ -403,29 +403,25 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext
; P9BE-LABEL: test8:
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: add 6, 3, 4
-; P9BE-NEXT: lxsibzx 2, 3, 4
-; P9BE-NEXT: addis 3, 2, .LCPI3_0 at toc@ha
-; P9BE-NEXT: addi 3, 3, .LCPI3_0 at toc@l
-; P9BE-NEXT: lxv 0, 0(3)
-; P9BE-NEXT: li 3, 0
-; P9BE-NEXT: mtvsrwz 3, 3
-; P9BE-NEXT: li 3, 8
-; P9BE-NEXT: vspltb 4, 3, 7
-; P9BE-NEXT: xxperm 2, 3, 0
-; P9BE-NEXT: lxsibzx 0, 6, 3
+; P9BE-NEXT: li 7, 8
+; P9BE-NEXT: lxsibzx 3, 3, 4
; P9BE-NEXT: addis 3, 2, .LCPI3_1 at toc@ha
+; P9BE-NEXT: lxsibzx 0, 6, 7
+; P9BE-NEXT: addis 6, 2, .LCPI3_0 at toc@ha
; P9BE-NEXT: addi 3, 3, .LCPI3_1 at toc@l
-; P9BE-NEXT: vmrghh 2, 4, 2
-; P9BE-NEXT: lxv 1, 0(3)
-; P9BE-NEXT: addis 3, 2, .LCPI3_2 at toc@ha
-; P9BE-NEXT: addi 3, 3, .LCPI3_2 at toc@l
-; P9BE-NEXT: xxmrghw 2, 4, 2
-; P9BE-NEXT: xxperm 3, 0, 1
+; P9BE-NEXT: addi 6, 6, .LCPI3_0 at toc@l
+; P9BE-NEXT: lxv 1, 0(6)
+; P9BE-NEXT: li 6, 0
+; P9BE-NEXT: mtvsrwz 2, 6
+; P9BE-NEXT: xxperm 0, 2, 1
+; P9BE-NEXT: xxperm 3, 2, 1
+; P9BE-NEXT: vspltb 2, 2, 7
+; P9BE-NEXT: vmrghh 3, 3, 2
+; P9BE-NEXT: xxspltw 1, 2, 0
+; P9BE-NEXT: xxmrghw 3, 3, 0
; P9BE-NEXT: lxv 0, 0(3)
; P9BE-NEXT: li 3, 0
-; P9BE-NEXT: vmrghh 3, 4, 3
-; P9BE-NEXT: xxmrghw 3, 3, 4
-; P9BE-NEXT: xxperm 3, 2, 0
+; P9BE-NEXT: xxperm 3, 1, 0
; P9BE-NEXT: xxspltw 2, 3, 1
; P9BE-NEXT: vadduwm 2, 3, 2
; P9BE-NEXT: vextuwlx 3, 3, 2
@@ -436,26 +432,23 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext
; P9BE-AIX-LABEL: test8:
; P9BE-AIX: # %bb.0: # %entry
; P9BE-AIX-NEXT: add 6, 3, 4
-; P9BE-AIX-NEXT: lxsibzx 2, 3, 4
-; P9BE-AIX-NEXT: ld 3, L..C5(2) # %const.0
-; P9BE-AIX-NEXT: lxv 0, 0(3)
-; P9BE-AIX-NEXT: li 3, 0
-; P9BE-AIX-NEXT: mtvsrwz 3, 3
-; P9BE-AIX-NEXT: li 3, 8
-; P9BE-AIX-NEXT: vspltb 4, 3, 7
-; P9BE-AIX-NEXT: xxperm 2, 3, 0
-; P9BE-AIX-NEXT: lxsibzx 0, 6, 3
-; P9BE-AIX-NEXT: ld 3, L..C6(2) # %const.1
-; P9BE-AIX-NEXT: vmrghh 2, 4, 2
-; P9BE-AIX-NEXT: lxv 1, 0(3)
-; P9BE-AIX-NEXT: ld 3, L..C7(2) # %const.2
-; P9BE-AIX-NEXT: xxmrghw 2, 4, 2
-; P9BE-AIX-NEXT: xxperm 3, 0, 1
+; P9BE-AIX-NEXT: li 7, 8
+; P9BE-AIX-NEXT: lxsibzx 3, 3, 4
+; P9BE-AIX-NEXT: ld 3, L..C5(2) # %const.1
+; P9BE-AIX-NEXT: lxsibzx 0, 6, 7
+; P9BE-AIX-NEXT: ld 6, L..C6(2) # %const.0
+; P9BE-AIX-NEXT: lxv 1, 0(6)
+; P9BE-AIX-NEXT: li 6, 0
+; P9BE-AIX-NEXT: mtvsrwz 2, 6
+; P9BE-AIX-NEXT: xxperm 0, 2, 1
+; P9BE-AIX-NEXT: xxperm 3, 2, 1
+; P9BE-AIX-NEXT: vspltb 2, 2, 7
+; P9BE-AIX-NEXT: vmrghh 3, 3, 2
+; P9BE-AIX-NEXT: xxspltw 1, 2, 0
+; P9BE-AIX-NEXT: xxmrghw 3, 3, 0
; P9BE-AIX-NEXT: lxv 0, 0(3)
; P9BE-AIX-NEXT: li 3, 0
-; P9BE-AIX-NEXT: vmrghh 3, 4, 3
-; P9BE-AIX-NEXT: xxmrghw 3, 3, 4
-; P9BE-AIX-NEXT: xxperm 3, 2, 0
+; P9BE-AIX-NEXT: xxperm 3, 1, 0
; P9BE-AIX-NEXT: xxspltw 2, 3, 1
; P9BE-AIX-NEXT: vadduwm 2, 3, 2
; P9BE-AIX-NEXT: vextuwlx 3, 3, 2
@@ -466,25 +459,22 @@ define void @test8(ptr nocapture readonly %sums, i32 signext %delta, i32 signext
; P9BE-AIX32-LABEL: test8:
; P9BE-AIX32: # %bb.0: # %entry
; P9BE-AIX32-NEXT: add 6, 3, 4
-; P9BE-AIX32-NEXT: lxsibzx 2, 3, 4
-; P9BE-AIX32-NEXT: lwz 3, L..C4(2) # %const.0
-; P9BE-AIX32-NEXT: lxv 0, 0(3)
-; P9BE-AIX32-NEXT: li 3, 0
-; P9BE-AIX32-NEXT: mtvsrwz 3, 3
-; P9BE-AIX32-NEXT: li 3, 8
-; P9BE-AIX32-NEXT: vspltb 4, 3, 7
-; P9BE-AIX32-NEXT: xxperm 2, 3, 0
-; P9BE-AIX32-NEXT: lxsibzx 0, 6, 3
-; P9BE-AIX32-NEXT: lwz 3, L..C5(2) # %const.1
-; P9BE-AIX32-NEXT: vmrghh 2, 4, 2
-; P9BE-AIX32-NEXT: lxv 1, 0(3)
-; P9BE-AIX32-NEXT: lwz 3, L..C6(2) # %const.2
-; P9BE-AIX32-NEXT: xxmrghw 2, 4, 2
-; P9BE-AIX32-NEXT: xxperm 3, 0, 1
+; P9BE-AIX32-NEXT: li 7, 8
+; P9BE-AIX32-NEXT: lxsibzx 3, 3, 4
+; P9BE-AIX32-NEXT: lwz 3, L..C4(2) # %const.1
+; P9BE-AIX32-NEXT: lxsibzx 0, 6, 7
+; P9BE-AIX32-NEXT: lwz 6, L..C5(2) # %const.0
+; P9BE-AIX32-NEXT: lxv 1, 0(6)
+; P9BE-AIX32-NEXT: li 6, 0
+; P9BE-AIX32-NEXT: mtvsrwz 2, 6
+; P9BE-AIX32-NEXT: xxperm 0, 2, 1
+; P9BE-AIX32-NEXT: xxperm 3, 2, 1
+; P9BE-AIX32-NEXT: vspltb 2, 2, 7
+; P9BE-AIX32-NEXT: vmrghh 3, 3, 2
+; P9BE-AIX32-NEXT: xxspltw 1, 2, 0
+; P9BE-AIX32-NEXT: xxmrghw 3, 3, 0
; P9BE-AIX32-NEXT: lxv 0, 0(3)
-; P9BE-AIX32-NEXT: vmrghh 3, 4, 3
-; P9BE-AIX32-NEXT: xxmrghw 3, 3, 4
-; P9BE-AIX32-NEXT: xxperm 3, 2, 0
+; P9BE-AIX32-NEXT: xxperm 3, 1, 0
; P9BE-AIX32-NEXT: xxspltw 2, 3, 1
; P9BE-AIX32-NEXT: vadduwm 2, 3, 2
; P9BE-AIX32-NEXT: stxv 2, -16(1)
diff --git a/llvm/test/CodeGen/X86/WidenBuildVector.ll b/llvm/test/CodeGen/X86/WidenBuildVector.ll
deleted file mode 100644
index d2924d016a1bf..0000000000000
--- a/llvm/test/CodeGen/X86/WidenBuildVector.ll
+++ /dev/null
@@ -1,258 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mcpu=znver5 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
-define dso_local i32 @foov8i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
-; CHECK-LABEL: foov8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movzbl (%rdi), %eax
-; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx
-; CHECK-NEXT: leaq (%rsi,%rsi,4), %r8
-; CHECK-NEXT: leaq (,%rsi,8), %r9
-; CHECK-NEXT: subq %rsi, %r9
-; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0
-; CHECK-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: retq
-entry:
- %var0 = load i8, ptr %a, align 1
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
- %var1 = load i8, ptr %arrayidx.1, align 1
- %mul.2 = shl nsw i64 %a_stride, 1
- %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
- %var2 = load i8, ptr %arrayidx.2, align 1
- %mul.3 = mul nsw i64 %a_stride, 3
- %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
- %var3 = load i8, ptr %arrayidx.3, align 1
- %mul.4 = shl nsw i64 %a_stride, 2
- %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 %mul.4
- %var4 = load i8, ptr %arrayidx.4, align 1
- %mul.5 = mul nsw i64 %a_stride, 5
- %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 %mul.5
- %var5 = load i8, ptr %arrayidx.5, align 1
- %mul.6 = mul nsw i64 %a_stride, 6
- %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 %mul.6
- %var6 = load i8, ptr %arrayidx.6, align 1
- %mul.7 = mul nsw i64 %a_stride, 7
- %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 %mul.7
- %var7 = load i8, ptr %arrayidx.7, align 1
- %var8 = insertelement <8 x i8> poison, i8 %var0, i64 0
- %var9 = insertelement <8 x i8> %var8, i8 %var1, i64 1
- %var10 = insertelement <8 x i8> %var9, i8 %var2, i64 2
- %var11 = insertelement <8 x i8> %var10, i8 %var3, i64 3
- %var12 = insertelement <8 x i8> %var11, i8 %var4, i64 4
- %var13 = insertelement <8 x i8> %var12, i8 %var5, i64 5
- %var14 = insertelement <8 x i8> %var13, i8 %var6, i64 6
- %var15 = insertelement <8 x i8> %var14, i8 %var7, i64 7
- %var16 = zext <8 x i8> %var15 to <8 x i32>
- %var17 = load <8 x i16>, ptr %b, align 2
- %var18 = sext <8 x i16> %var17 to <8 x i32>
- %var19 = mul nsw <8 x i32> %var18, %var16
- %var20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %var19)
- ret i32 %var20
-}
-
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
-define dso_local i32 @foov4i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
-; CHECK-LABEL: foov4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movzbl (%rdi), %eax
-; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0
-; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: retq
-entry:
- %var0 = load i8, ptr %a, align 1
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
- %var1 = load i8, ptr %arrayidx.1, align 1
- %mul.2 = shl nsw i64 %a_stride, 1
- %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
- %var2 = load i8, ptr %arrayidx.2, align 1
- %mul.3 = mul nsw i64 %a_stride, 3
- %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
- %var3 = load i8, ptr %arrayidx.3, align 1
- %var8 = insertelement <4 x i8> poison, i8 %var0, i64 0
- %var9 = insertelement <4 x i8> %var8, i8 %var1, i64 1
- %var10 = insertelement <4 x i8> %var9, i8 %var2, i64 2
- %var11 = insertelement <4 x i8> %var10, i8 %var3, i64 3
- %var16 = zext <4 x i8> %var11 to <4 x i32>
- %var17 = load <4 x i16>, ptr %b, align 2
- %var18 = sext <4 x i16> %var17 to <4 x i32>
- %var19 = mul nsw <4 x i32> %var18, %var16
- %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
- ret i32 %var20
-}
-
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
-define dso_local i32 @foov2i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
-; CHECK-LABEL: foov2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzbl (%rdi), %eax
-; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: retq
- %var0 = load i8, ptr %a, align 1
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
- %var1 = load i8, ptr %arrayidx.1, align 1
- %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
- %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
- %var16 = zext <2 x i8> %var9 to <2 x i32>
- %var17 = load <2 x i16>, ptr %b, align 2
- %var18 = sext <2 x i16> %var17 to <2 x i32>
- %var19 = mul nsw <2 x i32> %var18, %var16
- %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
- ret i32 %var20
-}
-
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
-define dso_local i64 @foov2i8_v2i64(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
-; CHECK-LABEL: foov2i8_v2i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzbl (%rdi), %eax
-; CHECK-NEXT: vpmovsxbq (%rdx), %xmm1
-; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0
-; CHECK-NEXT: vpmuldq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovq %xmm0, %rax
-; CHECK-NEXT: retq
- %var0 = load i8, ptr %a, align 1
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
- %var1 = load i8, ptr %arrayidx.1, align 1
- %var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
- %var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
- %var16 = zext <2 x i8> %var9 to <2 x i64>
- %var17 = load <2 x i8>, ptr %b, align 2
- %var18 = sext <2 x i8> %var17 to <2 x i64>
- %var19 = mul nsw <2 x i64> %var18, %var16
- %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
- ret i64 %var20
-}
-
-
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
-define dso_local i32 @foov4i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
-; CHECK-LABEL: foov4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movzwl (%rdi), %eax
-; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx
-; CHECK-NEXT: vpmovsxwd (%rdx), %xmm1
-; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0
-; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: retq
-entry:
- %var0 = load i16, ptr %a, align 1
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
- %var1 = load i16, ptr %arrayidx.1, align 1
- %mul.2 = shl nsw i64 %a_stride, 1
- %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
- %var2 = load i16, ptr %arrayidx.2, align 1
- %mul.3 = mul nsw i64 %a_stride, 3
- %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
- %var3 = load i16, ptr %arrayidx.3, align 1
- %var8 = insertelement <4 x i16> poison, i16 %var0, i64 0
- %var9 = insertelement <4 x i16> %var8, i16 %var1, i64 1
- %var10 = insertelement <4 x i16> %var9, i16 %var2, i64 2
- %var11 = insertelement <4 x i16> %var10, i16 %var3, i64 3
- %var16 = zext <4 x i16> %var11 to <4 x i32>
- %var17 = load <4 x i16>, ptr %b, align 2
- %var18 = sext <4 x i16> %var17 to <4 x i32>
- %var19 = mul nsw <4 x i32> %var18, %var16
- %var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
- ret i32 %var20
-}
-
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
-define dso_local i32 @foov2i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
-; CHECK-LABEL: foov2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl (%rdi), %eax
-; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; CHECK-NEXT: vpmovsxwd %xmm1, %xmm1
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: retq
- %var0 = load i16, ptr %a, align 1
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
- %var1 = load i16, ptr %arrayidx.1, align 1
- %var8 = insertelement <2 x i16> poison, i16 %var0, i64 0
- %var9 = insertelement <2 x i16> %var8, i16 %var1, i64 1
- %var16 = zext <2 x i16> %var9 to <2 x i32>
- %var17 = load <2 x i16>, ptr %b, align 2
- %var18 = sext <2 x i16> %var17 to <2 x i32>
- %var19 = mul nsw <2 x i32> %var18, %var16
- %var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
- ret i32 %var20
-}
-
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
-define dso_local i64 @foov2i32(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
-; CHECK-LABEL: foov2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-NEXT: vpmovsxdq (%rdx), %xmm1
-; CHECK-NEXT: vpmullq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovq %xmm0, %rax
-; CHECK-NEXT: retq
- %var0 = load i32, ptr %a, align 1
- %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
- %var1 = load i32, ptr %arrayidx.1, align 1
- %var8 = insertelement <2 x i32> poison, i32 %var0, i64 0
- %var9 = insertelement <2 x i32> %var8, i32 %var1, i64 1
- %var16 = zext <2 x i32> %var9 to <2 x i64>
- %var17 = load <2 x i32>, ptr %b, align 2
- %var18 = sext <2 x i32> %var17 to <2 x i64>
- %var19 = mul nsw <2 x i64> %var18, %var16
- %var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
- ret i64 %var20
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #1
-declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1
-declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) #1
-declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) #1
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index ea037c1173ae3..4b0e5441b4abf 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -799,9 +799,7 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) {
;
; SSE41-LABEL: PR46586:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm1
-; SSE41-NEXT: pextrd $3, %xmm1, %eax
+; SSE41-NEXT: movzbl 3(%rdi), %eax
; SSE41-NEXT: extractps $3, %xmm0, %ecx
; SSE41-NEXT: xorl %edx, %edx
; SSE41-NEXT: divl %ecx
@@ -810,9 +808,7 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) {
;
; AVX-LABEL: PR46586:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpinsrb $12, 3(%rdi), %xmm1, %xmm1
-; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: movzbl 3(%rdi), %eax
; AVX-NEXT: vextractps $3, %xmm0, %ecx
; AVX-NEXT: xorl %edx, %edx
; AVX-NEXT: divl %ecx
diff --git a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
index 8c85dfa09fd2d..345014edd0e9d 100644
--- a/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
+++ b/llvm/test/CodeGen/X86/buildvec-widen-dotproduct.ll
@@ -31,88 +31,62 @@ define i32 @dot_ext_v8i8_v8i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-NEXT: pinsrw $6, %r9d, %xmm0
; SSE2-NEXT: pinsrw $7, %esi, %xmm0
; SSE2-NEXT: movdqu (%rdx), %xmm1
-; SSE2-NEXT: pmaddwd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: pmaddwd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v8i8_v8i32:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,4), %rax
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
-; SSE4-NEXT: leaq (,%rsi,8), %r8
-; SSE4-NEXT: movzbl (%rdi), %r9d
-; SSE4-NEXT: movd %r9d, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrb $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrb $3, (%rdi,%rcx), %xmm0
-; SSE4-NEXT: pinsrb $4, (%rdi,%rsi,4), %xmm0
-; SSE4-NEXT: pinsrb $5, (%rdi,%rax), %xmm0
-; SSE4-NEXT: pinsrb $6, (%rdi,%rcx,2), %xmm0
-; SSE4-NEXT: subq %rsi, %r8
-; SSE4-NEXT: pinsrb $7, (%rdi,%r8), %xmm0
+; SSE4-NEXT: movzbl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,4), %rcx
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %r8
+; SSE4-NEXT: leaq (,%rsi,8), %r9
+; SSE4-NEXT: subq %rsi, %r9
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrb $2, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrb $4, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrb $6, (%rdi,%r8), %xmm0
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,4), %xmm0
+; SSE4-NEXT: pinsrb $10, (%rdi,%rcx), %xmm0
+; SSE4-NEXT: pinsrb $12, (%rdi,%r8,2), %xmm0
+; SSE4-NEXT: pinsrb $14, (%rdi,%r9), %xmm0
; SSE4-NEXT: movdqu (%rdx), %xmm1
-; SSE4-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE4-NEXT: pmaddwd %xmm1, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE4-NEXT: paddd %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT: pmaddwd %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddd %xmm1, %xmm0
-; SSE4-NEXT: movd %xmm0, %eax
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT: paddd %xmm0, %xmm1
+; SSE4-NEXT: movd %xmm1, %eax
; SSE4-NEXT: retq
;
-; AVX2-LABEL: dot_ext_v8i8_v8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: leaq (%rsi,%rsi,4), %rax
-; AVX2-NEXT: leaq (%rsi,%rsi,2), %rcx
-; AVX2-NEXT: leaq (,%rsi,8), %r8
-; AVX2-NEXT: subq %rsi, %r8
-; AVX2-NEXT: movzbl (%rdi), %r9d
-; AVX2-NEXT: vmovd %r9d, %xmm0
-; AVX2-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: dot_ext_v8i8_v8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: leaq (%rsi,%rsi,4), %rax
-; AVX512-NEXT: leaq (%rsi,%rsi,2), %rcx
-; AVX512-NEXT: leaq (,%rsi,8), %r8
-; AVX512-NEXT: movzbl (%rdi), %r9d
-; AVX512-NEXT: vmovd %r9d, %xmm0
-; AVX512-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $3, (%rdi,%rcx), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $4, (%rdi,%rsi,4), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $5, (%rdi,%rax), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $6, (%rdi,%rcx,2), %xmm0, %xmm0
-; AVX512-NEXT: subq %rsi, %r8
-; AVX512-NEXT: vpinsrb $7, (%rdi,%r8), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: dot_ext_v8i8_v8i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
+; AVX-NEXT: leaq (%rsi,%rsi,4), %r8
+; AVX-NEXT: leaq (,%rsi,8), %r9
+; AVX-NEXT: subq %rsi, %r9
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
entry:
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
@@ -175,14 +149,13 @@ define i32 @dot_ext_v4i8_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
;
; SSE4-LABEL: dot_ext_v4i8_v4i32:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rax
-; SSE4-NEXT: movzbl (%rdi), %ecx
-; SSE4-NEXT: movd %ecx, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrb $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrb $3, (%rdi,%rax), %xmm0
+; SSE4-NEXT: movzbl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrb $4, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrb $12, (%rdi,%rcx), %xmm0
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: pmaddwd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddd %xmm1, %xmm0
@@ -194,12 +167,11 @@ define i32 @dot_ext_v4i8_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; AVX-LABEL: dot_ext_v4i8_v4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX-NEXT: vpinsrb $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -311,8 +283,7 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE4: # %bb.0:
; SSE4-NEXT: movzbl (%rdi), %eax
; SSE4-NEXT: movd %eax, %xmm0
-; SSE4-NEXT: pinsrb $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE4-NEXT: pinsrb $8, (%rdi,%rsi), %xmm0
; SSE4-NEXT: pmovsxbq (%rdx), %xmm1
; SSE4-NEXT: pmuldq %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
@@ -324,8 +295,7 @@ define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovsxbq (%rdx), %xmm1
; AVX-NEXT: vpmuldq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -374,14 +344,13 @@ define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
;
; SSE4-LABEL: dot_ext_v4i16_v4i32:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: leaq (%rsi,%rsi,2), %rax
-; SSE4-NEXT: movzwl (%rdi), %ecx
-; SSE4-NEXT: movd %ecx, %xmm0
-; SSE4-NEXT: pinsrw $1, (%rdi,%rsi), %xmm0
-; SSE4-NEXT: pinsrw $2, (%rdi,%rsi,2), %xmm0
-; SSE4-NEXT: pinsrw $3, (%rdi,%rax), %xmm0
+; SSE4-NEXT: movzwl (%rdi), %eax
+; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0
+; SSE4-NEXT: pinsrw $4, (%rdi,%rsi,2), %xmm0
+; SSE4-NEXT: pinsrw $6, (%rdi,%rcx), %xmm0
; SSE4-NEXT: pmovsxwd (%rdx), %xmm1
-; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: pmulld %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddd %xmm1, %xmm0
@@ -390,41 +359,22 @@ define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE4-NEXT: movd %xmm1, %eax
; SSE4-NEXT: retq
;
-; AVX2-LABEL: dot_ext_v4i16_v4i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX2-NEXT: movzwl (%rdi), %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxwd (%rdx), %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: dot_ext_v4i16_v4i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: leaq (%rsi,%rsi,2), %rax
-; AVX512-NEXT: movzwl (%rdi), %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $2, (%rdi,%rsi,2), %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $3, (%rdi,%rax), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpmovsxwd (%rdx), %xmm1
-; AVX512-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: dot_ext_v4i16_v4i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpmovsxwd (%rdx), %xmm1
+; AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
entry:
%var0 = load i16, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
@@ -509,16 +459,15 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: psllq $32, %xmm2
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: paddq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
@@ -560,8 +509,8 @@ define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; AVX512-LABEL: dot_ext_v2i32_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpinsrd $1, (%rdi,%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vpmovsxdq (%rdx), %xmm1
; AVX512-NEXT: vpmullq %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
>From 9dc5117cdf875ab3355ea9470b3216806ce193d6 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Mon, 5 May 2025 18:36:20 +0530
Subject: [PATCH 4/6] Fix the bad codegen
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 +++
llvm/test/CodeGen/PowerPC/custom-stov.ll | 16 ++---
llvm/test/CodeGen/SystemZ/vec-mul-07.ll | 30 ++-------
llvm/test/CodeGen/SystemZ/vec-mul-09.ll | 30 ++-------
llvm/test/CodeGen/WebAssembly/interleave.ll | 63 +++++++++----------
llvm/test/CodeGen/X86/avx512-i1test.ll | 14 ++---
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 53 +++++++++++++---
.../vector-shuffle-combining-avx512bwvl.ll | 2 +-
8 files changed, 110 insertions(+), 105 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c7165ae520039..b7980c92ed5fa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14319,6 +14319,13 @@ static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
return SDValue();
}
+ if (!all_of(BV->op_values(),
+ [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
+ // If the build vector any element other than \ISD::LOAD, we cannot widen
+ // it.
+ return SDValue();
+ }
+
SDLoc dl(BV);
EVT VT = BV.getValueType();
EVT EltVT = BV.getOperand(0).getValueType();
diff --git a/llvm/test/CodeGen/PowerPC/custom-stov.ll b/llvm/test/CodeGen/PowerPC/custom-stov.ll
index d1bcc73fd212a..0642fa900b0e5 100644
--- a/llvm/test/CodeGen/PowerPC/custom-stov.ll
+++ b/llvm/test/CodeGen/PowerPC/custom-stov.ll
@@ -15,18 +15,18 @@ define void @_blah() {
; CHECK-NEXT: vperm v2, v4, v3, v2
; CHECK-NEXT: lwz r4, 16(0)
; CHECK-NEXT: stvx v2, 0, r5
-; CHECK-NEXT: sth r3, -34(r1)
-; CHECK-NEXT: sth r3, -38(r1)
-; CHECK-NEXT: sth r3, -42(r1)
-; CHECK-NEXT: sth r3, -46(r1)
-; CHECK-NEXT: lhz r3, -52(r1)
+; CHECK-NEXT: lhz r5, -64(r1)
+; CHECK-NEXT: lhz r6, -58(r1)
+; CHECK-NEXT: lhz r7, -52(r1)
+; CHECK-NEXT: sth r4, -34(r1)
+; CHECK-NEXT: sth r3, -36(r1)
; CHECK-NEXT: sth r3, -40(r1)
-; CHECK-NEXT: lhz r3, -58(r1)
; CHECK-NEXT: sth r3, -44(r1)
-; CHECK-NEXT: lhz r3, -64(r1)
-; CHECK-NEXT: sth r4, -36(r1)
; CHECK-NEXT: sth r3, -48(r1)
; CHECK-NEXT: addi r3, r1, -48
+; CHECK-NEXT: sth r7, -38(r1)
+; CHECK-NEXT: sth r6, -42(r1)
+; CHECK-NEXT: sth r5, -46(r1)
; CHECK-NEXT: lvx v2, 0, r3
; CHECK-NEXT: addi r3, r1, -32
; CHECK-NEXT: vsldoi v3, v2, v2, 8
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-07.ll b/llvm/test/CodeGen/SystemZ/vec-mul-07.ll
index ca9e8412d95bd..73c7a8dec5dfc 100644
--- a/llvm/test/CodeGen/SystemZ/vec-mul-07.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-07.ll
@@ -7,11 +7,7 @@
define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI0_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vperm %v1, %v24, %v0, %v0
-; CHECK-NEXT: vperm %v0, %v26, %v0, %v0
-; CHECK-NEXT: vmlhw %v24, %v1, %v0
+; CHECK-NEXT: vmleb %v24, %v24, %v26
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%zext1 = zext <8 x i8> %shuf1 to <8 x i16>
@@ -25,12 +21,7 @@ define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2) {
define <8 x i16> @f2(<16 x i8> %val1, <16 x i8> %val2) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI1_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vgbm %v1, 0
-; CHECK-NEXT: vperm %v2, %v24, %v1, %v0
-; CHECK-NEXT: vperm %v0, %v26, %v1, %v0
-; CHECK-NEXT: vmlhw %v24, %v2, %v0
+; CHECK-NEXT: vmlob %v24, %v24, %v26
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%zext1 = zext <8 x i8> %shuf1 to <8 x i16>
@@ -72,11 +63,7 @@ define <8 x i16> @f4(<16 x i8> %val1, <16 x i8> %val2) {
define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI4_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vperm %v1, %v24, %v0, %v0
-; CHECK-NEXT: vperm %v0, %v26, %v0, %v0
-; CHECK-NEXT: vmlf %v24, %v1, %v0
+; CHECK-NEXT: vmleh %v24, %v24, %v26
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%zext1 = zext <4 x i16> %shuf1 to <4 x i32>
@@ -90,12 +77,7 @@ define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2) {
define <4 x i32> @f6(<8 x i16> %val1, <8 x i16> %val2) {
; CHECK-LABEL: f6:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI5_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vgbm %v1, 0
-; CHECK-NEXT: vperm %v2, %v24, %v1, %v0
-; CHECK-NEXT: vperm %v0, %v26, %v1, %v0
-; CHECK-NEXT: vmlf %v24, %v2, %v0
+; CHECK-NEXT: vmloh %v24, %v24, %v26
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%zext1 = zext <4 x i16> %shuf1 to <4 x i32>
@@ -137,7 +119,7 @@ define <4 x i32> @f8(<8 x i16> %val1, <8 x i16> %val2) {
define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: vmlef %v24, %v24, %v26
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
%zext1 = zext <2 x i32> %shuf1 to <2 x i64>
@@ -151,7 +133,7 @@ define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2) {
define <2 x i64> @f10(<4 x i32> %val1, <4 x i32> %val2) {
; CHECK-LABEL: f10:
; CHECK: # %bb.0:
-; CHECK-NEXT: vgbm %v24, 0
+; CHECK-NEXT: vmlof %v24, %v24, %v26
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
%zext1 = zext <2 x i32> %shuf1 to <2 x i64>
diff --git a/llvm/test/CodeGen/SystemZ/vec-mul-09.ll b/llvm/test/CodeGen/SystemZ/vec-mul-09.ll
index e1e52bed7a143..def57ca03bb0c 100644
--- a/llvm/test/CodeGen/SystemZ/vec-mul-09.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-mul-09.ll
@@ -7,11 +7,7 @@
define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI0_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vperm %v1, %v24, %v0, %v0
-; CHECK-NEXT: vperm %v0, %v26, %v0, %v0
-; CHECK-NEXT: vmalhw %v24, %v1, %v0, %v28
+; CHECK-NEXT: vmaleb %v24, %v24, %v26, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%zext1 = zext <8 x i8> %shuf1 to <8 x i16>
@@ -26,12 +22,7 @@ define <8 x i16> @f1(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
define <8 x i16> @f2(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI1_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vgbm %v1, 0
-; CHECK-NEXT: vperm %v2, %v24, %v1, %v0
-; CHECK-NEXT: vperm %v0, %v26, %v1, %v0
-; CHECK-NEXT: vmalhw %v24, %v2, %v0, %v28
+; CHECK-NEXT: vmalob %v24, %v24, %v26, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <16 x i8> %val1, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%zext1 = zext <8 x i8> %shuf1 to <8 x i16>
@@ -76,11 +67,7 @@ define <8 x i16> @f4(<16 x i8> %val1, <16 x i8> %val2, <8 x i16> %val3) {
define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI4_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vperm %v1, %v24, %v0, %v0
-; CHECK-NEXT: vperm %v0, %v26, %v0, %v0
-; CHECK-NEXT: vmalf %v24, %v1, %v0, %v28
+; CHECK-NEXT: vmaleh %v24, %v24, %v26, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%zext1 = zext <4 x i16> %shuf1 to <4 x i32>
@@ -95,12 +82,7 @@ define <4 x i32> @f5(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
define <4 x i32> @f6(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
; CHECK-LABEL: f6:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI5_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vgbm %v1, 0
-; CHECK-NEXT: vperm %v2, %v24, %v1, %v0
-; CHECK-NEXT: vperm %v0, %v26, %v1, %v0
-; CHECK-NEXT: vmalf %v24, %v2, %v0, %v28
+; CHECK-NEXT: vmaloh %v24, %v24, %v26, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <8 x i16> %val1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%zext1 = zext <4 x i16> %shuf1 to <4 x i32>
@@ -145,7 +127,7 @@ define <4 x i32> @f8(<8 x i16> %val1, <8 x i16> %val2, <4 x i32> %val3) {
define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vlr %v24, %v28
+; CHECK-NEXT: vmalef %v24, %v24, %v26, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
%zext1 = zext <2 x i32> %shuf1 to <2 x i64>
@@ -160,7 +142,7 @@ define <2 x i64> @f9(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
define <2 x i64> @f10(<4 x i32> %val1, <4 x i32> %val2, <2 x i64> %val3) {
; CHECK-LABEL: f10:
; CHECK: # %bb.0:
-; CHECK-NEXT: vlr %v24, %v28
+; CHECK-NEXT: vmalof %v24, %v24, %v26, %v28
; CHECK-NEXT: br %r14
%shuf1 = shufflevector <4 x i32> %val1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
%zext1 = zext <2 x i32> %shuf1 to <2 x i64>
diff --git a/llvm/test/CodeGen/WebAssembly/interleave.ll b/llvm/test/CodeGen/WebAssembly/interleave.ll
index eada6cc8c6813..c20b5e42c4850 100644
--- a/llvm/test/CodeGen/WebAssembly/interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/interleave.ll
@@ -17,12 +17,13 @@ define hidden void @accumulate8x2(ptr dead_on_unwind noalias writable sret(%stru
; CHECK-LABEL: accumulate8x2:
; CHECK: loop
; CHECK: v128.load64_zero
-; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: local.tee 10
-; CHECK: i8x16.shuffle 1, 17, 18, 19, 3, 21, 22, 23, 5, 25, 26, 27, 7, 29, 30, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
-; CHECK: local.get 10
-; CHECK: i8x16.shuffle 0, 17, 18, 19, 2, 21, 22, 23, 4, 25, 26, 27, 6, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
%4 = load i32, ptr %0, align 4
%5 = icmp eq i32 %2, 0
@@ -64,18 +65,21 @@ define hidden void @accumulate8x4(ptr dead_on_unwind noalias writable sret(%stru
; CHECK-LABEL: accumulate8x4
; CHECK: loop
; CHECK: v128.load
-; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: local.tee 14
-; CHECK: i8x16.shuffle 3, 17, 18, 19, 7, 21, 22, 23, 11, 25, 26, 27, 15, 29, 30, 31
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
-; CHECK: local.get 14
-; CHECK: i8x16.shuffle 2, 17, 18, 19, 6, 21, 22, 23, 10, 25, 26, 27, 14, 29, 30, 31
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
-; CHECK: local.get 14
-; CHECK: i8x16.shuffle 1, 17, 18, 19, 5, 21, 22, 23, 9, 25, 26, 27, 13, 29, 30, 31
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
-; CHECK: local.get 14
-; CHECK: i8x16.shuffle 0, 17, 18, 19, 4, 21, 22, 23, 8, 25, 26, 27, 12, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
%4 = load i32, ptr %0, align 4
%5 = icmp eq i32 %2, 0
@@ -133,12 +137,11 @@ define hidden void @accumulate16x2(ptr dead_on_unwind noalias writable sret(%str
; CHECK-LABEL: accumulate16x2
; CHECK: loop
; CHECK: v128.load
-; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: local.tee 10
-; CHECK: i8x16.shuffle 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
-; CHECK: local.get 10
-; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
%4 = load i32, ptr %0, align 4
%5 = icmp eq i32 %2, 0
@@ -181,23 +184,17 @@ define hidden void @accumulate16x4(ptr dead_on_unwind noalias writable sret(%str
; CHECK: loop
; CHECK: v128.load 0:p2align=1
; CHECK: v128.load 16:p2align=1
-; CHECK: i8x16.shuffle 6, 7, 0, 1, 14, 15, 0, 1, 22, 23, 0, 1, 30, 31, 0, 1
-; CHECK: v128.const 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: local.tee 15
-; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
-
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 4, 5, 0, 1, 12, 13, 0, 1, 20, 21, 0, 1, 28, 29, 0, 1
-; CHECK: local.get 15
-; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 2, 3, 0, 1, 10, 11, 0, 1, 18, 19, 0, 1, 26, 27, 0, 1
-; CHECK: local.get 15
-; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle 0, 1, 0, 1, 8, 9, 0, 1, 16, 17, 0, 1, 24, 25, 0, 1
-; CHECK: local.get 15
-; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
%4 = load i32, ptr %0, align 4
%5 = icmp eq i32 %2, 0
diff --git a/llvm/test/CodeGen/X86/avx512-i1test.ll b/llvm/test/CodeGen/X86/avx512-i1test.ll
index d43f05bbd5a1d..c5d4c87d66da2 100644
--- a/llvm/test/CodeGen/X86/avx512-i1test.ll
+++ b/llvm/test/CodeGen/X86/avx512-i1test.ll
@@ -14,13 +14,13 @@ define void @func() {
; CHECK-NEXT: retq
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: # %bb33
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_1
-; CHECK-NEXT: # %bb.2: # %bb35
-; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jmp .LBB0_1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # %bb.2: # %bb35
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jmp .LBB0_1
bb1:
br i1 poison, label %L_10, label %L_10
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 5a4233f1a0ffd..e27a77ed2293d 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -380,14 +380,51 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
}
define <4 x double> @PR34175(ptr %p) {
-; AVX512-LABEL: PR34175:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12]
-; AVX512-NEXT: vpermd (%rdi), %zmm0, %zmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: PR34175:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: PR34175:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: PR34175:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
+; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: PR34175:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
+; AVX512BWVL-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512BWVL-NEXT: retq
+;
+; AVX512VBMI-LABEL: PR34175:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
+; AVX512VBMI-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512VBMI-NEXT: retq
%v = load <32 x i16>, ptr %p, align 2
%shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
%tofp = uitofp <4 x i16> %shuf to <4 x double>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index ee59169498d27..6e0fa72398dda 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -274,7 +274,7 @@ define i64 @PR55050() {
; X86-NEXT: # %bb.1: # %if
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB15_2: # %exit
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: retl
;
; X64-LABEL: PR55050:
>From dd28865b4dc37bdbc21fe3ec0c036c00573cc396 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Tue, 6 May 2025 10:39:00 +0530
Subject: [PATCH 5/6] Revert back two testcases formatting
---
llvm/test/CodeGen/X86/avx512-i1test.ll | 14 +++++++-------
.../X86/vector-shuffle-combining-avx512bwvl.ll | 2 +-
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/test/CodeGen/X86/avx512-i1test.ll b/llvm/test/CodeGen/X86/avx512-i1test.ll
index c5d4c87d66da2..d43f05bbd5a1d 100644
--- a/llvm/test/CodeGen/X86/avx512-i1test.ll
+++ b/llvm/test/CodeGen/X86/avx512-i1test.ll
@@ -14,13 +14,13 @@ define void @func() {
; CHECK-NEXT: retq
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: # %bb33
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne .LBB0_1
-; CHECK-NEXT: # %bb.2: # %bb35
-; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jmp .LBB0_1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # %bb.2: # %bb35
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jmp .LBB0_1
bb1:
br i1 poison, label %L_10, label %L_10
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 6e0fa72398dda..ee59169498d27 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -274,7 +274,7 @@ define i64 @PR55050() {
; X86-NEXT: # %bb.1: # %if
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB15_2: # %exit
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: retl
;
; X64-LABEL: PR55050:
>From 321b52ed4540c043a9357ff2bc4ab56f6fd528b7 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Tue, 6 May 2025 11:32:48 +0530
Subject: [PATCH 6/6] Move folding logic from DAGCombiner to X86ISelLowering as
it is specific to X86
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 75 ------------------
llvm/lib/Target/X86/X86ISelLowering.cpp | 76 +++++++++++++++++++
2 files changed, 76 insertions(+), 75 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b7980c92ed5fa..ea1435c3934be 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14295,78 +14295,6 @@ static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) {
return DAG.getZExtOrTrunc(NewAbs, SDLoc(Extend), VT);
}
-// Try to widen the build vector and bitcast it to the type of zext.
-// This is a special case for the 128-bit vector types. Intention is to remove
-// the zext and replace it with a bitcast the wider type. While lowering
-// the bitcast is removed and extra commutation due to zext is avoided.
-// For example:
-// zext v4i16 ( v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 ( v8i8
-// build_vector (x, 0, y, 0, z, w, 0)
-static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
-
- assert(Extend->getOpcode() == ISD::ZERO_EXTEND && "Expected zero extend.");
-
- EVT ExtendVT = Extend->getValueType(0);
-
- SDValue BV = Extend->getOperand(0);
- if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
- return SDValue();
-
- if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
- // If the build vector has undef elements, we cannot widen it.
- // The widening would create a vector with more undef elements, which
- // is not valid.
- return SDValue();
- }
-
- if (!all_of(BV->op_values(),
- [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
- // If the build vector any element other than \ISD::LOAD, we cannot widen
- // it.
- return SDValue();
- }
-
- SDLoc dl(BV);
- EVT VT = BV.getValueType();
- EVT EltVT = BV.getOperand(0).getValueType();
- unsigned NumElts = VT.getVectorNumElements();
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
- if (TLI.getTypeAction(*DAG.getContext(), VT) !=
- TargetLowering::TypeWidenVector)
- return SDValue();
-
- EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
- unsigned WidenNumElts = WidenVT.getVectorNumElements();
-
- SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
- assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
- // Fill the new elements with Zero.
- NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
- // Compute the step to place the elements in the right place and control the
- // iteration.
- unsigned step = WidenNumElts / NumElts;
- if (WidenVT.is128BitVector()) {
- if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
- for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
- i--, j -= step) {
- SDValue temp = NewOps[i];
- NewOps[i] = NewOps[j];
- NewOps[j] = temp;
- }
- // Create new build vector with WidenVT and NewOps
- SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
- // Replace the old build vector with the new one. Bitcast the
- // new build vector to the type of the zext.
- SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
- return NewBV;
- }
- }
- return SDValue();
-}
-
SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -14693,9 +14621,6 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
return SDValue(CSENode, 0);
}
- if (SDValue V = widenBuildVec(N, DAG))
- return V;
-
return SDValue();
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 483aceb239b0c..8f447358695bf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55936,6 +55936,79 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
NegN2);
}
+// Fold a zero extend of a BUILD_VECTOR of loads into a bitcast of a wider
+// BUILD_VECTOR whose additional elements are explicit zeros. The fold is only
+// done when the narrow vector type is going to be widened by type
+// legalization anyway, the widened type is a 128-bit vector and the zext
+// result has exactly that size. The zext then disappears, the bitcast is
+// removed during lowering and the extra shuffling caused by the zext is
+// avoided.
+//
+// Source element I is placed in lane I * Step of the wide vector, where
+// Step = WidenNumElts / NumElts, and every other lane is zero. This lane
+// order is the little-endian layout of the zero-extended elements.
+//
+// For example, if v4i8 is widened to v16i8:
+//   zext v4i32 (v4i8 build_vector (x, y, z, w))
+//     -> bitcast v4i32 (v16i8 build_vector (x, 0, 0, 0, y, 0, 0, 0,
+//                                           z, 0, 0, 0, w, 0, 0, 0))
+//
+// \param Extend the node being combined; anything other than
+//        ISD::ZERO_EXTEND is rejected.
+// \param DAG the selection DAG used to create the replacement nodes.
+// \returns the bitcast (of the same type as \p Extend) that replaces the
+//          zext, or an empty SDValue if the fold does not apply.
+static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
+  if (Extend->getOpcode() != ISD::ZERO_EXTEND)
+    return SDValue();
+
+  SDValue BV = Extend->getOperand(0);
+  if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
+    return SDValue();
+
+  // Only build vectors made up entirely of loads are handled. This also
+  // rejects build vectors containing undef elements.
+  if (!all_of(BV->op_values(),
+              [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; }))
+    return SDValue();
+
+  EVT VT = BV.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+      TargetLowering::TypeWidenVector)
+    return SDValue();
+
+  // Run all the cheap type checks before creating any new node.
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  if (!WidenVT.is128BitVector() ||
+      Extend->getValueSizeInBits(0) != WidenVT.getSizeInBits())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
+
+  // Distance, in wide lanes, between two consecutive source elements.
+  unsigned Step = WidenNumElts / NumElts;
+  if (Step <= 1)
+    return SDValue();
+
+  // Start from an all-zero operand list and drop each source element into
+  // the lowest lane of the group of Step lanes that forms its extended value.
+  SDLoc DL(BV);
+  EVT EltVT = BV.getOperand(0).getValueType();
+  SmallVector<SDValue, 16> NewOps(WidenNumElts, DAG.getConstant(0, DL, EltVT));
+  for (unsigned I = 0; I != NumElts; ++I)
+    NewOps[I * Step] = BV.getOperand(I);
+
+  // Return the bitcast so that the combiner replaces the zext itself; the
+  // returned value has the same type as the node being combined.
+  SDValue NewBV = DAG.getBuildVector(WidenVT, DL, NewOps);
+  return DAG.getBitcast(Extend->getValueType(0), NewBV);
+}
+
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -55995,6 +56068,9 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue V = widenBuildVec(N, DAG))
+ return V;
+
return SDValue();
}
More information about the llvm-commits
mailing list