[llvm-branch-commits] [llvm] release/22.x: [SelectionDAG] Make sure demanded lanes for AND/MUL-by-zero are frozen (#180727) (PR #181147)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Feb 12 06:19:04 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Nikita Popov (nikic)
<details>
<summary>Changes</summary>
Cherry-pick of 6420099bcc62a09e002e500870216b2dd9d256a9.
---
Patch is 28.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/181147.diff
16 Files Affected:
- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+5)
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+10-1)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+7)
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+12-9)
- (modified) llvm/test/CodeGen/AArch64/neon-dotreduce.ll (+3-2)
- (modified) llvm/test/CodeGen/X86/insertelement-zero.ll (+40)
- (modified) llvm/test/CodeGen/X86/pr134602.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/pr173924.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-fshl-256.ll (+6)
- (modified) llvm/test/CodeGen/X86/vector-fshl-512.ll (+4)
- (modified) llvm/test/CodeGen/X86/vector-fshl-rot-256.ll (+5)
- (modified) llvm/test/CodeGen/X86/vector-fshl-rot-512.ll (+4)
- (modified) llvm/test/CodeGen/X86/vector-fshr-512.ll (+24-22)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-512.ll (+20-18)
- (modified) llvm/test/CodeGen/X86/vector-rotate-256.ll (+5)
- (modified) llvm/test/CodeGen/X86/vector-rotate-512.ll (+4)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 604319095e74f..e70a59d9fb0fd 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1723,6 +1723,11 @@ class SelectionDAG {
/// Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getFreeze(SDValue V);
+ /// Return a freeze of V if any of the demanded elts may be undef or poison.
+ /// If \p PoisonOnly is true, then only check for poison elements.
+ LLVM_ABI SDValue getFreeze(SDValue V, const APInt &DemandedElts,
+ bool PoisonOnly = false);
+
/// Return an AssertAlignSDNode.
LLVM_ABI SDValue getAssertAlign(const SDLoc &DL, SDValue V, Align A);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b071a12587826..2f8fe09c3dc98 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -24060,8 +24060,17 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
// Build the mask and return the corresponding DAG node.
auto BuildMaskAndNode = [&](SDValue TrueVal, SDValue FalseVal,
unsigned MaskOpcode) {
- for (unsigned I = 0; I != NumElts; ++I)
+ APInt InsertedEltMask = APInt::getZero(NumElts);
+ for (unsigned I = 0; I != NumElts; ++I) {
Mask[I] = Ops[I] ? TrueVal : FalseVal;
+ if (Ops[I])
+ InsertedEltMask.setBit(I);
+ }
+ // Make sure to freeze the source vector in case any of the elements
+ // overwritten by the insert may be poison. Otherwise those elements
+ // could end up being poison instead of 0/-1 after the AND/OR.
+ CurVec =
+ DAG.getFreeze(CurVec, InsertedEltMask, /*PoisonOnly=*/true);
return DAG.getNode(MaskOpcode, DL, VT, CurVec,
DAG.getBuildVector(VT, DL, Mask));
};
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8e609bc443da5..5a926b8bf6211 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2502,6 +2502,13 @@ SDValue SelectionDAG::getFreeze(SDValue V) {
return getNode(ISD::FREEZE, SDLoc(V), V.getValueType(), V);
}
+SDValue SelectionDAG::getFreeze(SDValue V, const APInt &DemandedElts,
+ bool PoisonOnly) {
+ if (isGuaranteedNotToBeUndefOrPoison(V, DemandedElts, PoisonOnly))
+ return V;
+ return getFreeze(V);
+}
+
/// getShiftAmountOperand - Return the specified value casted to
/// the target's desired shift amount type.
SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ff5e046f0fcd5..ee489f9fc74f9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3794,19 +3794,22 @@ bool TargetLowering::SimplifyDemandedVectorElts(
if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
- // If we know that a demanded element was zero in Op1 we don't need to
- // demand it in Op0 - its guaranteed to be zero.
- APInt DemandedElts0 = DemandedElts & ~SrcZero;
- if (SimplifyDemandedVectorElts(Op0, DemandedElts0, KnownUndef, KnownZero,
+ // FIXME: If we know that a demanded element was zero in Op1 we don't need
+ // to demand it in Op0 - its guaranteed to be zero. There is however a
+ // restriction, as we must not make any of the originally demanded elements
+ // more poisonous. We could reduce amount of elements demanded, but then we
+ // also need a to inform SimplifyDemandedVectorElts that some elements must
+ // not be made more poisonous.
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, KnownUndef, KnownZero,
TLO, Depth + 1))
return true;
- KnownUndef &= DemandedElts0;
- KnownZero &= DemandedElts0;
+ KnownUndef &= DemandedElts;
+ KnownZero &= DemandedElts;
- // If every element pair has a zero/undef then just fold to zero.
- // fold (and x, undef) -> 0 / (and x, 0) -> 0
- // fold (mul x, undef) -> 0 / (mul x, 0) -> 0
+ // If every element pair has a zero/undef/poison then just fold to zero.
+ // fold (and x, undef/poison) -> 0 / (and x, 0) -> 0
+ // fold (mul x, undef/poison) -> 0 / (mul x, 0) -> 0
if (DemandedElts.isSubsetOf(SrcZero | KnownZero | SrcUndef | KnownUndef))
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index dbbe00c89eecf..8854d8ab80798 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -445,8 +445,9 @@ entry:
define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
; CHECK-SD-LABEL: test_udot_v5i8_nomla:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr x8, [x0]
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT: mov v1.s[0], v2.s[0]
@@ -2681,8 +2682,8 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
; CHECK-SD-NEXT: ldp q2, q1, [x0]
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: ushll v4.8h, v2.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
; CHECK-SD-NEXT: uaddl2 v5.4s, v4.8h, v1.8h
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index b66ad07c466e1..e1c8cefa73d8a 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -539,3 +539,43 @@ define <4 x i32> @PR41512_loads(ptr %p1, ptr %p2) {
%r = shufflevector <4 x i32> %ins1, <4 x i32> %ins2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i32> %r
}
+
+; Reproducer for bugs in DAGCombiner and SimplifyDemandedVectorElts.
+;
+; Problem was that DAGCombiner replaced INSERT_VECTOR_ELT by AND, without
+; considering that %i has poison elements. So instead of overwriting those
+; poison elements by inserting zeroes, we got "AND poison, 0" which is poison
+; and not guaranteed to be folded as zero.
+;
+; When solving the above by inserting a FREEZE another bug
+; surfaced. SimplifyDemandedVectorElts was not demanding elements that were
+; known to be AND:ed by zero. So the FREEZE ended up being removed and we
+; still got "AND poison, 0".
+;
+; Expected result is that the add reduction computes the sum 0+0+0+0+0+77+0+77 = 154.
+define i64 @fold_insertelement_to_and(i32 noundef %arg) {
+; SSE-LABEL: fold_insertelement_to_and:
+; SSE: # %bb.0:
+; SSE-NEXT: movl $154, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: fold_insertelement_to_and:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movl $154, %eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fold_insertelement_to_and:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,77]
+; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: retq
+ %i = shufflevector <8 x i64> zeroinitializer, <8 x i64> splat (i64 77), <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 4, i32 8, i32 6, i32 10>
+ %i1 = insertelement <8 x i64> %i, i64 0, i64 0
+ %i2 = insertelement <8 x i64> %i1, i64 0, i64 2
+ %i3 = shufflevector <8 x i64> %i2, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %i4 = tail call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %i3)
+ ret i64 %i4
+}
diff --git a/llvm/test/CodeGen/X86/pr134602.ll b/llvm/test/CodeGen/X86/pr134602.ll
index 063b6f31fe630..50efcde462532 100644
--- a/llvm/test/CodeGen/X86/pr134602.ll
+++ b/llvm/test/CodeGen/X86/pr134602.ll
@@ -17,7 +17,7 @@ define i32 @PR134602(i16 %a0) {
; X64-NEXT: movzwl %di, %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7]
; X64-NEXT: paddw %xmm0, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: psrld $16, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr173924.ll b/llvm/test/CodeGen/X86/pr173924.ll
index f5059da10da7c..d130014a8fa62 100644
--- a/llvm/test/CodeGen/X86/pr173924.ll
+++ b/llvm/test/CodeGen/X86/pr173924.ll
@@ -7,7 +7,7 @@ define i256 @PR173924(<8 x i256> %a0) {
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edi
-; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r8d
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 2fadf5f101626..919450857171b 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -1004,6 +1004,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1015,6 +1016,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1026,6 +1028,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1037,6 +1040,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1057,6 +1061,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -1092,6 +1097,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 34ad667f01171..fed534a7b9440 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -552,6 +552,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3
@@ -570,6 +571,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3
@@ -584,6 +586,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
@@ -601,6 +604,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 5f7e4070b3783..1f6df959f6d00 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -773,6 +773,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
@@ -784,6 +785,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
@@ -795,6 +797,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
@@ -806,6 +809,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
@@ -817,6 +821,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
index 4c6680ac4a19a..da2d41ee19d5a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -308,6 +308,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm6
; AVX512F-NEXT: vpsrlw %xmm3, %ymm6, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw %xmm1, %ymm4, %ymm2
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
@@ -325,6 +326,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm6
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm6, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsllw %xmm1, %ymm4, %ymm2
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
@@ -338,6 +340,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
@@ -349,6 +352,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 25f8f94eb834c..4257bcc0e3f99 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -547,36 +547,38 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpaddw %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw %xmm4, %ymm5, %ymm5
; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/181147
More information about the llvm-branch-commits
mailing list