[llvm] ac4cb17 - [X86] fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) iff c2 is all/no bits mask
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 21 07:10:59 PDT 2022
Author: Simon Pilgrim
Date: 2022-06-21T15:10:43+01:00
New Revision: ac4cb1775b8f40be2809245b818993a576639c1f
URL: https://github.com/llvm/llvm-project/commit/ac4cb1775b8f40be2809245b818993a576639c1f
DIFF: https://github.com/llvm/llvm-project/commit/ac4cb1775b8f40be2809245b818993a576639c1f.diff
LOG: [X86] fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) iff c2 is all/no bits mask
Noticed on D128216 - if we're zeroing out vector elements of a mul/mulh result, see if we can merge the and-mask into the mul by simply multiplying the masked elements by zero.
Ideally we'd make this fold generic (similar to the existing foldSelectWithIdentityConstant?), but these cases appear very late, after the constants have been lowered to constant-pool loads.
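As a rough per-lane illustration (not part of the original commit message): when a mask lane is all-ones the AND is a no-op, and when it is zero the AND forces the product lane to zero - which multiplying by the masked (now zero) constant reproduces. A minimal standalone C++ sketch of the identity for 16-bit lanes, not LLVM code:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Per-lane model of the fold: (x * c1) & c2 == x * (c1 & c2)
      // whenever the mask lane c2 is all bits (0xFFFF) or no bits (0x0000).
      const uint16_t Masks[] = {0x0000, 0xFFFF};
      for (uint16_t c2 : Masks) {
        uint16_t x = 0x1234, c1 = 0x0057;
        uint16_t lhs = uint16_t(uint16_t(x * c1) & c2); // and(mul(x, c1), c2)
        uint16_t rhs = uint16_t(x * (c1 & c2));         // mul(x, and(c1, c2))
        assert(lhs == rhs);
      }
      return 0;
    }

The same reasoning carries over to MULHU/MULHS: a zeroed multiplier lane also zeroes the high half of the product.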
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/combine-udiv.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 70cd94e58c07..92ee38e16eff 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47651,6 +47651,20 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
return R;
+ // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
+ // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
+ // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
+ if (VT.isVector() && getTargetConstantFromNode(N1)) {
+ unsigned Opc0 = N0.getOpcode();
+ if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
+ getTargetConstantFromNode(N0.getOperand(1)) &&
+ DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
+ N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
+ SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
+ return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
+ }
+ }
+
// Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
// avoids slow variable shift (moving shift amount to ECX etc.)
if (isOneConstant(N1) && N0->hasOneUse()) {
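A side note on the guard above, not part of the committed code: DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() holds only when every lane of the mask consists entirely of sign bits, i.e. each lane is 0 or -1 (all-ones) - exactly the select-with-zero shape the new comment describes. A hedged scalar model of that per-lane condition, using a hypothetical helper name:

    #include <cstdint>

    // Hypothetical per-lane model: a 16-bit lane has 16 sign bits exactly
    // when every bit equals the sign bit, i.e. the lane is 0x0000 or 0xFFFF.
    static bool isAllOrNoBitsLane(uint16_t Lane) {
      int16_t S = static_cast<int16_t>(Lane);
      return S == 0 || S == -1;
    }

Lanes failing this test would mask off only part of the product, which in general cannot be expressed as multiplying by a modified constant.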
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 6c0401624243..c609d22cda47 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -510,11 +510,9 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_udiv_nonuniform:
@@ -697,12 +695,10 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: paddw %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index a549d3fd5343..2858a2a2bdb0 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1759,14 +1759,12 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $1, %xmm0
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v8i16:
@@ -1853,14 +1851,12 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v8i16:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: psllw $1, %xmm0
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: por %xmm2, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index c25ccb6b06e7..24c963f34607 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -812,11 +812,9 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i16_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 545089bd3340..2b4e58b19f43 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -1166,11 +1166,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
@@ -1220,11 +1218,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v8i16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pandn %xmm0, %xmm2
+; X86-SSE-NEXT: pandn %xmm0, %xmm1
; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: pand %xmm1, %xmm0
-; X86-SSE-NEXT: por %xmm2, %xmm0
+; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <8 x i16> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index 4cc16c584e46..be9550095cd8 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1483,11 +1483,9 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i16:
@@ -1537,11 +1535,9 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v4i16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pandn %xmm0, %xmm2
+; X86-SSE-NEXT: pandn %xmm0, %xmm1
; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: pand %xmm1, %xmm0
-; X86-SSE-NEXT: por %xmm2, %xmm0
+; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
ret <4 x i16> %shift