[llvm] 275729a - [X86] Generalize i8 CTPOP expansion to work with any input with 8 or less active bits
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 2 06:30:16 PST 2024
Author: Simon Pilgrim
Date: 2024-02-02T14:29:57Z
New Revision: 275729ae06d568e9589392c142a416fb8c2bb1a8
URL: https://github.com/llvm/llvm-project/commit/275729ae06d568e9589392c142a416fb8c2bb1a8
DIFF: https://github.com/llvm/llvm-project/commit/275729ae06d568e9589392c142a416fb8c2bb1a8.diff
LOG: [X86] Generalize i8 CTPOP expansion to work with any input with 8 or less active bits
Extend #79989 slightly to use KnownBits on the CTPOP input - this should make it easier to add additional cases identified in #79823
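For reference, below is a minimal standalone sketch (not the LLVM lowering itself) of the multiply-mask-multiply popcount that this patch now applies to any scalar whose KnownBits prove at most 8 active bits. The constants 0x08040201 and 0x11111111 match those used in LowerCTPOP in the diff; the exhaustive check against __builtin_popcount is only for illustration.

    #include <cassert>
    #include <cstdint>

    // Population count of a value known to have at most 8 active bits,
    // using two 32-bit multiplies instead of the generic expansion.
    static unsigned popcount8(uint32_t X) {
      assert(X <= 0xFFu && "only valid when at most 8 bits can be set");
      uint32_t V = X * 0x08040201u; // copies of X at bit offsets 0, 9, 18, 27
      V = (V >> 3) & 0x11111111u;   // isolate one bit of X per nibble
      V *= 0x11111111u;             // horizontally add the nibbles into bits 28..31
      return V >> 28;               // the population count (0..8)
    }

    int main() {
      for (uint32_t X = 0; X <= 0xFFu; ++X)
        assert(popcount8(X) == unsigned(__builtin_popcount(X)));
      return 0;
    }

Because the lowering now queries computeKnownBits/countMaxActiveBits rather than matching only i8 CTPOP nodes, the same sequence also fires for e.g. a zero-extended 8-bit mask read out of a k-register, as the updated masked_compressstore/masked_expandload tests show.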
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/masked_compressstore.ll
llvm/test/CodeGen/X86/masked_expandload.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8f38c0d0daf58..b6468a9b18e76 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -428,10 +428,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Custom);
- setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
- setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i16 , Custom);
+ setOperationAction(ISD::CTPOP , MVT::i32 , Custom);
if (Subtarget.is64Bit())
- setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
@@ -31030,29 +31030,37 @@ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- SDLoc DL(Op);
+ MVT VT = N.getSimpleValueType();
+ SDValue Op = N.getOperand(0);
+ SDLoc DL(N);
- // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
- if (VT == MVT::i8) {
- SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
- Op = DAG.getZExtOrTrunc(Op.getOperand(0), DL, MVT::i32);
- Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
- DAG.getConstant(0x08040201U, DL, MVT::i32));
- Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
- DAG.getShiftAmountConstant(3, MVT::i32, DL));
- Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
- Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
- Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
- DAG.getShiftAmountConstant(28, MVT::i32, DL));
- return DAG.getZExtOrTrunc(Op, DL, VT);
+ if (VT.isScalarInteger()) {
+ KnownBits Known = DAG.computeKnownBits(Op);
+ unsigned ActiveBits = Known.countMaxActiveBits();
+
+ // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
+ if (ActiveBits <= 8) {
+ SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
+ Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
+ Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
+ DAG.getConstant(0x08040201U, DL, MVT::i32));
+ Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+ DAG.getShiftAmountConstant(3, MVT::i32, DL));
+ Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
+ Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
+ Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+ DAG.getShiftAmountConstant(28, MVT::i32, DL));
+ return DAG.getZExtOrTrunc(Op, DL, VT);
+ }
+
+ return SDValue(); // fallback to generic expansion.
}
assert(VT.isVector() &&
"We only do custom lowering for vector population count.");
- return LowerVectorCTPOP(Op, DL, Subtarget, DAG);
+ return LowerVectorCTPOP(N, DL, Subtarget, DAG);
}
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index dedae2893e2ad..3187bf6448690 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -516,23 +516,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: movzbl %al, %ecx
-; AVX512F-NEXT: shrl %eax
-; AVX512F-NEXT: andl $85, %eax
-; AVX512F-NEXT: subl %eax, %ecx
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT: shrl $2, %ecx
-; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: shrl $4, %eax
-; AVX512F-NEXT: addl %ecx, %eax
-; AVX512F-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512F-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512F-NEXT: shrl $24, %eax
; AVX512F-NEXT: kshiftrw $8, %k1, %k2
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: movzbl %al, %eax
+; AVX512F-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT: shrl $28, %eax
; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
; AVX512F-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
@@ -543,23 +534,13 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k1
-; AVX512VLDQ-NEXT: kmovb %k1, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl %ecx
-; AVX512VLDQ-NEXT: andl $-43, %ecx
-; AVX512VLDQ-NEXT: subl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT: shrl $2, %eax
-; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT: addl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl $4, %ecx
-; AVX512VLDQ-NEXT: addl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT: shrl $24, %eax
; AVX512VLDQ-NEXT: kshiftrw $8, %k1, %k2
+; AVX512VLDQ-NEXT: kmovb %k1, %eax
+; AVX512VLDQ-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT: shrl $3, %eax
+; AVX512VLDQ-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: shrl $28, %eax
; AVX512VLDQ-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
; AVX512VLDQ-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
@@ -569,23 +550,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllw $7, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpmovb2m %xmm2, %k1
-; AVX512VLBW-NEXT: kmovd %k1, %eax
-; AVX512VLBW-NEXT: movzbl %al, %ecx
-; AVX512VLBW-NEXT: shrl %eax
-; AVX512VLBW-NEXT: andl $85, %eax
-; AVX512VLBW-NEXT: subl %eax, %ecx
-; AVX512VLBW-NEXT: movl %ecx, %eax
-; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT: shrl $2, %ecx
-; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT: addl %eax, %ecx
-; AVX512VLBW-NEXT: movl %ecx, %eax
-; AVX512VLBW-NEXT: shrl $4, %eax
-; AVX512VLBW-NEXT: addl %ecx, %eax
-; AVX512VLBW-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT: shrl $24, %eax
; AVX512VLBW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512VLBW-NEXT: kmovd %k1, %eax
+; AVX512VLBW-NEXT: movzbl %al, %eax
+; AVX512VLBW-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT: shrl $3, %eax
+; AVX512VLBW-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: shrl $28, %eax
; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
; AVX512VLBW-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 46b1fa5dd2757..4c5b67962a58b 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1008,21 +1008,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
; AVX512F-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
; AVX512F-NEXT: kmovw %k2, %eax
; AVX512F-NEXT: movzbl %al, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl %ecx
-; AVX512F-NEXT: andl $-43, %ecx
-; AVX512F-NEXT: subl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT: shrl $2, %eax
-; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT: addl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $4, %ecx
-; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512F-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512F-NEXT: shrl $24, %eax
+; AVX512F-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT: shrl $28, %eax
; AVX512F-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
; AVX512F-NEXT: retq
;
@@ -1032,21 +1022,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
; AVX512VLDQ-NEXT: vptestnmd %ymm3, %ymm3, %k1
; AVX512VLDQ-NEXT: vptestnmd %ymm2, %ymm2, %k2
; AVX512VLDQ-NEXT: kmovb %k2, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl %ecx
-; AVX512VLDQ-NEXT: andl $-43, %ecx
-; AVX512VLDQ-NEXT: subl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT: shrl $2, %eax
-; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT: addl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl $4, %ecx
-; AVX512VLDQ-NEXT: addl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT: shrl $24, %eax
+; AVX512VLDQ-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT: shrl $3, %eax
+; AVX512VLDQ-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: shrl $28, %eax
; AVX512VLDQ-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
; AVX512VLDQ-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
; AVX512VLDQ-NEXT: retq
@@ -1059,21 +1039,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
; AVX512VLBW-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
; AVX512VLBW-NEXT: kmovd %k2, %eax
; AVX512VLBW-NEXT: movzbl %al, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl %ecx
-; AVX512VLBW-NEXT: andl $-43, %ecx
-; AVX512VLBW-NEXT: subl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT: shrl $2, %eax
-; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT: addl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl $4, %ecx
-; AVX512VLBW-NEXT: addl %eax, %ecx
-; AVX512VLBW-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT: shrl $24, %eax
+; AVX512VLBW-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT: shrl $3, %eax
+; AVX512VLBW-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: shrl $28, %eax
; AVX512VLBW-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
; AVX512VLBW-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer