[llvm] [DAG] Add ISD::VECTOR_COMPRESS handling in computeKnownBits/ComputeNumSignBits (PR #159692)
Kavin Gnanapandithan via llvm-commits
llvm-commits at lists.llvm.org
Sat Sep 20 15:36:20 PDT 2025
https://github.com/KavinTheG updated https://github.com/llvm/llvm-project/pull/159692
From 0173be88666d3d074d72c0f69c91782d84b14c99 Mon Sep 17 00:00:00 2001
From: Kavin Gnanapandithan <kavin.balag at gmail.com>
Date: Thu, 18 Sep 2025 21:02:49 -0400
Subject: [PATCH 1/2] Add ISD::VECTOR_COMPRESS handling in
computeKnownBits/ComputeNumSignBits with test coverage
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 34 +++
llvm/test/CodeGen/AArch64/vector-compress.ll | 113 ++++++++++
llvm/test/CodeGen/X86/vector-compress.ll | 209 ++++++++++++++++++
3 files changed, 356 insertions(+)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bcf25958d0982..ab0411eae9549 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -58,6 +58,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
@@ -3480,6 +3481,26 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
break;
}
break;
+ case ISD::VECTOR_COMPRESS: {
+ assert(!Op.getValueType().isScalableVector());
+
+ SDValue Vec = Op.getOperand(0);
+ SDValue PassThru = Op.getOperand(2);
+ // If PassThru is undefined, early out
+ if (PassThru.isUndef())
+ break;
+
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
+ Known2 = computeKnownBits(PassThru, Depth + 1);
+ Known = Known.intersectWith(Known2);
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ Known2 = computeKnownBits(Vec, Depth + 1);
+ Known = Known.intersectWith(Known2);
+ break;
+ }
case ISD::VECTOR_SHUFFLE: {
assert(!Op.getValueType().isScalableVector());
// Collect the known bits that are shared by every vector element referenced
@@ -4792,6 +4813,19 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
}
return Tmp;
+ case ISD::VECTOR_COMPRESS: {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Mask = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ // If PassThru is undefined, early out.
+ if (PassThru.isUndef())
+ return 1;
+ Tmp = ComputeNumSignBits(Vec, Depth + 1);
+ Tmp2 = ComputeNumSignBits(PassThru, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ return Tmp;
+ }
+
case ISD::VECTOR_SHUFFLE: {
// Collect the minimum number of sign bits that are shared by every vector
// element referenced by the shuffle.
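The idea behind the two SelectionDAG.cpp hunks above: every lane of a VECTOR_COMPRESS result comes either from the source vector or from the pass-through operand, so only facts that hold for both operands can be claimed for the result. A minimal standalone sketch of that merge step using llvm::KnownBits (the helper names are illustrative, not part of the patch):

  #include <algorithm>
  #include "llvm/Support/KnownBits.h"

  // Known bits of the compress result: a bit stays known only if it is known
  // the same way in both possible sources of a lane.
  llvm::KnownBits compressKnownBits(const llvm::KnownBits &VecKnown,
                                    const llvm::KnownBits &PassThruKnown) {
    return VecKnown.intersectWith(PassThruKnown);
  }

  // Sign bits of the compress result: no more than the weaker source provides.
  unsigned compressNumSignBits(unsigned VecSignBits, unsigned PassThruSignBits) {
    return std::min(VecSignBits, PassThruSignBits);
  }

The tests that follow exercise exactly this: a zext'd or sext'd source combined with a masked pass-through keeps enough known bits through the compress for the trailing and (or shl+ashr) to be dropped.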
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index 67a0379d05244..9165493863729 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -471,3 +471,116 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i
%out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
ret <3 x i3> %out
}
+
+define <4 x i32> @test_compress_knownbits_zext_v4i16_4i32(<4 x i16> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_knownbits_zext_v4i16_4i32:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: movi.4s v3, #1
+; CHECK-NEXT: mov x14, sp
+; CHECK-NEXT: movi.4s v4, #3
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: mov x13, sp
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: mov x15, sp
+; CHECK-NEXT: shl.4s v1, v1, #31
+; CHECK-NEXT: and.16b v2, v2, v4
+; CHECK-NEXT: cmlt.4s v1, v1, #0
+; CHECK-NEXT: str q2, [sp]
+; CHECK-NEXT: and.16b v3, v1, v3
+; CHECK-NEXT: mov.s w8, v1[1]
+; CHECK-NEXT: mov.s w9, v1[2]
+; CHECK-NEXT: mov.s w10, v1[3]
+; CHECK-NEXT: fmov w11, s1
+; CHECK-NEXT: addv.4s s1, v3
+; CHECK-NEXT: and x16, x11, #0x1
+; CHECK-NEXT: and x8, x8, #0x1
+; CHECK-NEXT: bfi x14, x11, #2, #1
+; CHECK-NEXT: add x8, x16, x8
+; CHECK-NEXT: and x9, x9, #0x1
+; CHECK-NEXT: and x10, x10, #0x1
+; CHECK-NEXT: fmov w11, s1
+; CHECK-NEXT: add x9, x8, x9
+; CHECK-NEXT: mov w16, #3 ; =0x3
+; CHECK-NEXT: add x10, x9, x10
+; CHECK-NEXT: orr x8, x12, x8, lsl #2
+; CHECK-NEXT: bfi x15, x9, #2, #2
+; CHECK-NEXT: cmp x10, #3
+; CHECK-NEXT: bfi x13, x11, #2, #2
+; CHECK-NEXT: mov.s w11, v0[3]
+; CHECK-NEXT: csel x9, x10, x16, lo
+; CHECK-NEXT: ldr w13, [x13]
+; CHECK-NEXT: str s0, [sp]
+; CHECK-NEXT: st1.s { v0 }[1], [x14]
+; CHECK-NEXT: st1.s { v0 }[2], [x8]
+; CHECK-NEXT: orr x8, x12, x9, lsl #2
+; CHECK-NEXT: csel w9, w11, w13, hi
+; CHECK-NEXT: st1.s { v0 }[3], [x15]
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %xvec = zext <4 x i16> %vec to <4 x i32>
+ %xpassthru = and <4 x i32> %passthru, splat (i32 3)
+ %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %xvec, <4 x i1> %mask, <4 x i32> %xpassthru)
+ %res = and <4 x i32> %out, splat (i32 65535)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_compress_numsignbits_sext_v4i16_4i32(<4 x i16> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_numsignbits_sext_v4i16_4i32:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: movi.4s v3, #1
+; CHECK-NEXT: mov x14, sp
+; CHECK-NEXT: movi.4s v4, #3
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: mov x13, sp
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: mov x15, sp
+; CHECK-NEXT: shl.4s v1, v1, #31
+; CHECK-NEXT: and.16b v2, v2, v4
+; CHECK-NEXT: cmlt.4s v1, v1, #0
+; CHECK-NEXT: str q2, [sp]
+; CHECK-NEXT: and.16b v3, v1, v3
+; CHECK-NEXT: mov.s w8, v1[1]
+; CHECK-NEXT: mov.s w9, v1[2]
+; CHECK-NEXT: mov.s w10, v1[3]
+; CHECK-NEXT: fmov w11, s1
+; CHECK-NEXT: addv.4s s1, v3
+; CHECK-NEXT: and x16, x11, #0x1
+; CHECK-NEXT: and x8, x8, #0x1
+; CHECK-NEXT: bfi x14, x11, #2, #1
+; CHECK-NEXT: add x8, x16, x8
+; CHECK-NEXT: and x9, x9, #0x1
+; CHECK-NEXT: and x10, x10, #0x1
+; CHECK-NEXT: fmov w11, s1
+; CHECK-NEXT: add x9, x8, x9
+; CHECK-NEXT: mov w16, #3 ; =0x3
+; CHECK-NEXT: add x10, x9, x10
+; CHECK-NEXT: orr x8, x12, x8, lsl #2
+; CHECK-NEXT: bfi x15, x9, #2, #2
+; CHECK-NEXT: cmp x10, #3
+; CHECK-NEXT: bfi x13, x11, #2, #2
+; CHECK-NEXT: mov.s w11, v0[3]
+; CHECK-NEXT: csel x9, x10, x16, lo
+; CHECK-NEXT: ldr w13, [x13]
+; CHECK-NEXT: str s0, [sp]
+; CHECK-NEXT: st1.s { v0 }[1], [x14]
+; CHECK-NEXT: st1.s { v0 }[2], [x8]
+; CHECK-NEXT: orr x8, x12, x9, lsl #2
+; CHECK-NEXT: csel w9, w11, w13, hi
+; CHECK-NEXT: st1.s { v0 }[3], [x15]
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+entry:
+ %xvec = sext <4 x i16> %vec to <4 x i32>
+ %xpassthru = and <4 x i32> %passthru, splat(i32 3)
+ %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %xvec, <4 x i1> %mask, <4 x i32> %xpassthru)
+ %shl = shl <4 x i32> %out, splat(i32 16)
+ %res = ashr <4 x i32> %shl, splat(i32 16)
+ ret <4 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index 1ab1a1a01e168..ac932d51017ae 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -4427,6 +4427,215 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
ret <64 x i32> %out
}
+define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x i1> %mask, <8 x i64> %passthru) nounwind {
+; AVX2-LABEL: test_compress_knownbits_zext_v8i16_8i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $96, %rsp
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [3,3,3,3]
+; AVX2-NEXT: vandps %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vandps %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, (%rsp)
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: addl %ecx, %eax
+; AVX2-NEXT: andl $7, %eax
+; AVX2-NEXT: vpextrw $1, %xmm1, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %xmm1, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: vpextrw $2, %xmm1, %esi
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: addq %rcx, %rsi
+; AVX2-NEXT: vpextrw $3, %xmm1, %edi
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: addq %rsi, %rdi
+; AVX2-NEXT: vpextrw $4, %xmm1, %r8d
+; AVX2-NEXT: andl $1, %r8d
+; AVX2-NEXT: addq %rdi, %r8
+; AVX2-NEXT: vpextrw $5, %xmm1, %r9d
+; AVX2-NEXT: andl $1, %r9d
+; AVX2-NEXT: addq %r8, %r9
+; AVX2-NEXT: vpextrw $6, %xmm1, %r10d
+; AVX2-NEXT: andl $1, %r10d
+; AVX2-NEXT: addq %r9, %r10
+; AVX2-NEXT: vpextrw $7, %xmm1, %r11d
+; AVX2-NEXT: andl $1, %r11d
+; AVX2-NEXT: addq %r10, %r11
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rbx
+; AVX2-NEXT: cmpq $8, %r11
+; AVX2-NEXT: cmovbq (%rsp,%rax,8), %rbx
+; AVX2-NEXT: vmovq %xmm4, (%rsp)
+; AVX2-NEXT: vpextrq $1, %xmm4, (%rsp,%rdx,8)
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rsp,%rcx,8)
+; AVX2-NEXT: vpextrq $1, %xmm2, (%rsp,%rsi,8)
+; AVX2-NEXT: andl $7, %edi
+; AVX2-NEXT: vmovq %xmm0, (%rsp,%rdi,8)
+; AVX2-NEXT: andl $7, %r8d
+; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%r8,8)
+; AVX2-NEXT: andl $7, %r9d
+; AVX2-NEXT: vmovq %xmm1, (%rsp,%r9,8)
+; AVX2-NEXT: andl $7, %r10d
+; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%r10,8)
+; AVX2-NEXT: cmpq $7, %r11
+; AVX2-NEXT: movl $7, %eax
+; AVX2-NEXT: cmovbq %r11, %rax
+; AVX2-NEXT: movl %eax, %eax
+; AVX2-NEXT: movq %rbx, (%rsp,%rax,8)
+; AVX2-NEXT: vmovaps (%rsp), %ymm0
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: leaq -8(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_compress_knownbits_zext_v8i16_8i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_compress_knownbits_zext_v8i16_8i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: retq
+ %xvec = zext <8 x i16> %vec to <8 x i64> ; 0 -> 65535
+ %xpassthru = and <8 x i64> %passthru, splat (i64 3) ; 0 -> 3
+ %out = call <8 x i64> @llvm.experimental.vector.compress(<8 x i64> %xvec, <8 x i1> %mask, <8 x i64> %xpassthru)
+ %res = and <8 x i64> %out, splat (i64 65535) ; unnecessary - %out guaranteed to be 0 -> 65535
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i1> %mask, <8 x i64> %passthru) nounwind {
+; AVX2-LABEL: test_compress_knownbits_sext_v8i16_8i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $96, %rsp
+; AVX2-NEXT: vpmovsxwq %xmm0, %ymm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [3,3,3,3]
+; AVX2-NEXT: vandps %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vandps %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, (%rsp)
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: addl %ecx, %eax
+; AVX2-NEXT: andl $7, %eax
+; AVX2-NEXT: vpextrw $1, %xmm1, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %xmm1, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: vpextrw $2, %xmm1, %esi
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: addq %rcx, %rsi
+; AVX2-NEXT: vpextrw $3, %xmm1, %edi
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: addq %rsi, %rdi
+; AVX2-NEXT: vpextrw $4, %xmm1, %r8d
+; AVX2-NEXT: andl $1, %r8d
+; AVX2-NEXT: addq %rdi, %r8
+; AVX2-NEXT: vpextrw $5, %xmm1, %r9d
+; AVX2-NEXT: andl $1, %r9d
+; AVX2-NEXT: addq %r8, %r9
+; AVX2-NEXT: vpextrw $6, %xmm1, %r10d
+; AVX2-NEXT: andl $1, %r10d
+; AVX2-NEXT: addq %r9, %r10
+; AVX2-NEXT: vpextrw $7, %xmm1, %r11d
+; AVX2-NEXT: andl $1, %r11d
+; AVX2-NEXT: addq %r10, %r11
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rbx
+; AVX2-NEXT: cmpq $8, %r11
+; AVX2-NEXT: cmovbq (%rsp,%rax,8), %rbx
+; AVX2-NEXT: vmovq %xmm4, (%rsp)
+; AVX2-NEXT: vpextrq $1, %xmm4, (%rsp,%rdx,8)
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rsp,%rcx,8)
+; AVX2-NEXT: vpextrq $1, %xmm2, (%rsp,%rsi,8)
+; AVX2-NEXT: andl $7, %edi
+; AVX2-NEXT: vmovq %xmm0, (%rsp,%rdi,8)
+; AVX2-NEXT: andl $7, %r8d
+; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%r8,8)
+; AVX2-NEXT: andl $7, %r9d
+; AVX2-NEXT: vmovq %xmm1, (%rsp,%r9,8)
+; AVX2-NEXT: andl $7, %r10d
+; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%r10,8)
+; AVX2-NEXT: cmpq $7, %r11
+; AVX2-NEXT: movl $7, %eax
+; AVX2-NEXT: cmovbq %r11, %rax
+; AVX2-NEXT: movl %eax, %eax
+; AVX2-NEXT: movq %rbx, (%rsp,%rax,8)
+; AVX2-NEXT: vmovaps (%rsp), %ymm0
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: leaq -8(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_compress_knownbits_sext_v8i16_8i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm1
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_compress_knownbits_sext_v8i16_8i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
+; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm1
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: retq
+ %xvec = sext <8 x i16> %vec to <8 x i64> ; sign extend vec
+ %xpassthru = and <8 x i64> %passthru, splat(i64 3)
+ %out = call <8 x i64> @llvm.experimental.vector.compress(<8 x i64> %xvec, <8 x i1> %mask, <8 x i64> %xpassthru)
+ %shl = shl <8 x i64> %out, splat(i64 48)
+ %res = ashr <8 x i64> %shl, splat(i64 48)
+ ret <8 x i64> %res
+}
+
define <4 x i32> @test_compress_all_const() nounwind {
; AVX2-LABEL: test_compress_all_const:
; AVX2: # %bb.0:
From 3ad285b58a9014410c8f5caf2145116c1a4c645d Mon Sep 17 00:00:00 2001
From: Kavin Gnanapandithan <kavin.balag at gmail.com>
Date: Sat, 20 Sep 2025 18:35:44 -0400
Subject: [PATCH 2/2] Update known-bits/sign-bits handling for VECTOR_COMPRESS and add
tests for SVE/RVV
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 17 +++------
.../CodeGen/AArch64/sve-vector-compress.ll | 36 +++++++++++++++++++
.../test/CodeGen/RISCV/rvv/vector-compress.ll | 33 +++++++++++++++++
3 files changed, 73 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index ab0411eae9549..b5bbcafaab183 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -58,7 +58,6 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
-#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
@@ -3482,18 +3481,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
}
break;
case ISD::VECTOR_COMPRESS: {
- assert(!Op.getValueType().isScalableVector());
-
SDValue Vec = Op.getOperand(0);
SDValue PassThru = Op.getOperand(2);
- // If PassThru is undefined, early out
- if (PassThru.isUndef())
- break;
Known.Zero.setAllBits();
Known.One.setAllBits();
- Known2 = computeKnownBits(PassThru, Depth + 1);
- Known = Known.intersectWith(Known2);
+ Known = computeKnownBits(PassThru, DemandedElts, Depth + 1);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
@@ -4815,13 +4808,11 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
case ISD::VECTOR_COMPRESS: {
SDValue Vec = Op.getOperand(0);
- SDValue Mask = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
- // If PassThru is undefined, early out.
- if (PassThru.isUndef())
+ Tmp = ComputeNumSignBits(PassThru, DemandedElts, Depth + 1);
+ if (Tmp == 1)
return 1;
- Tmp = ComputeNumSignBits(Vec, Depth + 1);
- Tmp2 = ComputeNumSignBits(PassThru, Depth + 1);
+ Tmp2 = ComputeNumSignBits(Vec, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
return Tmp;
}
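One detail of the updated hunks: DemandedElts is forwarded only for the PassThru query. PassThru elements that survive keep their original lane, so its query can honor DemandedElts; compressed Vec elements land at positions chosen by the runtime mask, so Vec is still queried over all of its lanes. For reference, a hedged sketch of how a caller-side combine could consume these results to prove the masks in the tests below redundant (the helper name is hypothetical, not from the patch):

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/Support/KnownBits.h"

  // True if every element of N is already known to fit in its low 16 bits,
  // making a following (and N, 0xffff) a no-op.
  static bool fitsInLow16(llvm::SelectionDAG &DAG, llvm::SDValue N) {
    llvm::KnownBits Known = DAG.computeKnownBits(N);
    unsigned BitWidth = Known.getBitWidth();
    return BitWidth <= 16 || Known.countMinLeadingZeros() >= BitWidth - 16;
  }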
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
index 198e0a37c56fa..cc3a3734a9721 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -100,6 +100,42 @@ define <vscale x 4 x i4> @test_compress_illegal_element_type(<vscale x 4 x i4> %
ret <vscale x 4 x i4> %out
}
+define <vscale x 4 x i32> @test_compress_knownbits_zext(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_knownbits_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: and z1.s, z1.s, #0x3
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ret
+ %xvec = zext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+ %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+ %res = and <vscale x 4 x i32> %out, splat (i32 65535)
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @test_compress_numsignbits_sext(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_numsignbits_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: and z1.s, z1.s, #0x3
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: sxth z0.s, p1/m, z0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ret
+ %xvec = sext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+ %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+ %shl = shl <vscale x 4 x i32> %out, splat (i32 16)
+ %res = ashr <vscale x 4 x i32> %shl, splat (i32 16)
+ ret <vscale x 4 x i32> %res
+}
+
define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_compress_large:
; CHECK: // %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-compress.ll b/llvm/test/CodeGen/RISCV/rvv/vector-compress.ll
index e06382b19c41a..6a3bfae0fb10c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-compress.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-compress.ll
@@ -346,6 +346,39 @@ define <vscale x 4 x i32> @vector_compress_nxv4i32_passthru(<vscale x 4 x i32> %
ret <vscale x 4 x i32> %ret
}
+define <vscale x 4 x i32> @test_compress_nxv4i32_knownbits(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_nxv4i32_knownbits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v12, v8
+; CHECK-NEXT: vand.vi v8, v10, 3
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma
+; CHECK-NEXT: vcompress.vm v8, v12, v0
+; CHECK-NEXT: ret
+ %xvec = zext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+ %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+ %res = and <vscale x 4 x i32> %out, splat (i32 65535)
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_numsignbits(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_nxv4i32_numsignbits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v12, v8
+; CHECK-NEXT: vand.vi v8, v10, 3
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma
+; CHECK-NEXT: vcompress.vm v8, v12, v0
+; CHECK-NEXT: ret
+ %xvec = sext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+ %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+ %shl = shl <vscale x 4 x i32> %out, splat (i32 16)
+ %res = ashr <vscale x 4 x i32> %shl, splat (i32 16)
+ ret <vscale x 4 x i32> %res
+}
+
define <vscale x 8 x i32> @vector_compress_nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: vector_compress_nxv8i32:
; CHECK: # %bb.0: