[llvm] [VP] IR expansion to Int Func Call (PR #122867)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 23:47:46 PST 2025
https://github.com/LiqinWeng created https://github.com/llvm/llvm-project/pull/122867
Add basic handling for VP ops that can be expanded to calls of the corresponding non-VP integer intrinsics: ctpop, cttz, ctlz, sadd.sat, uadd.sat, ssub.sat, usub.sat, fshl, and fshr.
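For context, a minimal sketch of what this expansion produces for one of the newly handled ops (hand-written IR for illustration, not taken from the patch): expandPredicationToIntCall rewrites the VP intrinsic into a call to the matching unpredicated intrinsic, dropping the mask and EVL operands. This should be a valid refinement because, per the VP semantics, lanes that are masked off or at an index >= %evl yield poison in the result, so any value computed for them is acceptable.

  ; before expansion
  %v = call <4 x i32> @llvm.vp.ctpop.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ; after expansion: mask %m and EVL %evl are discarded
  %v = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %va)

For vp.ctlz/vp.cttz the extra i1 is_zero_poison operand is, as I understand the expansion, carried over unchanged to the unpredicated intrinsic.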
From 3bb2a4fe3013811374e8fa98162565d527d83bda Mon Sep 17 00:00:00 2001
From: LiqinWeng <liqin.weng at spacemit.com>
Date: Tue, 14 Jan 2025 15:37:22 +0800
Subject: [PATCH] [VP] IR expansion to Int Func Call
Add basic handling for VP ops that can be expanded to calls of the corresponding non-VP integer intrinsics: ctpop, cttz, ctlz, sadd.sat, uadd.sat, ssub.sat, usub.sat, fshl, and fshr.
---
llvm/lib/CodeGen/ExpandVectorPredication.cpp | 9 +
.../CodeGen/X86/expand-vp-int-intrinsics.ll | 395 ++++++++++++++++++
2 files changed, 404 insertions(+)
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 5ca223852cbde3..6f4a2c9ca7604f 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -609,6 +609,15 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
case Intrinsic::vp_umin:
case Intrinsic::vp_bswap:
case Intrinsic::vp_bitreverse:
+ case Intrinsic::vp_ctpop:
+ case Intrinsic::vp_ctlz:
+ case Intrinsic::vp_cttz:
+ case Intrinsic::vp_sadd_sat:
+ case Intrinsic::vp_uadd_sat:
+ case Intrinsic::vp_ssub_sat:
+ case Intrinsic::vp_usub_sat:
+ case Intrinsic::vp_fshl:
+ case Intrinsic::vp_fshr:
return expandPredicationToIntCall(Builder, VPI);
case Intrinsic::vp_fabs:
case Intrinsic::vp_sqrt:
diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
index f26368c02de2b7..bdc25cdec06b10 100644
--- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
@@ -1347,3 +1347,398 @@ define <4 x i32> @vp_bswap_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
ret <4 x i32> %v
}
declare <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vp_ctpop_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vp_ctpop_v4i32:
+; X86: # %bb.0:
+; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X86-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; X86-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; X86-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vp_ctpop_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $4, %xmm1
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: psadbw %xmm1, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: psadbw %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vp_ctpop_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vp_ctpop_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vp_ctpop_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %v = call <4 x i32> @llvm.vp.ctpop.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.ctpop.v4i32(<4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vp_ctlz_v4i32:
+; X86: # %bb.0:
+; X86-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X86-NEXT: vpshufb %xmm0, %xmm1, %xmm2
+; X86-NEXT: vpsrlw $4, %xmm0, %xmm3
+; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3
+; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; X86-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
+; X86-NEXT: vpand %xmm5, %xmm2, %xmm2
+; X86-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; X86-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
+; X86-NEXT: vpsrlw $8, %xmm2, %xmm2
+; X86-NEXT: vpand %xmm2, %xmm1, %xmm2
+; X86-NEXT: vpsrlw $8, %xmm1, %xmm1
+; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; X86-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; X86-NEXT: vpsrld $16, %xmm0, %xmm0
+; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
+; X86-NEXT: vpsrld $16, %xmm1, %xmm1
+; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vp_ctlz_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $1, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $2, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $4, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $8, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $4, %xmm1
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: psadbw %xmm1, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: psadbw %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vp_ctlz_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vp_ctlz_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm3
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
+; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vp_ctlz_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm2
+; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm3
+; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3
+; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
+; AVX512-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
+; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm2
+; AVX512-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %v = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %va, i1 false, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32>, i1, <4 x i1>, i32)
+
+define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; X86-LABEL: vp_cttz_v4i32:
+; X86: # %bb.0:
+; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; X86-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X86-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; X86-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; X86-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; X86-NEXT: retl
+;
+; SSE-LABEL: vp_cttz_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $4, %xmm1
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: psadbw %xmm1, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: psadbw %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vp_cttz_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vp_cttz_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vp_cttz_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %v = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %va, i1 false, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32>, i1, <4 x i1>, i32)
+
+define <4 x i32> @vp_sadd_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) {
+ %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vp_uadd_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) {
+ %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vp_ssub_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) {
+ %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vp_usub_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evl) {
+ %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vp_fshl_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 x i1> %m, i32 zeroext %evl) {
+ %v = call <4 x i32> @llvm.vp.fshl.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vp_fshr_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 x i1> %m, i32 zeroext %evl) {
+ %v = call <4 x i32> @llvm.vp.fshr.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+declare <4 x i32> @llvm.vp.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i1>, i32)