[llvm] [X86] optimize saturating (masked) pack (PR #169995)
Folkert de Vries via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 29 05:34:11 PST 2025
https://github.com/folkertdev updated https://github.com/llvm/llvm-project/pull/169995
>From 4e14cebf78998a8b5df74de6f018fef5a38192f8 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Fri, 28 Nov 2025 19:27:14 +0100
Subject: [PATCH 1/5] Optimize manual PACKSS/PACKUS
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 175 +++++++++++++++++++-----
llvm/test/CodeGen/X86/packus.ll | 7 -
2 files changed, 143 insertions(+), 39 deletions(-)
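For reference, a minimal IR sketch of the pattern this combine recognizes (the function name is hypothetical; this is the 128-bit signed case, mirroring the tests added later in this series). A lane-interleaving shuffle of the two sources, a signed clamp to [-128, 127], and a truncate to i8 now lower to a single packsswb; the unsigned form (umin against 255, then truncate) folds to packuswb instead.

define <16 x i8> @packsswb_sketch(<8 x i16> %a, <8 x i16> %b) {
  ; interleave the two sources (a trivial concat for the 128-bit case)
  %sh = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; clamp to the signed i8 range, then truncate
  %lo = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %sh, <16 x i16> splat (i16 -128))
  %sat = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %lo, <16 x i16> splat (i16 127))
  %tr = trunc <16 x i16> %sat to <16 x i8>
  ret <16 x i8> %tr
}

declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)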
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d49f25a950e3a..8ff476b87dc5e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52829,6 +52829,91 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
}
+// Check whether this is a shuffle that interleaves the lanes of the two input
+// vectors, e.g. when interleaving two v8i32 into a single v16i32 the mask is
+// <0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>. Indices are based
+// on the output type.
+static bool isLaneInterleaveMask(ArrayRef<int> Mask, MVT VT) {
+ assert(VT.isVector() && "Expected vector VT.");
+
+ MVT ElemVT = VT.getScalarType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltBits = ElemVT.getSizeInBits();
+
+ if (Mask.size() != NumElts)
+ return false;
+
+ // A lane is 128 bits.
+ if (EltBits == 0 || (128u % EltBits) != 0)
+ return false;
+
+ // So 4 for i32, 8 for i16, etc.
+ unsigned EltsPerLane = 128u / EltBits;
+ unsigned GroupSize = 2 * EltsPerLane;
+
+ if (NumElts % GroupSize != 0)
+ return false;
+
+ unsigned Pos = 0;
+ for (unsigned G = 0; G != (NumElts / GroupSize); ++G) {
+ // Indices are based on the output type, hence B starts at NumElts.
+ unsigned ABase = G * EltsPerLane;
+ unsigned BBase = NumElts + G * EltsPerLane;
+
+ for (unsigned I = 0; I != EltsPerLane; ++I)
+ if (Mask[Pos++] != (int)(ABase + I))
+ return false;
+
+ for (unsigned I = 0; I != EltsPerLane; ++I)
+ if (Mask[Pos++] != (int)(BBase + I))
+ return false;
+ }
+
+ return true;
+}
+
+// Check whether this is a shuffle that interleaves the lanes of the two input
+// vectors, e.g. for v16i32 the mask is <0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7,
+// 20, 21, 22, 23>. On success, A and B are set to the two pack inputs.
+static bool isLaneInterleaveShuffle(MVT VT, SDValue Shuf, SDValue &A,
+ SDValue &B, const SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // For the _mm_pack{u|s}s variants, the shuffle is trivial and therefore
+ // elided.
+ if (VT == MVT::v16i16 || VT == MVT::v8i32) {
+ if (Shuf.getOpcode() == ISD::CONCAT_VECTORS && Shuf.getNumOperands() == 2) {
+ A = Shuf->getOperand(0);
+ B = Shuf->getOperand(1);
+ return true;
+ }
+
+ return false;
+ }
+
+ auto *SVN = dyn_cast<ShuffleVectorSDNode>(Shuf.getNode());
+ if (!SVN)
+ return false;
+
+ ArrayRef<int> TargetMask = SVN->getMask();
+ SDValue V1 = SVN->getOperand(0);
+ SDValue V2 = SVN->getOperand(1);
+
+ if (isLaneInterleaveMask(TargetMask, VT)) {
+ auto peelConcat = [](SDValue V) -> SDValue {
+ if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2)
+ return V.getOperand(0);
+ return V;
+ };
+
+ // The upper half of a CONCAT_VECTORS operand is undefined, so peel it off
+ // and use only the low half.
+ A = peelConcat(V1);
+ B = peelConcat(V2);
+ return true;
+ }
+
+ return false;
+}
+
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
@@ -52973,42 +53058,68 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
Subtarget);
}
+ if (!(SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8))
+ return SDValue();
+
+ unsigned TruncOpc = 0;
+ SDValue SatVal;
+ if (SDValue SSatVal = detectSSatPattern(In, VT)) {
+ SatVal = SSatVal;
+ TruncOpc = X86ISD::VTRUNCS;
+ } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
+ SatVal = USatVal;
+ TruncOpc = X86ISD::VTRUNCUS;
+ } else {
+ return SDValue();
+ }
+
+ unsigned ResElts = VT.getVectorNumElements();
+
+ bool IsEpi16 = (SVT == MVT::i8 && InSVT == MVT::i16);
+ bool IsEpi32 = (SVT == MVT::i16 && InSVT == MVT::i32);
+
+ // Is there an advantageous pack given the current types and features?
+ unsigned Width = VT.getSizeInBits();
+ bool HasPackForWidth =
+ (Width == 128 && Subtarget.hasSSE41()) ||
+ (Width == 256 && Subtarget.hasAVX2()) ||
+ (Width == 512 && Subtarget.hasBWI() && Subtarget.hasVLX());
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
- Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
- (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
- unsigned TruncOpc = 0;
- SDValue SatVal;
- if (SDValue SSatVal = detectSSatPattern(In, VT)) {
- SatVal = SSatVal;
- TruncOpc = X86ISD::VTRUNCS;
- } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
- SatVal = USatVal;
- TruncOpc = X86ISD::VTRUNCUS;
- }
- if (SatVal) {
- unsigned ResElts = VT.getVectorNumElements();
- // If the input type is less than 512 bits and we don't have VLX, we need
- // to widen to 512 bits.
- if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
- unsigned NumConcats = 512 / InVT.getSizeInBits();
- ResElts *= NumConcats;
- SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
- ConcatOps[0] = SatVal;
- InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
- NumConcats * InVT.getVectorNumElements());
- SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
- }
- // Widen the result if its narrower than 128 bits.
- if (ResElts * SVT.getSizeInBits() < 128)
- ResElts = 128 / SVT.getSizeInBits();
- EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
- SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
- DAG.getVectorIdxConstant(0, DL));
+ if (HasPackForWidth && (IsEpi16 || IsEpi32)) {
+ SDValue A, B;
+ if (isLaneInterleaveShuffle(InVT.getSimpleVT(), SatVal, A, B, DAG,
+ Subtarget)) {
+ unsigned PackOpc =
+ TruncOpc == X86ISD::VTRUNCS ? X86ISD::PACKSS : X86ISD::PACKUS;
+
+ return DAG.getNode(PackOpc, DL, VT, A, B);
}
}
+ if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
+ Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
+
+ // If the input type is less than 512 bits and we don't have VLX, we
+ // need to widen to 512 bits.
+ if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
+ unsigned NumConcats = 512 / InVT.getSizeInBits();
+ ResElts *= NumConcats;
+ SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
+ ConcatOps[0] = SatVal;
+ InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
+ NumConcats * InVT.getVectorNumElements());
+ SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
+ }
+ // Widen the result if it's narrower than 128 bits.
+ if (ResElts * SVT.getSizeInBits() < 128)
+ ResElts = 128 / SVT.getSizeInBits();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
+ SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll
index 384e40496d82a..899e92ec61f5b 100644
--- a/llvm/test/CodeGen/X86/packus.ll
+++ b/llvm/test/CodeGen/X86/packus.ll
@@ -445,10 +445,3 @@ define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) {
%4 = trunc <32 x i16> %3 to <32 x i8>
ret <32 x i8> %4
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-AVX2: {{.*}}
-; X64-SSE2: {{.*}}
-; X64-SSE4: {{.*}}
-; X86-AVX2: {{.*}}
-; X86-SSE2: {{.*}}
-; X86-SSE4: {{.*}}
>From 9f5d496a101ad31d571027559a112895bb56c530 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Sat, 29 Nov 2025 02:22:30 +0100
Subject: [PATCH 2/5] add test for masked packss/packus
---
llvm/test/CodeGen/X86/masked_packss.ll | 189 ++++++++++++++++++++++++
llvm/test/CodeGen/X86/masked_packus.ll | 197 +++++++++++++++++++++++++
2 files changed, 386 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/masked_packss.ll
create mode 100644 llvm/test/CodeGen/X86/masked_packus.ll
diff --git a/llvm/test/CodeGen/X86/masked_packss.ll b/llvm/test/CodeGen/X86/masked_packss.ll
new file mode 100644
index 0000000000000..183cfec4a7933
--- /dev/null
+++ b/llvm/test/CodeGen/X86/masked_packss.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512
+
+define <16 x i8> @_mm_mask_packss_epi16_manual(<16 x i8> %src, i16 noundef %k, <8 x i16> %a, <8 x i16> %b) unnamed_addr {
+; AVX2-LABEL: _mm_mask_packss_epi16_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm_mask_packss_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpacksswb %xmm2, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %minv = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %sh, <16 x i16> splat (i16 -128))
+ %sat = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %minv, <16 x i16> splat (i16 127))
+ %tr = trunc <16 x i16> %sat to <16 x i8>
+ %mk = bitcast i16 %k to <16 x i1>
+ %res = select <16 x i1> %mk, <16 x i8> %tr, <16 x i8> %src
+ ret <16 x i8> %res
+}
+
+define <32 x i8> @_mm256_mask_packss_epi16_manual(<32 x i8> %src, i32 noundef %k, <16 x i16> %a, <16 x i16> %b) unnamed_addr {
+; AVX2-LABEL: _mm256_mask_packss_epi16_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpacksswb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm256_mask_packss_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpacksswb %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %minv = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> %sh, <32 x i16> splat (i16 -128))
+ %sat = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> %minv, <32 x i16> splat (i16 127))
+ %tr = trunc <32 x i16> %sat to <32 x i8>
+ %mk = bitcast i32 %k to <32 x i1>
+ %res = select <32 x i1> %mk, <32 x i8> %tr, <32 x i8> %src
+ ret <32 x i8> %res
+}
+
+define <64 x i8> @_mm512_mask_packss_epi16_manual(<64 x i8> %src, i64 noundef %k, <32 x i16> %a, <32 x i16> %b) unnamed_addr {
+; AVX2-LABEL: _mm512_mask_packss_epi16_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpacksswb %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpacksswb %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vmovq %rdi, %xmm4
+; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4
+; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[4,4,4,4,4,4,4,4,13,13,13,13,13,13,13,13,22,22,22,22,22,22,22,22,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm512_mask_packss_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq %rdi, %k1
+; AVX512-NEXT: vpacksswb %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %minv = tail call <64 x i16> @llvm.smax.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 -128))
+ %sat = tail call <64 x i16> @llvm.smin.v64i16(<64 x i16> %minv, <64 x i16> splat (i16 127))
+ %tr = trunc <64 x i16> %sat to <64 x i8>
+ %mk = bitcast i64 %k to <64 x i1>
+ %res = select <64 x i1> %mk, <64 x i8> %tr, <64 x i8> %src
+ ret <64 x i8> %res
+}
+
+define <8 x i16> @_mm_mask_packss_epi32_manual(<8 x i16> %src, i8 noundef %k, <4 x i32> %a, <4 x i32> %b) unnamed_addr {
+; AVX2-LABEL: _mm_mask_packss_epi32_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm_mask_packss_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpackssdw %xmm2, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %minv = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 -32768))
+ %sat = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %minv, <8 x i32> splat (i32 32767))
+ %tr = trunc <8 x i32> %sat to <8 x i16>
+ %mk = bitcast i8 %k to <8 x i1>
+ %res = select <8 x i1> %mk, <8 x i16> %tr, <8 x i16> %src
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @_mm256_mask_packss_epi32_manual(<16 x i16> %src, i16 noundef %k, <8 x i32> %a, <8 x i32> %b) unnamed_addr {
+; AVX2-LABEL: _mm256_mask_packss_epi32_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackssdw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm256_mask_packss_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpackssdw %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %minv = tail call <16 x i32> @llvm.smax.v16i32(<16 x i32> %sh, <16 x i32> splat (i32 -32768))
+ %sat = tail call <16 x i32> @llvm.smin.v16i32(<16 x i32> %minv, <16 x i32> splat (i32 32767))
+ %tr = trunc <16 x i32> %sat to <16 x i16>
+ %mk = bitcast i16 %k to <16 x i1>
+ %res = select <16 x i1> %mk, <16 x i16> %tr, <16 x i16> %src
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @_mm512_mask_packss_epi32_manual(<32 x i16> %src, i32 noundef %k, <16 x i32> %a, <16 x i32> %b) unnamed_addr {
+; AVX2-LABEL: _mm512_mask_packss_epi32_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackssdw %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpackssdw %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vmovd %edi, %xmm4
+; AVX2-NEXT: vpbroadcastw %xmm4, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: shrl $16, %edi
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm512_mask_packss_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpackssdw %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
+ %minv = tail call <32 x i32> @llvm.smax.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 -32768))
+ %sat = tail call <32 x i32> @llvm.smin.v32i32(<32 x i32> %minv, <32 x i32> splat (i32 32767))
+ %tr = trunc <32 x i32> %sat to <32 x i16>
+ %mk = bitcast i32 %k to <32 x i1>
+ %res = select <32 x i1> %mk, <32 x i16> %tr, <32 x i16> %src
+ ret <32 x i16> %res
+}
+
+declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
+declare <32 x i16> @llvm.smax.v32i16(<32 x i16>, <32 x i16>)
+declare <64 x i16> @llvm.smax.v64i16(<64 x i16>, <64 x i16>)
+
+declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)
+declare <32 x i16> @llvm.smin.v32i16(<32 x i16>, <32 x i16>)
+declare <64 x i16> @llvm.smin.v64i16(<64 x i16>, <64 x i16>)
+
+declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
+declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
+declare <32 x i32> @llvm.smax.v32i32(<32 x i32>, <32 x i32>)
+
+declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
+declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>)
+declare <32 x i32> @llvm.smin.v32i32(<32 x i32>, <32 x i32>)
diff --git a/llvm/test/CodeGen/X86/masked_packus.ll b/llvm/test/CodeGen/X86/masked_packus.ll
new file mode 100644
index 0000000000000..471a5959c9bd9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/masked_packus.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512
+
+define <16 x i8> @_mm_mask_packus_epi16_manual(<16 x i8> %src, i16 noundef %k, <8 x i16> %a, <8 x i16> %b) unnamed_addr {
+; AVX2-LABEL: _mm_mask_packus_epi16_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm_mask_packus_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpackuswb %xmm2, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %sat = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %sh, <16 x i16> splat (i16 255))
+ %tr = trunc nuw <16 x i16> %sat to <16 x i8>
+ %mk = bitcast i16 %k to <16 x i1>
+ %res = select <16 x i1> %mk, <16 x i8> %tr, <16 x i8> %src
+ ret <16 x i8> %res
+}
+
+define <32 x i8> @_mm256_mask_packus_epi16_manual(<32 x i8> %src, i32 noundef %k, <16 x i16> %a, <16 x i16> %b) unnamed_addr {
+; AVX2-LABEL: _mm256_mask_packus_epi16_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm256_mask_packus_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpackuswb %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %sat = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %sh, <32 x i16> splat (i16 255))
+ %tr = trunc nuw <32 x i16> %sat to <32 x i8>
+ %mk = bitcast i32 %k to <32 x i1>
+ %res = select <32 x i1> %mk, <32 x i8> %tr, <32 x i8> %src
+ ret <32 x i8> %res
+}
+
+define <64 x i8> @_mm512_mask_packus_epi16_manual(<64 x i8> %src, i64 noundef %k, <32 x i16> %a, <32 x i16> %b) unnamed_addr {
+; AVX2-LABEL: _mm512_mask_packus_epi16_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm6
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm5
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpminuw %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpminuw %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vpackuswb %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpminuw %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpminuw %ymm4, %ymm6, %ymm4
+; AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vmovq %rdi, %xmm4
+; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4
+; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[4,4,4,4,4,4,4,4,13,13,13,13,13,13,13,13,22,22,22,22,22,22,22,22,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm512_mask_packus_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq %rdi, %k1
+; AVX512-NEXT: vpackuswb %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %sat = tail call <64 x i16> @llvm.umin.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 255))
+ %tr = trunc nuw <64 x i16> %sat to <64 x i8>
+ %mk = bitcast i64 %k to <64 x i1>
+ %res = select <64 x i1> %mk, <64 x i8> %tr, <64 x i8> %src
+ ret <64 x i8> %res
+}
+
+define <8 x i16> @_mm_mask_packus_epi32_manual(<8 x i16> %src, i8 noundef %k, <4 x i32> %a, <4 x i32> %b) unnamed_addr {
+; AVX2-LABEL: _mm_mask_packus_epi32_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm_mask_packus_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpackusdw %xmm2, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %sat = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 65535))
+ %tr = trunc nuw <8 x i32> %sat to <8 x i16>
+ %mk = bitcast i8 %k to <8 x i1>
+ %res = select <8 x i1> %mk, <8 x i16> %tr, <8 x i16> %src
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @_mm256_mask_packus_epi32_manual(<16 x i16> %src, i16 noundef %k, <8 x i32> %a, <8 x i32> %b) unnamed_addr {
+; AVX2-LABEL: _mm256_mask_packus_epi32_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm256_mask_packus_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpackusdw %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %sat = tail call <16 x i32> @llvm.umin.v16i32(<16 x i32> %sh, <16 x i32> splat (i32 65535))
+ %tr = trunc nuw <16 x i32> %sat to <16 x i16>
+ %mk = bitcast i16 %k to <16 x i1>
+ %res = select <16 x i1> %mk, <16 x i16> %tr, <16 x i16> %src
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @_mm512_mask_packus_epi32_manual(<32 x i16> %src, i32 noundef %k, <16 x i32> %a, <16 x i32> %b) unnamed_addr {
+; AVX2-LABEL: _mm512_mask_packus_epi32_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm6
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm5
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT: vpminud %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpminud %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vpackusdw %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpminud %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpminud %ymm4, %ymm6, %ymm4
+; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vmovd %edi, %xmm4
+; AVX2-NEXT: vpbroadcastw %xmm4, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: shrl $16, %edi
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm512_mask_packus_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpackusdw %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
+ %sat = tail call <32 x i32> @llvm.umin.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 65535))
+ %tr = trunc nuw <32 x i32> %sat to <32 x i16>
+ %mk = bitcast i32 %k to <32 x i1>
+ %res = select <32 x i1> %mk, <32 x i16> %tr, <32 x i16> %src
+ ret <32 x i16> %res
+}
+
+declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)
+declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>)
+declare <64 x i16> @llvm.umin.v64i16(<64 x i16>, <64 x i16>)
+
+declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
+declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>)
+declare <32 x i32> @llvm.umin.v32i32(<32 x i32>, <32 x i32>)
>From c0ba6091aa13e920c849e7f9452fa7151eb4fc41 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Sat, 29 Nov 2025 02:28:30 +0100
Subject: [PATCH 3/5] add avx512 checks for existing packss and packus tests
---
llvm/test/CodeGen/X86/packss.ll | 63 ++++++++++++++++++++++
llvm/test/CodeGen/X86/packus.ll | 92 +++++++++++++++++++++++++++++++++
2 files changed, 155 insertions(+)
diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll
index 35919f65d3de0..eda72c4be3828 100644
--- a/llvm/test/CodeGen/X86/packss.ll
+++ b/llvm/test/CodeGen/X86/packss.ll
@@ -7,6 +7,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512
define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: trunc_ashr_v4i64:
@@ -44,6 +46,13 @@ define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: trunc_ashr_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsraq $63, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: ret{{[l|q]}}
%1 = ashr <4 x i64> %a, <i64 63, i64 63, i64 63, i64 63>
%2 = trunc <4 x i64> %1 to <4 x i32>
ret <4 x i32> %2
@@ -103,6 +112,13 @@ define <8 x i16> @trunc_ashr_v4i64_bitcast(<4 x i64> %a0) {
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: trunc_ashr_v4i64_bitcast:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsraq $49, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: ret{{[l|q]}}
%1 = ashr <4 x i64> %a0, <i64 49, i64 49, i64 49, i64 49>
%2 = bitcast <4 x i64> %1 to <8 x i32>
%3 = trunc <8 x i32> %2 to <8 x i16>
@@ -133,6 +149,13 @@ define <8 x i16> @trunc_ashr_v8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: trunc_ashr_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: ret{{[l|q]}}
%1 = ashr <8 x i32> %a, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
%2 = trunc <8 x i32> %1 to <8 x i16>
ret <8 x i16> %2
@@ -224,6 +247,15 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: trunc_ashr_v4i64_demandedelts:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllq $63, %ymm0, %ymm0
+; AVX512-NEXT: vpsraq $63, %ymm0, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: ret{{[l|q]}}
%1 = shl <4 x i64> %a0, <i64 63, i64 0, i64 63, i64 0>
%2 = ashr <4 x i64> %1, <i64 63, i64 0, i64 63, i64 0>
%3 = bitcast <4 x i64> %2 to <8 x i32>
@@ -246,6 +278,13 @@ define <16 x i8> @packsswb_icmp_zero_128(<8 x i16> %a0) {
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: packsswb_icmp_zero_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k0
+; AVX512-NEXT: vpmovm2b %k0, %xmm0
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512-NEXT: ret{{[l|q]}}
%1 = icmp eq <8 x i16> %a0, zeroinitializer
%2 = sext <8 x i1> %1 to <8 x i8>
%3 = shufflevector <8 x i8> %2, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -266,6 +305,13 @@ define <16 x i8> @packsswb_icmp_zero_trunc_128(<8 x i16> %a0) {
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: packsswb_icmp_zero_trunc_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
%1 = icmp eq <8 x i16> %a0, zeroinitializer
%2 = sext <8 x i1> %1 to <8 x i16>
%3 = shufflevector <8 x i16> %2, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -303,6 +349,13 @@ define <32 x i8> @packsswb_icmp_zero_256(<16 x i16> %a0) {
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: packsswb_icmp_zero_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: ret{{[l|q]}}
%1 = icmp eq <16 x i16> %a0, zeroinitializer
%2 = sext <16 x i1> %1 to <16 x i16>
%3 = bitcast <16 x i16> %2 to <32 x i8>
@@ -341,6 +394,16 @@ define <32 x i8> @packsswb_icmp_zero_trunc_256(<16 x i16> %a0) {
; AVX2-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: packsswb_icmp_zero_trunc_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: movb $-52, %al
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512-NEXT: ret{{[l|q]}}
%1 = icmp eq <16 x i16> %a0, zeroinitializer
%2 = sext <16 x i1> %1 to <16 x i16>
%3 = shufflevector <16 x i16> zeroinitializer, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll
index 899e92ec61f5b..90c97e4e9fed0 100644
--- a/llvm/test/CodeGen/X86/packus.ll
+++ b/llvm/test/CodeGen/X86/packus.ll
@@ -7,6 +7,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX,X86-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX,X64-AVX2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512
define <4 x i32> @trunc_lshr_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: trunc_lshr_v4i64:
@@ -39,6 +41,13 @@ define <4 x i32> @trunc_lshr_v4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: trunc_lshr_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlq $63, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: ret{{[l|q]}}
%1 = lshr <4 x i64> %a, <i64 63, i64 63, i64 63, i64 63>
%2 = trunc <4 x i64> %1 to <4 x i32>
ret <4 x i32> %2
@@ -75,6 +84,13 @@ define <8 x i16> @trunc_lshr_v4i64_bitcast(<4 x i64> %a0) {
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: trunc_lshr_v4i64_bitcast:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: ret{{[l|q]}}
%1 = lshr <4 x i64> %a0, <i64 49, i64 49, i64 49, i64 49>
%2 = bitcast <4 x i64> %1 to <8 x i32>
%3 = trunc <8 x i32> %2 to <8 x i16>
@@ -112,6 +128,13 @@ define <8 x i16> @trunc_lshr_v8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: trunc_lshr_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrld $31, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: ret{{[l|q]}}
%1 = lshr <8 x i32> %a, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
%2 = trunc <8 x i32> %1 to <8 x i16>
ret <8 x i16> %2
@@ -187,6 +210,13 @@ define <16 x i8> @shuffle_lshr_2v8i16(<8 x i16> %a0, <8 x i16> %a1) {
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: shuffle_lshr_2v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm1
+; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
%lshr0 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%lshr1 = lshr <8 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%bc0 = bitcast <8 x i16> %lshr0 to <16 x i8>
@@ -216,6 +246,13 @@ define <8 x i16> @shuffle_lshr_2v4i32(<4 x i32> %a0, <4 x i32> %a1) {
; AVX-NEXT: vpsrld $31, %xmm1, %xmm1
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: shuffle_lshr_2v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $31, %xmm1, %xmm1
+; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
%lshr0 = lshr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
%lshr1 = lshr <4 x i32> %a1, <i32 31, i32 31, i32 31, i32 31>
%bc0 = bitcast <4 x i32> %lshr0 to <8 x i16>
@@ -245,6 +282,13 @@ define <4 x i32> @shuffle_lshr_2v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; AVX-NEXT: vpsrlq $63, %xmm1, %xmm1
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: shuffle_lshr_2v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlq $63, %xmm1, %xmm1
+; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
%lshr0 = lshr <2 x i64> %a0, <i64 63, i64 63>
%lshr1 = lshr <2 x i64> %a1, <i64 63, i64 63>
%bc0 = bitcast <2 x i64> %lshr0 to <4 x i32>
@@ -274,6 +318,13 @@ define <4 x float> @shuffle_lshr_2v2i64_bitcast(<2 x i64> %a0, <2 x i64> %a1) {
; AVX-NEXT: vpsrlq $63, %xmm1, %xmm1
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: shuffle_lshr_2v2i64_bitcast:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlq $63, %xmm1, %xmm1
+; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
%lshr0 = lshr <2 x i64> %a0, <i64 63, i64 63>
%lshr1 = lshr <2 x i64> %a1, <i64 63, i64 63>
%bc0 = bitcast <2 x i64> %lshr0 to <4 x float>
@@ -318,6 +369,13 @@ define <16 x i8> @packuswb_icmp_zero_128(<8 x i16> %a0) {
; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X64-AVX-NEXT: retq
+;
+; AVX512-LABEL: packuswb_icmp_zero_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512-NEXT: ret{{[l|q]}}
%1 = icmp eq <8 x i16> %a0, zeroinitializer
%2 = zext <8 x i1> %1 to <8 x i8>
%3 = shufflevector <8 x i8> %2, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -340,6 +398,14 @@ define <16 x i8> @packuswb_icmp_zero_trunc_128(<8 x i16> %a0) {
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: packuswb_icmp_zero_trunc_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
%1 = icmp eq <8 x i16> %a0, zeroinitializer
%2 = zext <8 x i1> %1 to <8 x i16>
%3 = shufflevector <8 x i16> %2, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -397,6 +463,14 @@ define <32 x i8> @packuswb_icmp_zero_256(<16 x i16> %a0) {
; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: packuswb_icmp_zero_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsrlw $15, %ymm0, %ymm0
+; AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: ret{{[l|q]}}
%1 = icmp eq <16 x i16> %a0, zeroinitializer
%2 = zext <16 x i1> %1 to <16 x i16>
%3 = bitcast <16 x i16> %2 to <32 x i8>
@@ -439,9 +513,27 @@ define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) {
; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: packuswb_icmp_zero_trunc_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsrlw $15, %ymm0, %ymm0
+; AVX512-NEXT: movb $-52, %al
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512-NEXT: ret{{[l|q]}}
%1 = icmp eq <16 x i16> %a0, zeroinitializer
%2 = zext <16 x i1> %1 to <16 x i16>
%3 = shufflevector <16 x i16> zeroinitializer, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%4 = trunc <32 x i16> %3 to <32 x i8>
ret <32 x i8> %4
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64-AVX2: {{.*}}
+; X64-SSE2: {{.*}}
+; X64-SSE4: {{.*}}
+; X86-AVX2: {{.*}}
+; X86-SSE2: {{.*}}
+; X86-SSE4: {{.*}}
>From 8b25a26da3261103418edb2212e52e9f6e4f15a6 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Sat, 29 Nov 2025 02:53:15 +0100
Subject: [PATCH 4/5] add tests for manual packss/packus
---
llvm/test/CodeGen/X86/packss.ll | 219 +++++++
llvm/test/CodeGen/X86/packus.ll | 972 +++++++++++++++++++++++++++++++-
2 files changed, 1184 insertions(+), 7 deletions(-)
diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll
index eda72c4be3828..20a392134d3c2 100644
--- a/llvm/test/CodeGen/X86/packss.ll
+++ b/llvm/test/CodeGen/X86/packss.ll
@@ -410,3 +410,222 @@ define <32 x i8> @packsswb_icmp_zero_trunc_256(<16 x i16> %a0) {
%4 = trunc <32 x i16> %3 to <32 x i8>
ret <32 x i8> %4
}
+
+
+define <16 x i8> @_mm_packss_epi16_manual(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: _mm_packss_epi16_manual:
+; SSE: # %bb.0:
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: _mm_packss_epi16_manual:
+; AVX: # %bb.0:
+; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: _mm_packss_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %minv = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %sh, <16 x i16> splat (i16 -128))
+ %sat = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %minv, <16 x i16> splat (i16 127))
+ %tr = trunc <16 x i16> %sat to <16 x i8>
+ ret <16 x i8> %tr
+}
+
+define <32 x i8> @_mm256_packss_epi16_manual(<16 x i16> %a, <16 x i16> %b) {
+; X86-SSE-LABEL: _mm256_packss_epi16_manual:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %ebp, -8
+; X86-SSE-NEXT: movl %esp, %ebp
+; X86-SSE-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE-NEXT: andl $-16, %esp
+; X86-SSE-NEXT: subl $16, %esp
+; X86-SSE-NEXT: packsswb %xmm2, %xmm0
+; X86-SSE-NEXT: packsswb 8(%ebp), %xmm1
+; X86-SSE-NEXT: movl %ebp, %esp
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE-NEXT: retl
+;
+; AVX1-LABEL: _mm256_packss_epi16_manual:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX2-LABEL: _mm256_packss_epi16_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: ret{{[l|q]}}
+;
+; X64-SSE-LABEL: _mm256_packss_epi16_manual:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: packsswb %xmm2, %xmm0
+; X64-SSE-NEXT: packsswb %xmm3, %xmm1
+; X64-SSE-NEXT: retq
+;
+; AVX512-LABEL: _mm256_packss_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %minv = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> %sh, <32 x i16> splat (i16 -128))
+ %sat = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> %minv, <32 x i16> splat (i16 127))
+ %tr = trunc <32 x i16> %sat to <32 x i8>
+ ret <32 x i8> %tr
+}
+
+define <64 x i8> @_mm512_packss_epi16_manual(<32 x i16> %a, <32 x i16> %b) {
+; X86-SSE-LABEL: _mm512_packss_epi16_manual:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %ebp, -8
+; X86-SSE-NEXT: movl %esp, %ebp
+; X86-SSE-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE-NEXT: andl $-16, %esp
+; X86-SSE-NEXT: subl $16, %esp
+; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3
+; X86-SSE-NEXT: packsswb 24(%ebp), %xmm0
+; X86-SSE-NEXT: packsswb 40(%ebp), %xmm1
+; X86-SSE-NEXT: packsswb 56(%ebp), %xmm2
+; X86-SSE-NEXT: packsswb 72(%ebp), %xmm3
+; X86-SSE-NEXT: movl %ebp, %esp
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: _mm512_packss_epi16_manual:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: packsswb %xmm4, %xmm0
+; X64-SSE-NEXT: packsswb %xmm5, %xmm1
+; X64-SSE-NEXT: packsswb %xmm6, %xmm2
+; X64-SSE-NEXT: packsswb %xmm7, %xmm3
+; X64-SSE-NEXT: retq
+;
+; AVX512-LABEL: _mm512_packss_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %minv = tail call <64 x i16> @llvm.smax.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 -128))
+ %sat = tail call <64 x i16> @llvm.smin.v64i16(<64 x i16> %minv, <64 x i16> splat (i16 127))
+ %tr = trunc <64 x i16> %sat to <64 x i8>
+ ret <64 x i8> %tr
+}
+
+define <8 x i16> @_mm_packss_epi32_manual(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: _mm_packss_epi32_manual:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: _mm_packss_epi32_manual:
+; AVX: # %bb.0:
+; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: _mm_packss_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %minv = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 -32768))
+ %sat = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %minv, <8 x i32> splat (i32 32767))
+ %tr = trunc <8 x i32> %sat to <8 x i16>
+ ret <8 x i16> %tr
+}
+
+define <16 x i16> @_mm256_packss_epi32_manual(<8 x i32> %a, <8 x i32> %b) {
+; X86-SSE-LABEL: _mm256_packss_epi32_manual:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %ebp, -8
+; X86-SSE-NEXT: movl %esp, %ebp
+; X86-SSE-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE-NEXT: andl $-16, %esp
+; X86-SSE-NEXT: subl $16, %esp
+; X86-SSE-NEXT: packssdw %xmm2, %xmm0
+; X86-SSE-NEXT: packssdw 8(%ebp), %xmm1
+; X86-SSE-NEXT: movl %ebp, %esp
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE-NEXT: retl
+;
+; AVX1-LABEL: _mm256_packss_epi32_manual:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX2-LABEL: _mm256_packss_epi32_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: ret{{[l|q]}}
+;
+; X64-SSE-LABEL: _mm256_packss_epi32_manual:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: packssdw %xmm2, %xmm0
+; X64-SSE-NEXT: packssdw %xmm3, %xmm1
+; X64-SSE-NEXT: retq
+;
+; AVX512-LABEL: _mm256_packss_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %minv = tail call <16 x i32> @llvm.smax.v16i32(<16 x i32> %sh, <16 x i32> splat (i32 -32768))
+ %sat = tail call <16 x i32> @llvm.smin.v16i32(<16 x i32> %minv, <16 x i32> splat (i32 32767))
+ %tr = trunc <16 x i32> %sat to <16 x i16>
+ ret <16 x i16> %tr
+}
+
+define <32 x i16> @_mm512_packss_epi32_manual(<16 x i32> %a, <16 x i32> %b) {
+; X86-SSE-LABEL: _mm512_packss_epi32_manual:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %ebp, -8
+; X86-SSE-NEXT: movl %esp, %ebp
+; X86-SSE-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE-NEXT: andl $-16, %esp
+; X86-SSE-NEXT: subl $16, %esp
+; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3
+; X86-SSE-NEXT: packssdw 24(%ebp), %xmm0
+; X86-SSE-NEXT: packssdw 40(%ebp), %xmm1
+; X86-SSE-NEXT: packssdw 56(%ebp), %xmm2
+; X86-SSE-NEXT: packssdw 72(%ebp), %xmm3
+; X86-SSE-NEXT: movl %ebp, %esp
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: _mm512_packss_epi32_manual:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: packssdw %xmm4, %xmm0
+; X64-SSE-NEXT: packssdw %xmm5, %xmm1
+; X64-SSE-NEXT: packssdw %xmm6, %xmm2
+; X64-SSE-NEXT: packssdw %xmm7, %xmm3
+; X64-SSE-NEXT: retq
+;
+; AVX512-LABEL: _mm512_packss_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
+ %minv = tail call <32 x i32> @llvm.smax.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 -32768))
+ %sat = tail call <32 x i32> @llvm.smin.v32i32(<32 x i32> %minv, <32 x i32> splat (i32 32767))
+ %tr = trunc <32 x i32> %sat to <32 x i16>
+ ret <32 x i16> %tr
+}
diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll
index 90c97e4e9fed0..9cc859e1274e7 100644
--- a/llvm/test/CodeGen/X86/packus.ll
+++ b/llvm/test/CodeGen/X86/packus.ll
@@ -530,10 +530,968 @@ define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) {
%4 = trunc <32 x i16> %3 to <32 x i8>
ret <32 x i8> %4
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-AVX2: {{.*}}
-; X64-SSE2: {{.*}}
-; X64-SSE4: {{.*}}
-; X86-AVX2: {{.*}}
-; X86-SSE2: {{.*}}
-; X86-SSE4: {{.*}}
+
+define <16 x i8> @_mm_packus_epi16_manual(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: _mm_packus_epi16_manual:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: ret{{[l|q]}}
+;
+; SSE4-LABEL: _mm_packus_epi16_manual:
+; SSE4: # %bb.0:
+; SSE4-NEXT: packuswb %xmm1, %xmm0
+; SSE4-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: _mm_packus_epi16_manual:
+; AVX: # %bb.0:
+; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: _mm_packus_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %sat = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %sh, <16 x i16> splat (i16 255))
+ %tr = trunc nuw <16 x i16> %sat to <16 x i8>
+ ret <16 x i8> %tr
+}
+
+define <32 x i8> @_mm256_packus_epi16_manual(<16 x i16> %a, <16 x i16> %b) {
+; X86-SSE2-LABEL: _mm256_packus_epi16_manual:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT: .cfi_offset %ebp, -8
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm5
+; X86-SSE2-NEXT: psubw %xmm5, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm5
+; X86-SSE2-NEXT: psubw %xmm5, %xmm1
+; X86-SSE2-NEXT: packuswb %xmm3, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm3
+; X86-SSE2-NEXT: psubw %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm3
+; X86-SSE2-NEXT: psubw %xmm3, %xmm0
+; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE2-LABEL: _mm256_packus_epi16_manual:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: psubusw %xmm4, %xmm5
+; X64-SSE2-NEXT: psubw %xmm5, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: psubusw %xmm4, %xmm5
+; X64-SSE2-NEXT: psubw %xmm5, %xmm1
+; X64-SSE2-NEXT: packuswb %xmm3, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: psubusw %xmm4, %xmm3
+; X64-SSE2-NEXT: psubw %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: psubusw %xmm4, %xmm3
+; X64-SSE2-NEXT: psubw %xmm3, %xmm0
+; X64-SSE2-NEXT: packuswb %xmm2, %xmm0
+; X64-SSE2-NEXT: retq
+;
+; X86-SSE4-LABEL: _mm256_packus_epi16_manual:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: pushl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE4-NEXT: .cfi_offset %ebp, -8
+; X86-SSE4-NEXT: movl %esp, %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE4-NEXT: andl $-16, %esp
+; X86-SSE4-NEXT: subl $16, %esp
+; X86-SSE4-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-SSE4-NEXT: pminuw %xmm3, %xmm1
+; X86-SSE4-NEXT: pminuw %xmm3, %xmm2
+; X86-SSE4-NEXT: pminuw %xmm3, %xmm0
+; X86-SSE4-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE4-NEXT: pminuw 8(%ebp), %xmm3
+; X86-SSE4-NEXT: packuswb %xmm3, %xmm1
+; X86-SSE4-NEXT: movl %ebp, %esp
+; X86-SSE4-NEXT: popl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-LABEL: _mm256_packus_epi16_manual:
+; X64-SSE4: # %bb.0:
+; X64-SSE4-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X64-SSE4-NEXT: pminuw %xmm4, %xmm3
+; X64-SSE4-NEXT: pminuw %xmm4, %xmm1
+; X64-SSE4-NEXT: packuswb %xmm3, %xmm1
+; X64-SSE4-NEXT: pminuw %xmm4, %xmm2
+; X64-SSE4-NEXT: pminuw %xmm4, %xmm0
+; X64-SSE4-NEXT: packuswb %xmm2, %xmm0
+; X64-SSE4-NEXT: retq
+;
+; AVX1-LABEL: _mm256_packus_epi16_manual:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX2-LABEL: _mm256_packus_epi16_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: _mm256_packus_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %sat = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %sh, <32 x i16> splat (i16 255))
+ %tr = trunc nuw <32 x i16> %sat to <32 x i8>
+ ret <32 x i8> %tr
+}
+
+define <64 x i8> @_mm512_packus_epi16_manual(<32 x i16> %a, <32 x i16> %b) {
+; X86-SSE2-LABEL: _mm512_packus_epi16_manual:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT: .cfi_offset %ebp, -8
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3
+; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm5
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm6
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm6
+; X86-SSE2-NEXT: psubw %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm6
+; X86-SSE2-NEXT: psubw %xmm6, %xmm3
+; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm6
+; X86-SSE2-NEXT: packuswb %xmm5, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm5
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm5
+; X86-SSE2-NEXT: psubw %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm5
+; X86-SSE2-NEXT: psubw %xmm5, %xmm2
+; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm5
+; X86-SSE2-NEXT: packuswb %xmm6, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm6
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm6
+; X86-SSE2-NEXT: psubw %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm6
+; X86-SSE2-NEXT: psubw %xmm6, %xmm1
+; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6
+; X86-SSE2-NEXT: packuswb %xmm5, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm5
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm5
+; X86-SSE2-NEXT: psubw %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X86-SSE2-NEXT: psubusw %xmm4, %xmm5
+; X86-SSE2-NEXT: psubw %xmm5, %xmm0
+; X86-SSE2-NEXT: packuswb %xmm6, %xmm0
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE2-LABEL: _mm512_packus_epi16_manual:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm9
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm9
+; X64-SSE2-NEXT: psubw %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm9
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm9
+; X64-SSE2-NEXT: psubw %xmm9, %xmm3
+; X64-SSE2-NEXT: packuswb %xmm7, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm7
+; X64-SSE2-NEXT: psubw %xmm7, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm7
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm7
+; X64-SSE2-NEXT: psubw %xmm7, %xmm2
+; X64-SSE2-NEXT: packuswb %xmm6, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm6
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm6
+; X64-SSE2-NEXT: psubw %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm6
+; X64-SSE2-NEXT: psubw %xmm6, %xmm1
+; X64-SSE2-NEXT: packuswb %xmm5, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm5
+; X64-SSE2-NEXT: psubw %xmm5, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm5
+; X64-SSE2-NEXT: psubw %xmm5, %xmm0
+; X64-SSE2-NEXT: packuswb %xmm4, %xmm0
+; X64-SSE2-NEXT: retq
+;
+; X86-SSE4-LABEL: _mm512_packus_epi16_manual:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: pushl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE4-NEXT: .cfi_offset %ebp, -8
+; X86-SSE4-NEXT: movl %esp, %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE4-NEXT: andl $-16, %esp
+; X86-SSE4-NEXT: subl $16, %esp
+; X86-SSE4-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm2
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm1
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm0
+; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm5
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm5
+; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm3
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm3
+; X86-SSE4-NEXT: packuswb %xmm5, %xmm3
+; X86-SSE4-NEXT: movdqa 56(%ebp), %xmm5
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm5
+; X86-SSE4-NEXT: packuswb %xmm5, %xmm2
+; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm5
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm5
+; X86-SSE4-NEXT: packuswb %xmm5, %xmm1
+; X86-SSE4-NEXT: pminuw 24(%ebp), %xmm4
+; X86-SSE4-NEXT: packuswb %xmm4, %xmm0
+; X86-SSE4-NEXT: movl %ebp, %esp
+; X86-SSE4-NEXT: popl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-LABEL: _mm512_packus_epi16_manual:
+; X64-SSE4: # %bb.0:
+; X64-SSE4-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm7
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm3
+; X64-SSE4-NEXT: packuswb %xmm7, %xmm3
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm6
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm2
+; X64-SSE4-NEXT: packuswb %xmm6, %xmm2
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm5
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm1
+; X64-SSE4-NEXT: packuswb %xmm5, %xmm1
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm4
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm0
+; X64-SSE4-NEXT: packuswb %xmm4, %xmm0
+; X64-SSE4-NEXT: retq
+;
+; X86-AVX1-LABEL: _mm512_packus_epi16_manual:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: pushl %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %ebp, -8
+; X86-AVX1-NEXT: movl %esp, %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
+; X86-AVX1-NEXT: andl $-32, %esp
+; X86-AVX1-NEXT: subl $32, %esp
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm5
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm6
+; X86-AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5
+; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminuw 8(%ebp), %xmm3, %xmm2
+; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpminuw 24(%ebp), %xmm3, %xmm3
+; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; X86-AVX1-NEXT: movl %ebp, %esp
+; X86-AVX1-NEXT: popl %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX1-NEXT: retl
+;
+; X64-AVX1-LABEL: _mm512_packus_epi16_manual:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm5
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm6
+; X64-AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5
+; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm2, %xmm3
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm6
+; X64-AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; X64-AVX1-NEXT: retq
+;
+; X86-AVX2-LABEL: _mm512_packus_epi16_manual:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: pushl %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %ebp, -8
+; X86-AVX2-NEXT: movl %esp, %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa_register %ebp
+; X86-AVX2-NEXT: andl $-32, %esp
+; X86-AVX2-NEXT: subl $32, %esp
+; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm3
+; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X86-AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
+; X86-AVX2-NEXT: vpminuw %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT: vpminuw %ymm3, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminuw %ymm3, %ymm4, %ymm2
+; X86-AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; X86-AVX2-NEXT: movl %ebp, %esp
+; X86-AVX2-NEXT: popl %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: _mm512_packus_epi16_manual:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpminuw %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vpminuw %ymm3, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminuw %ymm3, %ymm4, %ymm2
+; X64-AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; X64-AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm512_packus_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %sat = tail call <64 x i16> @llvm.umin.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 255))
+ %tr = trunc nuw <64 x i16> %sat to <64 x i8>
+ ret <64 x i8> %tr
+}
+
+define <8 x i16> @_mm_packus_epi32_manual(<4 x i32> %a, <4 x i32> %b) {
+; X86-SSE2-LABEL: _mm_packus_epi32_manual:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE2-LABEL: _mm_packus_epi32_manual:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pslld $16, %xmm4
+; X64-SSE2-NEXT: psrad $16, %xmm4
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm4, %xmm0
+; X64-SSE2-NEXT: retq
+;
+; SSE4-LABEL: _mm_packus_epi32_manual:
+; SSE4: # %bb.0:
+; SSE4-NEXT: packusdw %xmm1, %xmm0
+; SSE4-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: _mm_packus_epi32_manual:
+; AVX: # %bb.0:
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: _mm_packus_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %sat = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 65535))
+ %tr = trunc nuw <8 x i32> %sat to <8 x i16>
+ ret <8 x i16> %tr
+}
+
+define <16 x i16> @_mm256_packus_epi32_manual(<8 x i32> %a, <8 x i32> %b) {
+; X86-SSE2-LABEL: _mm256_packus_epi32_manual:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT: .cfi_offset %ebp, -8
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $32, %esp
+; X86-SSE2-NEXT: movaps %xmm2, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm6
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm6, %xmm4
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm7
+; X86-SSE2-NEXT: pxor %xmm5, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm6, %xmm4
+; X86-SSE2-NEXT: por %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm5, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pxor %xmm6, %xmm7
+; X86-SSE2-NEXT: por %xmm7, %xmm0
+; X86-SSE2-NEXT: movdqa (%esp), %xmm2 # 16-byte Reload
+; X86-SSE2-NEXT: pxor %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm3
+; X86-SSE2-NEXT: pxor %xmm3, %xmm6
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: por %xmm6, %xmm3
+; X86-SSE2-NEXT: pslld $16, %xmm3
+; X86-SSE2-NEXT: psrad $16, %xmm3
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: packssdw %xmm3, %xmm0
+; X86-SSE2-NEXT: pslld $16, %xmm4
+; X86-SSE2-NEXT: psrad $16, %xmm4
+; X86-SSE2-NEXT: pslld $16, %xmm1
+; X86-SSE2-NEXT: psrad $16, %xmm1
+; X86-SSE2-NEXT: packssdw %xmm4, %xmm1
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE2-LABEL: _mm256_packus_epi32_manual:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pxor %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm8
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm8
+; X64-SSE2-NEXT: pcmpeqd %xmm7, %xmm7
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pxor %xmm7, %xmm8
+; X64-SSE2-NEXT: por %xmm8, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm8
+; X64-SSE2-NEXT: pxor %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm8, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm3
+; X64-SSE2-NEXT: pxor %xmm7, %xmm5
+; X64-SSE2-NEXT: por %xmm3, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm6, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm8
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm8
+; X64-SSE2-NEXT: pand %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm7, %xmm8
+; X64-SSE2-NEXT: por %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm2, %xmm6
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm7, %xmm4
+; X64-SSE2-NEXT: pslld $16, %xmm4
+; X64-SSE2-NEXT: psrad $16, %xmm4
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm4, %xmm0
+; X64-SSE2-NEXT: pslld $16, %xmm5
+; X64-SSE2-NEXT: psrad $16, %xmm5
+; X64-SSE2-NEXT: pslld $16, %xmm1
+; X64-SSE2-NEXT: psrad $16, %xmm1
+; X64-SSE2-NEXT: packssdw %xmm5, %xmm1
+; X64-SSE2-NEXT: retq
+;
+; X86-SSE4-LABEL: _mm256_packus_epi32_manual:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: pushl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE4-NEXT: .cfi_offset %ebp, -8
+; X86-SSE4-NEXT: movl %esp, %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE4-NEXT: andl $-16, %esp
+; X86-SSE4-NEXT: subl $16, %esp
+; X86-SSE4-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
+; X86-SSE4-NEXT: pminud %xmm3, %xmm1
+; X86-SSE4-NEXT: pminud %xmm3, %xmm2
+; X86-SSE4-NEXT: pminud %xmm3, %xmm0
+; X86-SSE4-NEXT: packusdw %xmm2, %xmm0
+; X86-SSE4-NEXT: pminud 8(%ebp), %xmm3
+; X86-SSE4-NEXT: packusdw %xmm3, %xmm1
+; X86-SSE4-NEXT: movl %ebp, %esp
+; X86-SSE4-NEXT: popl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-LABEL: _mm256_packus_epi32_manual:
+; X64-SSE4: # %bb.0:
+; X64-SSE4-NEXT: pmovsxbw {{.*#+}} xmm4 = [65535,0,65535,0,65535,0,65535,0]
+; X64-SSE4-NEXT: pminud %xmm4, %xmm3
+; X64-SSE4-NEXT: pminud %xmm4, %xmm1
+; X64-SSE4-NEXT: packusdw %xmm3, %xmm1
+; X64-SSE4-NEXT: pminud %xmm4, %xmm2
+; X64-SSE4-NEXT: pminud %xmm4, %xmm0
+; X64-SSE4-NEXT: packusdw %xmm2, %xmm0
+; X64-SSE4-NEXT: retq
+;
+; AVX1-LABEL: _mm256_packus_epi32_manual:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX2-LABEL: _mm256_packus_epi32_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: _mm256_packus_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %sat = tail call <16 x i32> @llvm.umin.v16i32(<16 x i32> %sh, <16 x i32> splat (i32 65535))
+ %tr = trunc nuw <16 x i32> %sat to <16 x i16>
+ ret <16 x i16> %tr
+}
+
+define <32 x i16> @_mm512_packus_epi32_manual(<16 x i32> %a, <16 x i32> %b) {
+; X86-SSE2-LABEL: _mm512_packus_epi32_manual:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT: .cfi_offset %ebp, -8
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $80, %esp
+; X86-SSE2-NEXT: movaps %xmm1, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm0
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pxor %xmm6, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpeqd %xmm7, %xmm7
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pxor %xmm7, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X86-SSE2-NEXT: pxor %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pxor %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: pxor %xmm6, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pxor %xmm7, %xmm5
+; X86-SSE2-NEXT: por %xmm5, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm6, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm7, %xmm2
+; X86-SSE2-NEXT: por %xmm5, %xmm2
+; X86-SSE2-NEXT: movdqa (%esp), %xmm1 # 16-byte Reload
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: pxor %xmm6, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pxor %xmm7, %xmm5
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pxor %xmm6, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm7, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-SSE2-NEXT: pxor %xmm6, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pxor %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm0
+; X86-SSE2-NEXT: pxor %xmm0, %xmm6
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm3
+; X86-SSE2-NEXT: pxor %xmm3, %xmm7
+; X86-SSE2-NEXT: pand %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm7, %xmm3
+; X86-SSE2-NEXT: pslld $16, %xmm3
+; X86-SSE2-NEXT: psrad $16, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: packssdw %xmm3, %xmm0
+; X86-SSE2-NEXT: pslld $16, %xmm5
+; X86-SSE2-NEXT: psrad $16, %xmm5
+; X86-SSE2-NEXT: movdqa (%esp), %xmm1 # 16-byte Reload
+; X86-SSE2-NEXT: pslld $16, %xmm1
+; X86-SSE2-NEXT: psrad $16, %xmm1
+; X86-SSE2-NEXT: packssdw %xmm5, %xmm1
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
+; X86-SSE2-NEXT: pslld $16, %xmm3
+; X86-SSE2-NEXT: psrad $16, %xmm3
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
+; X86-SSE2-NEXT: pslld $16, %xmm3
+; X86-SSE2-NEXT: psrad $16, %xmm3
+; X86-SSE2-NEXT: pslld $16, %xmm4
+; X86-SSE2-NEXT: psrad $16, %xmm4
+; X86-SSE2-NEXT: packssdw %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE2-LABEL: _mm512_packus_epi32_manual:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm8
+; X64-SSE2-NEXT: pxor %xmm10, %xmm8
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183,2147549183,2147549183]
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm12
+; X64-SSE2-NEXT: pcmpgtd %xmm8, %xmm12
+; X64-SSE2-NEXT: pcmpeqd %xmm11, %xmm11
+; X64-SSE2-NEXT: pand %xmm12, %xmm3
+; X64-SSE2-NEXT: pxor %xmm11, %xmm12
+; X64-SSE2-NEXT: por %xmm12, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm12
+; X64-SSE2-NEXT: pxor %xmm10, %xmm12
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm8
+; X64-SSE2-NEXT: pcmpgtd %xmm12, %xmm8
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pxor %xmm11, %xmm8
+; X64-SSE2-NEXT: por %xmm7, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm7
+; X64-SSE2-NEXT: pxor %xmm10, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm12
+; X64-SSE2-NEXT: pcmpgtd %xmm7, %xmm12
+; X64-SSE2-NEXT: pand %xmm12, %xmm2
+; X64-SSE2-NEXT: pxor %xmm11, %xmm12
+; X64-SSE2-NEXT: por %xmm12, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm12
+; X64-SSE2-NEXT: pxor %xmm10, %xmm12
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm12, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm6
+; X64-SSE2-NEXT: pxor %xmm11, %xmm7
+; X64-SSE2-NEXT: por %xmm6, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm10, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm12
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm12
+; X64-SSE2-NEXT: pand %xmm12, %xmm1
+; X64-SSE2-NEXT: pxor %xmm11, %xmm12
+; X64-SSE2-NEXT: por %xmm12, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm12
+; X64-SSE2-NEXT: pxor %xmm10, %xmm12
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm6
+; X64-SSE2-NEXT: pcmpgtd %xmm12, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm5
+; X64-SSE2-NEXT: pxor %xmm11, %xmm6
+; X64-SSE2-NEXT: por %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm10, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm12
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm12
+; X64-SSE2-NEXT: pand %xmm12, %xmm0
+; X64-SSE2-NEXT: pxor %xmm11, %xmm12
+; X64-SSE2-NEXT: por %xmm12, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm10
+; X64-SSE2-NEXT: pcmpgtd %xmm10, %xmm9
+; X64-SSE2-NEXT: pxor %xmm9, %xmm11
+; X64-SSE2-NEXT: pand %xmm4, %xmm9
+; X64-SSE2-NEXT: por %xmm11, %xmm9
+; X64-SSE2-NEXT: pslld $16, %xmm9
+; X64-SSE2-NEXT: psrad $16, %xmm9
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm9, %xmm0
+; X64-SSE2-NEXT: pslld $16, %xmm6
+; X64-SSE2-NEXT: psrad $16, %xmm6
+; X64-SSE2-NEXT: pslld $16, %xmm1
+; X64-SSE2-NEXT: psrad $16, %xmm1
+; X64-SSE2-NEXT: packssdw %xmm6, %xmm1
+; X64-SSE2-NEXT: pslld $16, %xmm7
+; X64-SSE2-NEXT: psrad $16, %xmm7
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: psrad $16, %xmm2
+; X64-SSE2-NEXT: packssdw %xmm7, %xmm2
+; X64-SSE2-NEXT: pslld $16, %xmm8
+; X64-SSE2-NEXT: psrad $16, %xmm8
+; X64-SSE2-NEXT: pslld $16, %xmm3
+; X64-SSE2-NEXT: psrad $16, %xmm3
+; X64-SSE2-NEXT: packssdw %xmm8, %xmm3
+; X64-SSE2-NEXT: retq
+;
+; X86-SSE4-LABEL: _mm512_packus_epi32_manual:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: pushl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE4-NEXT: .cfi_offset %ebp, -8
+; X86-SSE4-NEXT: movl %esp, %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE4-NEXT: andl $-16, %esp
+; X86-SSE4-NEXT: subl $16, %esp
+; X86-SSE4-NEXT: pmovsxbw {{.*#+}} xmm4 = [65535,0,65535,0,65535,0,65535,0]
+; X86-SSE4-NEXT: pminud %xmm4, %xmm2
+; X86-SSE4-NEXT: pminud %xmm4, %xmm1
+; X86-SSE4-NEXT: pminud %xmm4, %xmm0
+; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm5
+; X86-SSE4-NEXT: pminud %xmm4, %xmm5
+; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm3
+; X86-SSE4-NEXT: pminud %xmm4, %xmm3
+; X86-SSE4-NEXT: packusdw %xmm5, %xmm3
+; X86-SSE4-NEXT: movdqa 56(%ebp), %xmm5
+; X86-SSE4-NEXT: pminud %xmm4, %xmm5
+; X86-SSE4-NEXT: packusdw %xmm5, %xmm2
+; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm5
+; X86-SSE4-NEXT: pminud %xmm4, %xmm5
+; X86-SSE4-NEXT: packusdw %xmm5, %xmm1
+; X86-SSE4-NEXT: pminud 24(%ebp), %xmm4
+; X86-SSE4-NEXT: packusdw %xmm4, %xmm0
+; X86-SSE4-NEXT: movl %ebp, %esp
+; X86-SSE4-NEXT: popl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-LABEL: _mm512_packus_epi32_manual:
+; X64-SSE4: # %bb.0:
+; X64-SSE4-NEXT: pmovsxbw {{.*#+}} xmm8 = [65535,0,65535,0,65535,0,65535,0]
+; X64-SSE4-NEXT: pminud %xmm8, %xmm7
+; X64-SSE4-NEXT: pminud %xmm8, %xmm3
+; X64-SSE4-NEXT: packusdw %xmm7, %xmm3
+; X64-SSE4-NEXT: pminud %xmm8, %xmm6
+; X64-SSE4-NEXT: pminud %xmm8, %xmm2
+; X64-SSE4-NEXT: packusdw %xmm6, %xmm2
+; X64-SSE4-NEXT: pminud %xmm8, %xmm5
+; X64-SSE4-NEXT: pminud %xmm8, %xmm1
+; X64-SSE4-NEXT: packusdw %xmm5, %xmm1
+; X64-SSE4-NEXT: pminud %xmm8, %xmm4
+; X64-SSE4-NEXT: pminud %xmm8, %xmm0
+; X64-SSE4-NEXT: packusdw %xmm4, %xmm0
+; X64-SSE4-NEXT: retq
+;
+; X86-AVX1-LABEL: _mm512_packus_epi32_manual:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: pushl %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %ebp, -8
+; X86-AVX1-NEXT: movl %esp, %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
+; X86-AVX1-NEXT: andl $-32, %esp
+; X86-AVX1-NEXT: subl $32, %esp
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm5
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm6
+; X86-AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
+; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminud 8(%ebp), %xmm3, %xmm2
+; X86-AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpminud 24(%ebp), %xmm3, %xmm3
+; X86-AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; X86-AVX1-NEXT: movl %ebp, %esp
+; X86-AVX1-NEXT: popl %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX1-NEXT: retl
+;
+; X64-AVX1-LABEL: _mm512_packus_epi32_manual:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm5
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm6
+; X64-AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
+; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm3
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm0, %xmm6
+; X64-AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; X64-AVX1-NEXT: retq
+;
+; X86-AVX2-LABEL: _mm512_packus_epi32_manual:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: pushl %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %ebp, -8
+; X86-AVX2-NEXT: movl %esp, %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa_register %ebp
+; X86-AVX2-NEXT: andl $-32, %esp
+; X86-AVX2-NEXT: subl $32, %esp
+; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm3
+; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; X86-AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
+; X86-AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminud %ymm3, %ymm4, %ymm2
+; X86-AVX2-NEXT: vpackusdw %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; X86-AVX2-NEXT: movl %ebp, %esp
+; X86-AVX2-NEXT: popl %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: _mm512_packus_epi32_manual:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; X64-AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminud %ymm3, %ymm4, %ymm2
+; X64-AVX2-NEXT: vpackusdw %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; X64-AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm512_packus_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
+ %sat = tail call <32 x i32> @llvm.umin.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 65535))
+ %tr = trunc nuw <32 x i32> %sat to <32 x i16>
+ ret <32 x i16> %tr
+}
+
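
As a rough sketch of the source-level idiom these "_manual" tests model (not
taken from the patch; written with clang vector extensions, and the function
name and types are illustrative only): clamp each 16-bit lane to the u8 range,
concatenate the two halves, and truncate. This is the shufflevector +
llvm.umin + trunc pattern in the IR above, which the combine is expected to
select as a single packuswb.

typedef unsigned short u16x8 __attribute__((vector_size(16)));
typedef unsigned char u8x16 __attribute__((vector_size(16)));

static u8x16 packus_epi16_manual(u16x8 a, u16x8 b) {
  const u16x8 lim = {255, 255, 255, 255, 255, 255, 255, 255};
  u16x8 ca = __builtin_elementwise_min(a, lim); // unsigned clamp to [0, 255]
  u16x8 cb = __builtin_elementwise_min(b, lim);
  // Concatenate the clamped halves and truncate each lane to 8 bits.
  return __builtin_convertvector(
      __builtin_shufflevector(ca, cb, 0, 1, 2, 3, 4, 5, 6, 7,
                              8, 9, 10, 11, 12, 13, 14, 15),
      u8x16);
}
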
>From 6d9617475a1773b192a66dcf34fe5ebeb8fe1e15 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Sat, 29 Nov 2025 14:33:55 +0100
Subject: [PATCH 5/5] update other tests
---
llvm/test/CodeGen/X86/combine-sub-usat.ll | 6 ------
.../CodeGen/X86/masked_store_trunc_usat.ll | 6 ------
llvm/test/CodeGen/X86/psubus.ll | 18 ------------------
llvm/test/CodeGen/X86/vector-trunc-usat.ll | 6 ------
4 files changed, 36 deletions(-)
diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll
index 36e374bd2e67c..86d2ae3d0f800 100644
--- a/llvm/test/CodeGen/X86/combine-sub-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll
@@ -251,18 +251,12 @@ define <8 x i16> @combine_trunc_v8i32_v8i16(<8 x i16> %a0, <8 x i32> %a1) {
;
; SSE41-LABEL: combine_trunc_v8i32_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: psubusw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: combine_trunc_v8i32_v8i16:
; SSE42: # %bb.0:
-; SSE42-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE42-NEXT: pminud %xmm3, %xmm2
-; SSE42-NEXT: pminud %xmm3, %xmm1
; SSE42-NEXT: packusdw %xmm2, %xmm1
; SSE42-NEXT: psubusw %xmm1, %xmm0
; SSE42-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 4c4b6e78d1f8c..06ef2293fc20c 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -4383,9 +4383,6 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; SSE4-LABEL: truncstore_v8i32_v8i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: pmovsxbw {{.*#+}} xmm5 = [65535,0,65535,0,65535,0,65535,0]
-; SSE4-NEXT: pminud %xmm5, %xmm1
-; SSE4-NEXT: pminud %xmm5, %xmm0
; SSE4-NEXT: packusdw %xmm1, %xmm0
; SSE4-NEXT: pcmpeqd %xmm4, %xmm3
; SSE4-NEXT: pcmpeqd %xmm4, %xmm2
@@ -7303,9 +7300,6 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) {
; SSE4-LABEL: truncstore_v16i16_v16i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; SSE4-NEXT: pminuw %xmm4, %xmm1
-; SSE4-NEXT: pminuw %xmm4, %xmm0
; SSE4-NEXT: packuswb %xmm1, %xmm0
; SSE4-NEXT: pcmpeqb %xmm2, %xmm3
; SSE4-NEXT: pmovmskb %xmm3, %eax
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index e10b360b35b56..a26112397053d 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -793,9 +793,6 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: test13:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: psubusw %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1047,9 +1044,6 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: test15:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: psubusw %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1565,9 +1559,6 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: psubus_8i32_max:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: psubusw %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1972,9 +1963,6 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin
;
; SSE41-LABEL: psubus_i16_i32_max_swapped:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: psubusw %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -2067,9 +2055,6 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: psubus_i16_i32_min:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: psubusw %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -2656,9 +2641,6 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) {
;
; SSE41-LABEL: test32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm2, %xmm1
; SSE41-NEXT: psubusw %xmm1, %xmm0
; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 0806e4960e48a..930758d734d91 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -1383,9 +1383,6 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
;
; SSE41-LABEL: trunc_usat_v8i32_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm2, %xmm1
-; SSE41-NEXT: pminud %xmm2, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -3424,9 +3421,6 @@ define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) {
;
; SSE41-LABEL: trunc_usat_v16i16_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pminuw %xmm2, %xmm1
-; SSE41-NEXT: pminuw %xmm2, %xmm0
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;