[llvm] [X86] optimize masked truncated saturating stores (PR #169827)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 27 08:08:39 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Folkert de Vries (folkertdev)
Combine the saturating operation into the masked truncating store. Previously this combine did not fire, so the saturating clamp was emitted as a separate instruction in front of a plain masked truncating store, as the manual variant below shows:
https://godbolt.org/z/n1YfavKP6
```asm
_mm256_mask_cvtusepi16_storeu_epi8_manual: # @_mm256_mask_cvtusepi16_storeu_epi8_manual
kmovd k1, esi
vmovdqa ymm0, ymmword ptr [rdx]
vpminuw ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
vpmovwb xmmword ptr [rdi] {k1}, ymm0
vzeroupper
ret
_mm256_mask_cvtusepi16_storeu_epi8_intrinsic: # @_mm256_mask_cvtusepi16_storeu_epi8_intrinsic
kmovd k1, esi
vmovdqa ymm0, ymmword ptr [rdx]
vpmovuswb xmmword ptr [rdi] {k1}, ymm0
vzeroupper
ret
```
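For reference, the Godbolt output above is consistent with C source along the following lines (a reconstruction, not taken from the PR; the parameter order and types are inferred from the registers in the asm), compiled with `-O2 -mavx512bw -mavx512vl`:
```c
#include <immintrin.h>

// Manual unsigned-saturating truncate: clamp each u16 lane to 255, then do a
// masked truncating store. Before this patch the clamp (vpminuw) was not
// folded into the store.
void _mm256_mask_cvtusepi16_storeu_epi8_manual(void *p, __mmask16 k,
                                               const __m256i *x) {
  __m256i v = _mm256_load_si256(x);
  v = _mm256_min_epu16(v, _mm256_set1_epi16(255));
  _mm256_mask_cvtepi16_storeu_epi8(p, k, v); // masked vpmovwb
}

// The dedicated intrinsic already selected the single saturating
// truncate-store instruction (vpmovuswb).
void _mm256_mask_cvtusepi16_storeu_epi8_intrinsic(void *p, __mmask16 k,
                                                  const __m256i *x) {
  _mm256_mask_cvtusepi16_storeu_epi8(p, k, _mm256_load_si256(x));
}
```
With this patch, both variants compile to the single `vpmovuswb` masked store.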
---
Patch is 29.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169827.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+46-11)
- (modified) llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll (+60-68)
- (modified) llvm/test/CodeGen/X86/masked_store_trunc_usat.ll (+57-50)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d49f25a950e3a..459d00c29683c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53434,18 +53434,55 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
if (Mst->isCompressingStore())
return SDValue();
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
+ return ScalarStore;
+
EVT VT = Mst->getValue().getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc DL(N);
+
+ EVT ValVT = Mst->getValue().getValueType();
+ EVT MemVT = Mst->getMemoryVT();
+
+ SDValue Mask = Mst->getMask();
+ SDValue Value = Mst->getValue();
+
+ // See if the truncating store can be a saturating truncated store.
+ if (Mst->isTruncatingStore() && ValVT.isVector() && MemVT.isVector() &&
+ ValVT.getScalarType().isInteger() && MemVT.getScalarType().isInteger() &&
+ ValVT.getVectorNumElements() == MemVT.getVectorNumElements() &&
+ Subtarget.hasBWI() && Subtarget.hasVLX()) {
+
+ SDValue SatSrc;
+ bool IsSigned = false;
+ if (SDValue SVal = detectSSatPattern(Value, MemVT)) {
+ SatSrc = SVal;
+ IsSigned = true;
+ } else if (SDValue UVal = detectUSatPattern(Value, MemVT, DAG, DL)) {
+ SatSrc = UVal;
+ }
+
+ if (SatSrc) {
+ unsigned Opc = IsSigned ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(Mst->getChain());
+ Ops.push_back(SatSrc);
+ Ops.push_back(Mst->getBasePtr());
+ Ops.push_back(Mask);
+
+ MachineMemOperand *MMO = Mst->getMemOperand();
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
+ }
+ }
+
+ // Otherwise don't combine if this store already truncates.
if (Mst->isTruncatingStore())
return SDValue();
- if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
- return ScalarStore;
-
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
- SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
@@ -53461,14 +53498,12 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
Mst->getAddressingMode());
}
- SDValue Value = Mst->getValue();
if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
- TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
- Mst->getMemoryVT())) {
- return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
- Mst->getBasePtr(), Mst->getOffset(), Mask,
- Mst->getMemoryVT(), Mst->getMemOperand(),
- Mst->getAddressingMode(), true);
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), MemVT)) {
+ return DAG.getMaskedStore(Mst->getChain(), DL, Value.getOperand(0),
+ Mst->getBasePtr(), Mst->getOffset(), Mask, MemVT,
+ Mst->getMemOperand(), Mst->getAddressingMode(),
+ true);
}
return SDValue();
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 18d394e1281b4..93b942a1a8acd 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -350,14 +350,21 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v8i64_v8i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v8i64_v8i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512FVL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: vzeroupper
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v8i64_v8i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512BWVL-NEXT: vpmovsqd %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
%b = icmp slt <8 x i64> %x, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
%c = select <8 x i1> %b, <8 x i64> %x, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
@@ -964,9 +971,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i64_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqw %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1572,9 +1577,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i64_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1788,14 +1791,21 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v4i64_v4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1}
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v4i64_v4i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512FVL-NEXT: vpmovqd %ymm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: vzeroupper
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v4i64_v4i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512BWVL-NEXT: vpmovsqd %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
%b = icmp slt <4 x i64> %x, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
%c = select <4 x i1> %b, <4 x i64> %x, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
@@ -2141,9 +2151,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i64_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqw %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
@@ -2495,9 +2503,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i64_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqb %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
@@ -2641,13 +2647,19 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v2i64_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1}
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v2i64_v2i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512FVL-NEXT: vpmovqd %xmm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v2i64_v2i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512BWVL-NEXT: vpmovsqd %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: retq
%a = icmp ne <2 x i64> %mask, zeroinitializer
%b = icmp slt <2 x i64> %x, <i64 2147483647, i64 2147483647>
%c = select <2 x i1> %b, <2 x i64> %x, <2 x i64> <i64 2147483647, i64 2147483647>
@@ -2832,9 +2844,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX512BWVL-LABEL: truncstore_v2i64_v2i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqw %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <2 x i64> %mask, zeroinitializer
%b = icmp slt <2 x i64> %x, <i64 32767, i64 32767>
@@ -3018,9 +3028,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX512BWVL-LABEL: truncstore_v2i64_v2i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqb %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <2 x i64> %mask, zeroinitializer
%b = icmp slt <2 x i64> %x, <i64 127, i64 127>
@@ -3816,9 +3824,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v16i32_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdw %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <16 x i32> %mask, zeroinitializer
@@ -4594,9 +4600,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v16i32_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <16 x i32> %mask, zeroinitializer
@@ -5034,9 +5038,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i32_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdw %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -5473,9 +5475,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i32_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -5686,9 +5686,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i32_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdw %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
%b = icmp slt <4 x i32> %x, <i32 32767, i32 32767, i32 32767, i32 32767>
@@ -5904,9 +5902,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i32_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdb %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
%b = icmp slt <4 x i32> %x, <i32 127, i32 127, i32 127, i32 127>
@@ -7332,9 +7328,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
; AVX512BWVL-LABEL: truncstore_v32i16_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovswb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <32 x i8> %mask, zeroinitializer
@@ -8083,9 +8077,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) {
; AVX512BWVL-LABEL: truncstore_v16i16_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovswb %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <16 x i8> %mask, zeroinitializer
@@ -8445,9 +8437,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i16_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovswb %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i16> %mask, zeroinitializer
%b = icmp slt <8 x i16> %x, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
@@ -8471,3 +8461,5 @@ declare void @llvm.masked.store.v2i8.p0(<2 x i8>, ptr, i32, <2 x i1>)
declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512VL: {{.*}}
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 4c4b6e78d1f8c..cb6b4cc95b530 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -281,13 +281,20 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v8i64_v8i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v8i64_v8i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512FVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512FVL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: vzeroupper
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v8i64_v8i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512BWVL-NEXT: vpmovusqd %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
%b = icmp ult <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%c = select <8 x i1> %b, <8 x i64> %x, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -829,8 +836,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i64_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusqw %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1367,8 +1373,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i64_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusqb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1547,13 +1552,20 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX51...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/169827