[llvm] [X86] Optimize v4i16->v4i8 truncating stores via v4i32 widening (PR #186676)
Jaydeep Chauhan via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 17 04:06:55 PDT 2026
https://github.com/JaydeepChauhan14 updated https://github.com/llvm/llvm-project/pull/186676
>From 2f3cf585137f01d8aab928285e3cbdfee033c5db Mon Sep 17 00:00:00 2001
From: Chauhan Jaydeep Ashwinbhai <chauhan.jaydeep.ashwinbhai at intel.com>
Date: Sun, 15 Mar 2026 08:25:28 -0700
Subject: [PATCH 1/3] [X86] Optimize v4i16->v4i8 truncating stores via v4i32
widening
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 14 ++++++++++
.../test/CodeGen/X86/combine-storetomstore.ll | 26 +++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4fbbf63c39065..ea15cbcf4cde5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54294,6 +54294,20 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
EVT VT = StoredVal.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Pattern: store(trunc(load v4i16) to v4i8)
+ if (!St->isTruncatingStore() && VT == MVT::v4i8 && Subtarget.hasAVX512() &&
+ StoredVal.getOpcode() == ISD::TRUNCATE &&
+ StoredVal.getOperand(0).getValueType() == MVT::v4i16 &&
+ StoredVal.hasOneUse() && TLI.isTruncStoreLegal(MVT::v4i32, MVT::v4i8)) {
+
+ SDValue Src = StoredVal.getOperand(0);
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v4i32, Src);
+ return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
+ MVT::v4i8, St->getMemOperand());
+ }
+ }
+
// Convert a store of vXi1 into a store of iX and a bitcast.
if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
VT.getVectorElementType() == MVT::i1) {
diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll
index 45a1172b2323e..b6b189712c43a 100644
--- a/llvm/test/CodeGen/X86/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll
@@ -1538,3 +1538,29 @@ define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i
store <8 x i64> %sel, ptr %ptr_vec, align 1
ret void
}
+
+define void @cast_i16x4_to_u8x4(ptr %a0, ptr %a1) {
+; AVX-LABEL: cast_i16x4_to_u8x4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: cast_i16x4_to_u8x4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cast_i16x4_to_u8x4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX512-NEXT: vpmovdb %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ %1 = load <4 x i16>, ptr %a1
+ %2 = trunc <4 x i16> %1 to <4 x i8>
+ store <4 x i8> %2, ptr %a0
+ ret void
+}
>From b24d87898803ff23164d1d10f472cee45a00b74b Mon Sep 17 00:00:00 2001
From: Chauhan Jaydeep Ashwinbhai <chauhan.jaydeep.ashwinbhai at intel.com>
Date: Tue, 17 Mar 2026 00:36:17 -0700
Subject: [PATCH 2/3] Addressed review comments (round 1)
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 19 +++---
llvm/test/CodeGen/X86/avx512-trunc.ll | 59 +++++++++++++++++++
.../test/CodeGen/X86/combine-storetomstore.ll | 26 --------
3 files changed, 68 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ea15cbcf4cde5..7a5322e6f8983 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54287,6 +54287,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
StoreSDNode *St = cast<StoreSDNode>(N);
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
@@ -54295,17 +54296,15 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Pattern: store(trunc(load v4i16) to v4i8)
+ SDValue Src;
if (!St->isTruncatingStore() && VT == MVT::v4i8 && Subtarget.hasAVX512() &&
- StoredVal.getOpcode() == ISD::TRUNCATE &&
- StoredVal.getOperand(0).getValueType() == MVT::v4i16 &&
- StoredVal.hasOneUse() && TLI.isTruncStoreLegal(MVT::v4i32, MVT::v4i8)) {
-
- SDValue Src = StoredVal.getOperand(0);
- if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
- SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v4i32, Src);
- return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
- MVT::v4i8, St->getMemOperand());
- }
+ TLI.isTruncStoreLegal(MVT::v4i32, MVT::v4i8) &&
+ sd_match(StoredVal, m_OneUse(m_Trunc(m_Value(
+ Src, m_OneUse(m_SpecificVT(MVT::v4i16)))))) &&
+ ISD::isNormalLoad(Src.getNode())) {
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v4i32, Src);
+ return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
+ MVT::v4i8, St->getMemOperand());
}
// Convert a store of vXi1 into a store of iX and a bitcast.
diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll
index 0e9e95adfa3e2..98c31e2936e4b 100644
--- a/llvm/test/CodeGen/X86/avx512-trunc.ll
+++ b/llvm/test/CodeGen/X86/avx512-trunc.ll
@@ -1074,3 +1074,62 @@ define void @ssat_trunc_db_1024_mem(<32 x i32> %i, ptr %p) {
ret void
}
+; Test load-trunc-store pattern optimization for v4i16 -> v4i8
+define void @test_trunc_v4i16_v4i8(ptr %dst, ptr %src) {
+; KNL-LABEL: test_trunc_v4i16_v4i8:
+; KNL: ## %bb.0:
+; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vmovd %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_trunc_v4i16_v4i8:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SKX-NEXT: vpmovdb %xmm0, (%rdi)
+; SKX-NEXT: retq
+ %1 = load <4 x i16>, ptr %src
+ %2 = trunc <4 x i16> %1 to <4 x i8>
+ store <4 x i8> %2, ptr %dst
+ ret void
+}
+
+define void @test_truncs_v4i16_v4i8(ptr %dst, ptr %src) {
+; ALL-LABEL: test_truncs_v4i16_v4i8:
+; ALL: ## %bb.0:
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, (%rdi)
+; ALL-NEXT: retq
+ %1 = load <4 x i16>, ptr %src
+ %2 = icmp sgt <4 x i16> %1, <i16 -128, i16 -128, i16 -128, i16 -128>
+ %3 = select <4 x i1> %2, <4 x i16> %1, <4 x i16> <i16 -128, i16 -128, i16 -128, i16 -128>
+ %4 = icmp slt <4 x i16> %3, <i16 127, i16 127, i16 127, i16 127>
+ %5 = select <4 x i1> %4, <4 x i16> %3, <4 x i16> <i16 127, i16 127, i16 127, i16 127>
+ %6 = trunc <4 x i16> %5 to <4 x i8>
+ store <4 x i8> %6, ptr %dst
+ ret void
+}
+
+define void @test_truncus_v4i16_v4i8(ptr %dst, ptr %src) {
+; KNL-LABEL: test_truncus_v4i16_v4i8:
+; KNL: ## %bb.0:
+; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; KNL-NEXT: vmovd %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_truncus_v4i16_v4i8:
+; SKX: ## %bb.0:
+; SKX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT: vpmovuswb %xmm0, %xmm0
+; SKX-NEXT: vmovd %xmm0, (%rdi)
+; SKX-NEXT: retq
+ %1 = load <4 x i16>, ptr %src
+ %2 = icmp ult <4 x i16> %1, <i16 255, i16 255, i16 255, i16 255>
+ %3 = select <4 x i1> %2, <4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>
+ %4 = trunc <4 x i16> %3 to <4 x i8>
+ store <4 x i8> %4, ptr %dst
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll
index b6b189712c43a..45a1172b2323e 100644
--- a/llvm/test/CodeGen/X86/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll
@@ -1538,29 +1538,3 @@ define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i
store <8 x i64> %sel, ptr %ptr_vec, align 1
ret void
}
-
-define void @cast_i16x4_to_u8x4(ptr %a0, ptr %a1) {
-; AVX-LABEL: cast_i16x4_to_u8x4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vmovd %xmm0, (%rdi)
-; AVX-NEXT: retq
-;
-; AVX2-LABEL: cast_i16x4_to_u8x4:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: cast_i16x4_to_u8x4:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX512-NEXT: vpmovdb %xmm0, (%rdi)
-; AVX512-NEXT: retq
- %1 = load <4 x i16>, ptr %a1
- %2 = trunc <4 x i16> %1 to <4 x i8>
- store <4 x i8> %2, ptr %a0
- ret void
-}
>From 6d71c53741ec209bd08bc66b7cdaf86e6913dfda Mon Sep 17 00:00:00 2001
From: Chauhan Jaydeep Ashwinbhai <chauhan.jaydeep.ashwinbhai at intel.com>
Date: Tue, 17 Mar 2026 04:06:41 -0700
Subject: [PATCH 3/3] Addressed review comments (round 2)
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7a5322e6f8983..22e32d122082a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54299,8 +54299,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue Src;
if (!St->isTruncatingStore() && VT == MVT::v4i8 && Subtarget.hasAVX512() &&
TLI.isTruncStoreLegal(MVT::v4i32, MVT::v4i8) &&
- sd_match(StoredVal, m_OneUse(m_Trunc(m_Value(
- Src, m_OneUse(m_SpecificVT(MVT::v4i16)))))) &&
+ sd_match(StoredVal, m_OneUse(m_Trunc(m_OneUse(m_Value(
+ Src, m_SpecificVT(MVT::v4i16)))))) &&
ISD::isNormalLoad(Src.getNode())) {
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v4i32, Src);
return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
More information about the llvm-commits
mailing list