[llvm] [X86] Optimize v4i16->v4i8 truncating stores via v4i32 widening (PR #186676)

Jaydeep Chauhan via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 17 04:06:55 PDT 2026


https://github.com/JaydeepChauhan14 updated https://github.com/llvm/llvm-project/pull/186676

>From 2f3cf585137f01d8aab928285e3cbdfee033c5db Mon Sep 17 00:00:00 2001
From: Chauhan Jaydeep Ashwinbhai <chauhan.jaydeep.ashwinbhai at intel.com>
Date: Sun, 15 Mar 2026 08:25:28 -0700
Subject: [PATCH 1/3] [X86] Optimize v4i16->v4i8 truncating stores via v4i32
 widening

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 14 ++++++++++
 .../test/CodeGen/X86/combine-storetomstore.ll | 26 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4fbbf63c39065..ea15cbcf4cde5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54294,6 +54294,20 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   EVT VT = StoredVal.getValueType();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
+  // Pattern: store(trunc(load v4i16) to v4i8)
+  if (!St->isTruncatingStore() && VT == MVT::v4i8 && Subtarget.hasAVX512() &&
+      StoredVal.getOpcode() == ISD::TRUNCATE &&
+      StoredVal.getOperand(0).getValueType() == MVT::v4i16 &&
+      StoredVal.hasOneUse() && TLI.isTruncStoreLegal(MVT::v4i32, MVT::v4i8)) {
+
+    SDValue Src = StoredVal.getOperand(0);
+    if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v4i32, Src);
+      return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
+                               MVT::v4i8, St->getMemOperand());
+    }
+  }
+
   // Convert a store of vXi1 into a store of iX and a bitcast.
   if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
       VT.getVectorElementType() == MVT::i1) {
diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll
index 45a1172b2323e..b6b189712c43a 100644
--- a/llvm/test/CodeGen/X86/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll
@@ -1538,3 +1538,29 @@ define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i
   store <8 x i64> %sel, ptr %ptr_vec, align 1
   ret void
 }
+
+define void @cast_i16x4_to_u8x4(ptr %a0, ptr %a1) {
+; AVX-LABEL: cast_i16x4_to_u8x4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vmovd %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: cast_i16x4_to_u8x4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vmovd %xmm0, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: cast_i16x4_to_u8x4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX512-NEXT:    vpmovdb %xmm0, (%rdi)
+; AVX512-NEXT:    retq
+  %1 = load <4 x i16>, ptr %a1
+  %2 = trunc <4 x i16> %1 to <4 x i8>
+  store <4 x i8> %2, ptr %a0
+  ret void
+}

>From b24d87898803ff23164d1d10f472cee45a00b74b Mon Sep 17 00:00:00 2001
From: Chauhan Jaydeep Ashwinbhai <chauhan.jaydeep.ashwinbhai at intel.com>
Date: Tue, 17 Mar 2026 00:36:17 -0700
Subject: [PATCH 2/3] Addressed review comments (round 1)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 19 +++---
 llvm/test/CodeGen/X86/avx512-trunc.ll         | 59 +++++++++++++++++++
 .../test/CodeGen/X86/combine-storetomstore.ll | 26 --------
 3 files changed, 68 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ea15cbcf4cde5..7a5322e6f8983 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54287,6 +54287,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
   StoreSDNode *St = cast<StoreSDNode>(N);
   EVT StVT = St->getMemoryVT();
   SDLoc dl(St);
@@ -54295,17 +54296,15 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // Pattern: store(trunc(load v4i16) to v4i8)
+  SDValue Src;
   if (!St->isTruncatingStore() && VT == MVT::v4i8 && Subtarget.hasAVX512() &&
-      StoredVal.getOpcode() == ISD::TRUNCATE &&
-      StoredVal.getOperand(0).getValueType() == MVT::v4i16 &&
-      StoredVal.hasOneUse() && TLI.isTruncStoreLegal(MVT::v4i32, MVT::v4i8)) {
-
-    SDValue Src = StoredVal.getOperand(0);
-    if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
-      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v4i32, Src);
-      return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
-                               MVT::v4i8, St->getMemOperand());
-    }
+      TLI.isTruncStoreLegal(MVT::v4i32, MVT::v4i8) &&
+      sd_match(StoredVal, m_OneUse(m_Trunc(m_Value(
+                              Src, m_OneUse(m_SpecificVT(MVT::v4i16)))))) &&
+      ISD::isNormalLoad(Src.getNode())) {
+    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v4i32, Src);
+    return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
+                             MVT::v4i8, St->getMemOperand());
   }
 
   // Convert a store of vXi1 into a store of iX and a bitcast.
diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll
index 0e9e95adfa3e2..98c31e2936e4b 100644
--- a/llvm/test/CodeGen/X86/avx512-trunc.ll
+++ b/llvm/test/CodeGen/X86/avx512-trunc.ll
@@ -1074,3 +1074,62 @@ define void @ssat_trunc_db_1024_mem(<32 x i32> %i, ptr %p) {
   ret void
 }
 
+; Test load-trunc-store pattern optimization for v4i16 -> v4i8
+define void @test_trunc_v4i16_v4i8(ptr %dst, ptr %src) {
+; KNL-LABEL: test_trunc_v4i16_v4i8:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; KNL-NEXT:    vmovd %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_trunc_v4i16_v4i8:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SKX-NEXT:    vpmovdb %xmm0, (%rdi)
+; SKX-NEXT:    retq
+  %1 = load <4 x i16>, ptr %src
+  %2 = trunc <4 x i16> %1 to <4 x i8>
+  store <4 x i8> %2, ptr %dst
+  ret void
+}
+
+define void @test_truncs_v4i16_v4i8(ptr %dst, ptr %src) {
+; ALL-LABEL: test_truncs_v4i16_v4i8:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; ALL-NEXT:    vmovd %xmm0, (%rdi)
+; ALL-NEXT:    retq
+  %1 = load <4 x i16>, ptr %src
+  %2 = icmp sgt <4 x i16> %1, <i16 -128, i16 -128, i16 -128, i16 -128>
+  %3 = select <4 x i1> %2, <4 x i16> %1, <4 x i16> <i16 -128, i16 -128, i16 -128, i16 -128>
+  %4 = icmp slt <4 x i16> %3, <i16 127, i16 127, i16 127, i16 127>
+  %5 = select <4 x i1> %4, <4 x i16> %3, <4 x i16> <i16 127, i16 127, i16 127, i16 127>
+  %6 = trunc <4 x i16> %5 to <4 x i8>
+  store <4 x i8> %6, ptr %dst
+  ret void
+}
+
+define void @test_truncus_v4i16_v4i8(ptr %dst, ptr %src) {
+; KNL-LABEL: test_truncus_v4i16_v4i8:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; KNL-NEXT:    vmovd %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_truncus_v4i16_v4i8:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT:    vpmovuswb %xmm0, %xmm0
+; SKX-NEXT:    vmovd %xmm0, (%rdi)
+; SKX-NEXT:    retq
+  %1 = load <4 x i16>, ptr %src
+  %2 = icmp ult <4 x i16> %1, <i16 255, i16 255, i16 255, i16 255>
+  %3 = select <4 x i1> %2, <4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>
+  %4 = trunc <4 x i16> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %dst
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll
index b6b189712c43a..45a1172b2323e 100644
--- a/llvm/test/CodeGen/X86/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll
@@ -1538,29 +1538,3 @@ define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i
   store <8 x i64> %sel, ptr %ptr_vec, align 1
   ret void
 }
-
-define void @cast_i16x4_to_u8x4(ptr %a0, ptr %a1) {
-; AVX-LABEL: cast_i16x4_to_u8x4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vmovd %xmm0, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX2-LABEL: cast_i16x4_to_u8x4:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vmovd %xmm0, (%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: cast_i16x4_to_u8x4:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX512-NEXT:    vpmovdb %xmm0, (%rdi)
-; AVX512-NEXT:    retq
-  %1 = load <4 x i16>, ptr %a1
-  %2 = trunc <4 x i16> %1 to <4 x i8>
-  store <4 x i8> %2, ptr %a0
-  ret void
-}

>From 6d71c53741ec209bd08bc66b7cdaf86e6913dfda Mon Sep 17 00:00:00 2001
From: Chauhan Jaydeep Ashwinbhai <chauhan.jaydeep.ashwinbhai at intel.com>
Date: Tue, 17 Mar 2026 04:06:41 -0700
Subject: [PATCH 3/3] Addressed review comments (round 2)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7a5322e6f8983..22e32d122082a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54299,8 +54299,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   SDValue Src;
   if (!St->isTruncatingStore() && VT == MVT::v4i8 && Subtarget.hasAVX512() &&
       TLI.isTruncStoreLegal(MVT::v4i32, MVT::v4i8) &&
-      sd_match(StoredVal, m_OneUse(m_Trunc(m_Value(
-                              Src, m_OneUse(m_SpecificVT(MVT::v4i16)))))) &&
+      sd_match(StoredVal, m_OneUse(m_Trunc(m_OneUse(m_Value(
+                              Src, m_SpecificVT(MVT::v4i16)))))) &&
       ISD::isNormalLoad(Src.getNode())) {
     SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v4i32, Src);
     return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),



More information about the llvm-commits mailing list