[llvm-branch-commits] [llvm] release/22.x: [DAG] Narrow vselect mask to vXi1 in foldToMaskedStore (#201609) (PR #202880)

Wed Jun 10 00:56:09 PDT 2026

https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/202880

Backport e6bd7887070e92bba3615de04d3fdefde4beb2de

Requested by: @phoebewang

>From d1667f626af5c4e330be84cae89ef0df9206bddb Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou at intel.com>
Date: Sun, 7 Jun 2026 10:14:56 +0200
Subject: [PATCH] [DAG] Narrow vselect mask to vXi1 in foldToMaskedStore
 (#201609)

foldToMaskedStore (added in
https://github.com/llvm/llvm-project/commit/1c0ac80d4a9ef6c21914f2317003979952c2a2c3)
rewrites
  store(vselect(cond, x, load(ptr)), ptr) -> masked_store(x, ptr, cond)
passing the vselect condition straight through as the store mask. A
masked store follows the IR convention of a vXi1 mask, but the condition
can be a wider boolean vector. On AVX512F targets without VLX, a
maxnum/minnum store-back lowers the NaN test with a legacy packed (CMPP)
comparison whose result is a vXi32/vXi64 vector, so the masked store is
created with a wide mask and LowerMSTORE asserts:

  Assertion `Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
             "Unexpected mask type"' failed.

When the matching vXi1 type is legal, narrow the mask to it before
building the masked store. Targets where vXi1 is illegal (e.g. AVX/AVX2)
keep the wide mask and continue to lower it as a blend/vmaskmov, and
targets whose vselect condition is already vXi1 (e.g. AArch64 SVE,
RISC-V RVV) are unaffected.

This fixes the crash at the source and lets the X86 LowerMSTORE keep its
invariant of only ever seeing a vXi1 mask (no target-specific workaround).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply at anthropic.com>
(cherry picked from commit e6bd7887070e92bba3615de04d3fdefde4beb2de)
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  13 ++
 .../X86/avx512-maxnum-minnum-masked-store.ll  | 151 ++++++++++++++++++
 2 files changed, 164 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3fdb9bf7e5171..ee9238753735a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23117,6 +23117,19 @@ static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG,
   if (LoadPos == 1)
     Mask = DAG.getNOT(Dl, Mask, Mask.getValueType());
 
+  // A masked store follows the IR convention of a vXi1 mask (one bit per
+  // element). A vselect condition may instead be a wider boolean vector, e.g.
+  // a vXi32/vXi64 comparison result produced on AVX512 targets without VLX.
+  // When the matching vXi1 type is legal, narrow the mask to it so that targets
+  // expecting a vXi1 mask lower it correctly. Targets where vXi1 is illegal
+  // (e.g. AVX/AVX2) keep the wide mask and lower it as a blend/vmaskmov.
+  EVT MaskVT = Mask.getValueType();
+  if (MaskVT.getVectorElementType() != MVT::i1) {
+    EVT BoolVT = MaskVT.changeVectorElementType(*DAG.getContext(), MVT::i1);
+    if (TLI.isTypeLegal(BoolVT))
+      Mask = DAG.getNode(ISD::TRUNCATE, Dl, BoolVT, Mask);
+  }
+
   return DAG.getMaskedStore(Store->getChain(), Dl, OtherVec, StorePtr,
                             StoreOffset, Mask, VT, Store->getMemOperand(),
                             Store->getAddressingMode());
diff --git a/llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll b/llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll
new file mode 100644
index 0000000000000..f6ca1490b558c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx     | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx2    | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512VL
+
+; On AVX512F targets without VLX (e.g. KNL), a maxnum/minnum store-back fuses a
+; legacy CMPP vector comparison into a masked store via foldToMaskedStore, so
+; the vselect mask is a wide vNi32/vNi64 vector rather than vNi1. The combine
+; must narrow the mask to vNi1 (legal here) before building the masked store, so
+; the widening custom lowering only ever sees an i1 mask and does not assert.
+;
+; On AVX/AVX2 the matching vNi1 type is illegal, so the combine keeps the wide
+; mask and the masked store lowers to vmaskmov (the isTypeLegal guard leaves
+; this path unchanged).
+
+define void @maxnum_v4f32_masked_store(<4 x float> %a, ptr %ptr) {
+; AVX-LABEL: maxnum_v4f32_masked_store:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovups (%rdi), %xmm1
+; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vcmpordps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: maxnum_v4f32_masked_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovups (%rdi), %xmm1
+; AVX512F-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vmovups %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: maxnum_v4f32_masked_store:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovups (%rdi), %xmm1
+; AVX512VL-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcmpordps %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovups %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
+  %b = load <4 x float>, ptr %ptr, align 4
+  %m = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
+  store <4 x float> %m, ptr %ptr, align 4
+  ret void
+}
+
+define void @maxnum_v2f64_masked_store(<2 x double> %a, ptr %ptr) {
+; AVX-LABEL: maxnum_v2f64_masked_store:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovupd (%rdi), %xmm1
+; AVX-NEXT:    vmaxpd %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: maxnum_v2f64_masked_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovupd (%rdi), %xmm1
+; AVX512F-NEXT:    vmaxpd %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: maxnum_v2f64_masked_store:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovupd (%rdi), %xmm1
+; AVX512VL-NEXT:    vmaxpd %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcmpordpd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovupd %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
+  %b = load <2 x double>, ptr %ptr, align 8
+  %m = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
+  store <2 x double> %m, ptr %ptr, align 8
+  ret void
+}
+
+define void @minnum_v4f32_masked_store(<4 x float> %a, ptr %ptr) {
+; AVX-LABEL: minnum_v4f32_masked_store:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovups (%rdi), %xmm1
+; AVX-NEXT:    vminps %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vcmpordps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: minnum_v4f32_masked_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovups (%rdi), %xmm1
+; AVX512F-NEXT:    vminps %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vmovups %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: minnum_v4f32_masked_store:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovups (%rdi), %xmm1
+; AVX512VL-NEXT:    vminps %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcmpordps %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovups %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
+  %b = load <4 x float>, ptr %ptr, align 4
+  %m = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
+  store <4 x float> %m, ptr %ptr, align 4
+  ret void
+}
+
+define void @minnum_v2f64_masked_store(<2 x double> %a, ptr %ptr) {
+; AVX-LABEL: minnum_v2f64_masked_store:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovupd (%rdi), %xmm1
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: minnum_v2f64_masked_store:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovupd (%rdi), %xmm1
+; AVX512F-NEXT:    vminpd %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: minnum_v2f64_masked_store:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovupd (%rdi), %xmm1
+; AVX512VL-NEXT:    vminpd %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcmpordpd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovupd %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
+  %b = load <2 x double>, ptr %ptr, align 8
+  %m = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
+  store <2 x double> %m, ptr %ptr, align 8
+  ret void
+}