[llvm-branch-commits] [llvm] release/22.x: [DAG] Narrow vselect mask to vXi1 in foldToMaskedStore (#201609) (PR #202880)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 10 00:56:09 PDT 2026
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/202880
Backport e6bd7887070e92bba3615de04d3fdefde4beb2de
Requested by: @phoebewang
>From d1667f626af5c4e330be84cae89ef0df9206bddb Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou at intel.com>
Date: Sun, 7 Jun 2026 10:14:56 +0200
Subject: [PATCH] [DAG] Narrow vselect mask to vXi1 in foldToMaskedStore
(#201609)
foldToMaskedStore (added in
https://github.com/llvm/llvm-project/commit/1c0ac80d4a9ef6c21914f2317003979952c2a2c3)
rewrites
store(vselect(cond, x, load(ptr)), ptr) -> masked_store(x, ptr, cond)
passing the vselect condition straight through as the store mask. A masked
store follows the IR convention of a vXi1 mask, but the condition can be a
wider boolean vector. On AVX512F targets without VLX, a maxnum/minnum
store-back lowers the NaN test with a legacy packed (CMPP) comparison whose
result is a vXi32/vXi64 vector, so the masked store is created with a wide
mask and LowerMSTORE asserts:
Assertion `Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
"Unexpected mask type"' failed.
When the matching vXi1 type is legal, narrow the mask to it before building
the masked store. Targets where vXi1 is illegal (e.g. AVX/AVX2) keep the wide
mask and continue to lower it as a blend/vmaskmov, and targets whose vselect
condition is already vXi1 (e.g. AArch64 SVE, RISC-V RVV) are unaffected.
This fixes the crash at the source and lets the X86 LowerMSTORE keep its
invariant of only ever seeing a vXi1 mask (no target-specific workaround).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply at anthropic.com>
(cherry picked from commit e6bd7887070e92bba3615de04d3fdefde4beb2de)
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 ++
.../X86/avx512-maxnum-minnum-masked-store.ll | 151 ++++++++++++++++++
2 files changed, 164 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3fdb9bf7e5171..ee9238753735a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23117,6 +23117,19 @@ static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG,
if (LoadPos == 1)
Mask = DAG.getNOT(Dl, Mask, Mask.getValueType());
+ // A masked store follows the IR convention of a vXi1 mask (one bit per
+ // element). A vselect condition may instead be a wider boolean vector, e.g.
+ // a vXi32/vXi64 comparison result produced on AVX512 targets without VLX.
+ // When the matching vXi1 type is legal, narrow the mask to it so that targets
+ // expecting a vXi1 mask lower it correctly. Targets where vXi1 is illegal
+ // (e.g. AVX/AVX2) keep the wide mask and lower it as a blend/vmaskmov.
+ EVT MaskVT = Mask.getValueType();
+ if (MaskVT.getVectorElementType() != MVT::i1) {
+ EVT BoolVT = MaskVT.changeVectorElementType(*DAG.getContext(), MVT::i1);
+ if (TLI.isTypeLegal(BoolVT))
+ Mask = DAG.getNode(ISD::TRUNCATE, Dl, BoolVT, Mask);
+ }
+
return DAG.getMaskedStore(Store->getChain(), Dl, OtherVec, StorePtr,
StoreOffset, Mask, VT, Store->getMemOperand(),
Store->getAddressingMode());
diff --git a/llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll b/llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll
new file mode 100644
index 0000000000000..f6ca1490b558c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512VL
+
+; On AVX512F targets without VLX (e.g. KNL), a maxnum/minnum store-back fuses a
+; legacy CMPP vector comparison into a masked store via foldToMaskedStore, so
+; the vselect mask is a wide vNi32/vNi64 vector rather than vNi1. The combine
+; must narrow the mask to vNi1 (legal here) before building the masked store, so
+; the widening custom lowering only ever sees an i1 mask and does not assert.
+;
+; On AVX/AVX2 the matching vNi1 type is illegal, so the combine keeps the wide
+; mask and the masked store lowers to vmaskmov (the isTypeLegal guard leaves
+; this path unchanged).
+
+define void @maxnum_v4f32_masked_store(<4 x float> %a, ptr %ptr) {
+; AVX-LABEL: maxnum_v4f32_masked_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovups (%rdi), %xmm1
+; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: maxnum_v4f32_masked_store:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm1
+; AVX512F-NEXT: vmaxps %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: maxnum_v4f32_masked_store:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovups (%rdi), %xmm1
+; AVX512VL-NEXT: vmaxps %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT: vcmpordps %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovups %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: retq
+ %b = load <4 x float>, ptr %ptr, align 4
+ %m = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
+ store <4 x float> %m, ptr %ptr, align 4
+ ret void
+}
+
+define void @maxnum_v2f64_masked_store(<2 x double> %a, ptr %ptr) {
+; AVX-LABEL: maxnum_v2f64_masked_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovupd (%rdi), %xmm1
+; AVX-NEXT: vmaxpd %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: maxnum_v2f64_masked_store:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovupd (%rdi), %xmm1
+; AVX512F-NEXT: vmaxpd %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k0
+; AVX512F-NEXT: kshiftrw $14, %k0, %k1
+; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: maxnum_v2f64_masked_store:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovupd (%rdi), %xmm1
+; AVX512VL-NEXT: vmaxpd %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT: vcmpordpd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovupd %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: retq
+ %b = load <2 x double>, ptr %ptr, align 8
+ %m = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
+ store <2 x double> %m, ptr %ptr, align 8
+ ret void
+}
+
+define void @minnum_v4f32_masked_store(<4 x float> %a, ptr %ptr) {
+; AVX-LABEL: minnum_v4f32_masked_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovups (%rdi), %xmm1
+; AVX-NEXT: vminps %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: minnum_v4f32_masked_store:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm1
+; AVX512F-NEXT: vminps %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: minnum_v4f32_masked_store:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovups (%rdi), %xmm1
+; AVX512VL-NEXT: vminps %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT: vcmpordps %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovups %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: retq
+ %b = load <4 x float>, ptr %ptr, align 4
+ %m = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
+ store <4 x float> %m, ptr %ptr, align 4
+ ret void
+}
+
+define void @minnum_v2f64_masked_store(<2 x double> %a, ptr %ptr) {
+; AVX-LABEL: minnum_v2f64_masked_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovupd (%rdi), %xmm1
+; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: minnum_v2f64_masked_store:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovupd (%rdi), %xmm1
+; AVX512F-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k0
+; AVX512F-NEXT: kshiftrw $14, %k0, %k1
+; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: minnum_v2f64_masked_store:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovupd (%rdi), %xmm1
+; AVX512VL-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT: vcmpordpd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovupd %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: retq
+ %b = load <2 x double>, ptr %ptr, align 8
+ %m = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
+ store <2 x double> %m, ptr %ptr, align 8
+ ret void
+}
More information about the llvm-branch-commits
mailing list