[llvm-branch-commits] [llvm] release/22.x: [DAG] Narrow vselect mask to vXi1 in foldToMaskedStore (#201609) (PR #202880)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 10 00:56:49 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: llvmbot
<details>
<summary>Changes</summary>
Backport e6bd7887070e92bba3615de04d3fdefde4beb2de
Requested by: @<!-- -->phoebewang
---
Full diff: https://github.com/llvm/llvm-project/pull/202880.diff
2 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+13)
- (added) llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll (+151)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3fdb9bf7e5171..ee9238753735a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23117,6 +23117,19 @@ static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG,
if (LoadPos == 1)
Mask = DAG.getNOT(Dl, Mask, Mask.getValueType());
+ // A masked store follows the IR convention of a vXi1 mask (one bit per
+ // element). A vselect condition may instead be a wider boolean vector, e.g.
+ // a vXi32/vXi64 comparison result produced on AVX512 targets without VLX.
+ // When the matching vXi1 type is legal, narrow the mask to it so that targets
+ // expecting a vXi1 mask lower it correctly. Targets where vXi1 is illegal
+ // (e.g. AVX/AVX2) keep the wide mask and lower it as a blend/vmaskmov.
+ EVT MaskVT = Mask.getValueType();
+ if (MaskVT.getVectorElementType() != MVT::i1) {
+ EVT BoolVT = MaskVT.changeVectorElementType(*DAG.getContext(), MVT::i1);
+ if (TLI.isTypeLegal(BoolVT))
+ Mask = DAG.getNode(ISD::TRUNCATE, Dl, BoolVT, Mask);
+ }
+
return DAG.getMaskedStore(Store->getChain(), Dl, OtherVec, StorePtr,
StoreOffset, Mask, VT, Store->getMemOperand(),
Store->getAddressingMode());
diff --git a/llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll b/llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll
new file mode 100644
index 0000000000000..f6ca1490b558c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-maxnum-minnum-masked-store.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512VL
+
+; On AVX512F targets without VLX (e.g. KNL), a maxnum/minnum store-back fuses a
+; legacy CMPP vector comparison into a masked store via foldToMaskedStore, so
+; the vselect mask is a wide vNi32/vNi64 vector rather than vNi1. The combine
+; must narrow the mask to vNi1 (legal here) before building the masked store, so
+; the widening custom lowering only ever sees an i1 mask and does not assert.
+;
+; On AVX/AVX2 the matching vNi1 type is illegal, so the combine keeps the wide
+; mask and the masked store lowers to vmaskmov (the isTypeLegal guard leaves
+; this path unchanged).
+
+define void @maxnum_v4f32_masked_store(<4 x float> %a, ptr %ptr) {
+; AVX-LABEL: maxnum_v4f32_masked_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovups (%rdi), %xmm1
+; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: maxnum_v4f32_masked_store:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm1
+; AVX512F-NEXT: vmaxps %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: maxnum_v4f32_masked_store:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovups (%rdi), %xmm1
+; AVX512VL-NEXT: vmaxps %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT: vcmpordps %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovups %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: retq
+ %b = load <4 x float>, ptr %ptr, align 4
+ %m = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
+ store <4 x float> %m, ptr %ptr, align 4
+ ret void
+}
+
+define void @maxnum_v2f64_masked_store(<2 x double> %a, ptr %ptr) {
+; AVX-LABEL: maxnum_v2f64_masked_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovupd (%rdi), %xmm1
+; AVX-NEXT: vmaxpd %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: maxnum_v2f64_masked_store:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovupd (%rdi), %xmm1
+; AVX512F-NEXT: vmaxpd %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k0
+; AVX512F-NEXT: kshiftrw $14, %k0, %k1
+; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: maxnum_v2f64_masked_store:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovupd (%rdi), %xmm1
+; AVX512VL-NEXT: vmaxpd %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT: vcmpordpd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovupd %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: retq
+ %b = load <2 x double>, ptr %ptr, align 8
+ %m = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
+ store <2 x double> %m, ptr %ptr, align 8
+ ret void
+}
+
+define void @minnum_v4f32_masked_store(<4 x float> %a, ptr %ptr) {
+; AVX-LABEL: minnum_v4f32_masked_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovups (%rdi), %xmm1
+; AVX-NEXT: vminps %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: minnum_v4f32_masked_store:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm1
+; AVX512F-NEXT: vminps %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: minnum_v4f32_masked_store:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovups (%rdi), %xmm1
+; AVX512VL-NEXT: vminps %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT: vcmpordps %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovups %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: retq
+ %b = load <4 x float>, ptr %ptr, align 4
+ %m = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
+ store <4 x float> %m, ptr %ptr, align 4
+ ret void
+}
+
+define void @minnum_v2f64_masked_store(<2 x double> %a, ptr %ptr) {
+; AVX-LABEL: minnum_v2f64_masked_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovupd (%rdi), %xmm1
+; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: minnum_v2f64_masked_store:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovupd (%rdi), %xmm1
+; AVX512F-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k0
+; AVX512F-NEXT: kshiftrw $14, %k0, %k1
+; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: minnum_v2f64_masked_store:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovupd (%rdi), %xmm1
+; AVX512VL-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT: vcmpordpd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovupd %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: retq
+ %b = load <2 x double>, ptr %ptr, align 8
+ %m = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
+ store <2 x double> %m, ptr %ptr, align 8
+ ret void
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/202880
More information about the llvm-branch-commits mailing list