[clang] [llvm] [X86][AVX512] rematerialize smaller predicate masks (PR #166178)
Ahmed Nour via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 3 11:33:11 PST 2025
https://github.com/ahmednoursphinx updated https://github.com/llvm/llvm-project/pull/166178
>From 3cdf74e3eee0e206a3588acb1b4e53fd66eb9517 Mon Sep 17 00:00:00 2001
From: ahmed <ahmednour.mohamed2012 at gmail.com>
Date: Mon, 3 Nov 2025 17:16:29 +0200
Subject: [PATCH 1/6] fix: rematerialize smaller predicate masks
---
llvm/lib/Target/X86/X86InstrAVX512.td | 25 ++++++
llvm/lib/Target/X86/X86InstrInfo.cpp | 6 ++
llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 93 ++++++++++++++++++++
3 files changed, 124 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 1b748b7355716..9fae602974242 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3161,6 +3161,12 @@ multiclass avx512_mask_setop_w<SDPatternOperator Val> {
defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+// 8-bit mask set operations for AVX512DQ
+let Predicates = [HasDQI] in {
+ defm KSET0B : avx512_mask_setop<VK8, v8i1, immAllZerosV>;
+ defm KSET1B : avx512_mask_setop<VK8, v8i1, immAllOnesV>;
+}
+
// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
@@ -3173,6 +3179,25 @@ let Predicates = [HasAVX512] in {
def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
}
+// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper bits
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 immAllZerosV), (KSET0B)>;
+ def : Pat<(v8i1 immAllOnesV), (KSET1B)>;
+}
+
+// Optimize bitconvert of all-ones constants to use kxnor instructions
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 (bitconvert (i8 255))), (KSET1B)>;
+ def : Pat<(v16i1 (bitconvert (i16 255))), (COPY_TO_REGCLASS (KSET1B), VK16)>;
+}
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i1 (bitconvert (i16 65535))), (KSET1W)>;
+}
+let Predicates = [HasBWI] in {
+ def : Pat<(v32i1 (bitconvert (i32 -1))), (KSET1D)>;
+ def : Pat<(v64i1 (bitconvert (i64 -1))), (KSET1Q)>;
+}
+
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT,
RegisterClass RC, ValueType VT> {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 6b2a7a4ec3583..3eadac4f827bc 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -789,9 +789,11 @@ bool X86InstrInfo::isReMaterializableImpl(
case X86::FsFLD0SS:
case X86::FsFLD0SH:
case X86::FsFLD0F128:
+ case X86::KSET0B:
case X86::KSET0D:
case X86::KSET0Q:
case X86::KSET0W:
+ case X86::KSET1B:
case X86::KSET1D:
case X86::KSET1Q:
case X86::KSET1W:
@@ -6352,12 +6354,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// registers, since it is not usable as a write mask.
// FIXME: A more advanced approach would be to choose the best input mask
// register based on context.
+ case X86::KSET0B:
+ return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0);
case X86::KSET0W:
return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
case X86::KSET0D:
return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
case X86::KSET0Q:
return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
+ case X86::KSET1B:
+ return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0);
case X86::KSET1W:
return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
case X86::KSET1D:
diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
new file mode 100644
index 0000000000000..6a1a0af05d05c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQBW
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+
+; Test case 1: v16i1 with all bits set (should use kxnorw on all targets)
+define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512F-LABEL: gather_all:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: kxnorw %k0, %k0, %k1
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: gather_all:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1
+; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: gather_all:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: gather_all:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k1
+; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQBW-NEXT: retq
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float>undef)
+ ret <16 x float> %res
+}
+
+; Test case 2: v8i1 with lower 8 bits set (should use kxnorb on AVX512DQ targets)
+define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512F-LABEL: gather_lower:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: gather_lower:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: gather_lower:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: gather_lower:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQBW-NEXT: retq
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float>undef)
+ ret <16 x float> %res
+}
+
+
>From 4d2cfe334a511eb4e6baa7c98a34ea3e51ecd62d Mon Sep 17 00:00:00 2001
From: ahmed <ahmednour.mohamed2012 at gmail.com>
Date: Mon, 3 Nov 2025 17:20:20 +0200
Subject: [PATCH 2/6] chore: update formatting
---
llvm/lib/Target/X86/X86InstrAVX512.td | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 9fae602974242..8a06296751f0d 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3179,23 +3179,24 @@ let Predicates = [HasAVX512] in {
def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
}
-// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper bits
+// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper
+// bits
let Predicates = [HasDQI] in {
def : Pat<(v8i1 immAllZerosV), (KSET0B)>;
- def : Pat<(v8i1 immAllOnesV), (KSET1B)>;
+ def : Pat<(v8i1 immAllOnesV), (KSET1B)>;
}
// Optimize bitconvert of all-ones constants to use kxnor instructions
let Predicates = [HasDQI] in {
- def : Pat<(v8i1 (bitconvert (i8 255))), (KSET1B)>;
- def : Pat<(v16i1 (bitconvert (i16 255))), (COPY_TO_REGCLASS (KSET1B), VK16)>;
+ def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>;
+ def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>;
}
let Predicates = [HasAVX512] in {
- def : Pat<(v16i1 (bitconvert (i16 65535))), (KSET1W)>;
+ def : Pat<(v16i1(bitconvert(i16 65535))), (KSET1W)>;
}
let Predicates = [HasBWI] in {
- def : Pat<(v32i1 (bitconvert (i32 -1))), (KSET1D)>;
- def : Pat<(v64i1 (bitconvert (i64 -1))), (KSET1Q)>;
+ def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>;
+ def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>;
}
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
>From b46db3285690008f1e97561c65aa2f3eccbcbc37 Mon Sep 17 00:00:00 2001
From: ahmed <ahmednour.mohamed2012 at gmail.com>
Date: Mon, 3 Nov 2025 17:24:48 +0200
Subject: [PATCH 3/6] fix: Use poison values for placeholders
---
llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
index 6a1a0af05d05c..485ffe6ee07b6 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
@@ -39,11 +39,11 @@ define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
; AVX512DQBW-NEXT: retq
- %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
- %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+ %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
- %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float>undef)
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison)
ret <16 x float> %res
}
@@ -82,11 +82,11 @@ define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
; AVX512DQBW-NEXT: retq
- %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
- %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+ %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
- %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float>undef)
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> poison)
ret <16 x float> %res
}
>From 25a8351e3af8fd8430e00ee53d97c25c3295163e Mon Sep 17 00:00:00 2001
From: ahmed <ahmednour.mohamed2012 at gmail.com>
Date: Mon, 3 Nov 2025 17:26:43 +0200
Subject: [PATCH 4/6] fix: Update formatting
---
clang/include/clang/Basic/DiagnosticLexKinds.td | 14 +++++++-------
clang/include/clang/Driver/Options.td | 7 ++++---
llvm/lib/Target/PowerPC/PPCInstrFuture.td | 2 +-
3 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 417187222e448..e3796e3637742 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -90,13 +90,13 @@ def err_unterminated___pragma : Error<"missing terminating ')' character">;
def err_conflict_marker : Error<"version control conflict marker in file">;
-def err_counter_overflow : Error<
- "'__COUNTER__' value cannot exceed 2'147'483'647">;
-def ext_counter : Extension<
- "'__COUNTER__' is a C2y extension">, InGroup<C2y>;
-def warn_counter : Warning<
- "'__COUNTER__' is incompatible with standards before C2y">,
- InGroup<CPre2yCompat>, DefaultIgnore;
+def err_counter_overflow
+ : Error<"'__COUNTER__' value cannot exceed 2'147'483'647">;
+def ext_counter : Extension<"'__COUNTER__' is a C2y extension">, InGroup<C2y>;
+def warn_counter
+ : Warning<"'__COUNTER__' is incompatible with standards before C2y">,
+ InGroup<CPre2yCompat>,
+ DefaultIgnore;
def err_raw_delim_too_long : Error<
"raw string delimiter longer than 16 characters"
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 20955ef1b852e..af254bc0a7cf8 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -8445,9 +8445,10 @@ def aligned_alloc_unavailable : Flag<["-"], "faligned-alloc-unavailable">,
MarshallingInfoFlag<LangOpts<"AlignedAllocationUnavailable">>,
ShouldParseIf<faligned_allocation.KeyPath>;
-def finitial_counter_value_EQ : Joined<["-"], "finitial-counter-value=">,
- HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">,
- MarshallingInfoInt<PreprocessorOpts<"InitialCounterValue">, "0">;
+def finitial_counter_value_EQ
+ : Joined<["-"], "finitial-counter-value=">,
+ HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">,
+ MarshallingInfoInt<PreprocessorOpts<"InitialCounterValue">, "0">;
} // let Visibility = [CC1Option]
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 0c2e44e18f463..424f0e06cc3d3 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -362,7 +362,7 @@ let Predicates = [HasVSX, IsISAFuture] in {
"lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>;
def LXVPB32X
: XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp),
- (ins (memr $RA):$addr, g8rc:$RB),
+ (ins(memr $RA):$addr, g8rc:$RB),
"lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>;
}
>From 5063a2b09dcaf875358ca2a193b20a2a074999ed Mon Sep 17 00:00:00 2001
From: ahmed <ahmednour.mohamed2012 at gmail.com>
Date: Mon, 3 Nov 2025 20:31:51 +0200
Subject: [PATCH 5/6] Revert "fix: Update formatting"
This reverts commit 25a8351e3af8fd8430e00ee53d97c25c3295163e.
---
clang/include/clang/Basic/DiagnosticLexKinds.td | 14 +++++++-------
clang/include/clang/Driver/Options.td | 7 +++----
llvm/lib/Target/PowerPC/PPCInstrFuture.td | 2 +-
3 files changed, 11 insertions(+), 12 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index e3796e3637742..417187222e448 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -90,13 +90,13 @@ def err_unterminated___pragma : Error<"missing terminating ')' character">;
def err_conflict_marker : Error<"version control conflict marker in file">;
-def err_counter_overflow
- : Error<"'__COUNTER__' value cannot exceed 2'147'483'647">;
-def ext_counter : Extension<"'__COUNTER__' is a C2y extension">, InGroup<C2y>;
-def warn_counter
- : Warning<"'__COUNTER__' is incompatible with standards before C2y">,
- InGroup<CPre2yCompat>,
- DefaultIgnore;
+def err_counter_overflow : Error<
+ "'__COUNTER__' value cannot exceed 2'147'483'647">;
+def ext_counter : Extension<
+ "'__COUNTER__' is a C2y extension">, InGroup<C2y>;
+def warn_counter : Warning<
+ "'__COUNTER__' is incompatible with standards before C2y">,
+ InGroup<CPre2yCompat>, DefaultIgnore;
def err_raw_delim_too_long : Error<
"raw string delimiter longer than 16 characters"
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index af254bc0a7cf8..20955ef1b852e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -8445,10 +8445,9 @@ def aligned_alloc_unavailable : Flag<["-"], "faligned-alloc-unavailable">,
MarshallingInfoFlag<LangOpts<"AlignedAllocationUnavailable">>,
ShouldParseIf<faligned_allocation.KeyPath>;
-def finitial_counter_value_EQ
- : Joined<["-"], "finitial-counter-value=">,
- HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">,
- MarshallingInfoInt<PreprocessorOpts<"InitialCounterValue">, "0">;
+def finitial_counter_value_EQ : Joined<["-"], "finitial-counter-value=">,
+ HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">,
+ MarshallingInfoInt<PreprocessorOpts<"InitialCounterValue">, "0">;
} // let Visibility = [CC1Option]
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 424f0e06cc3d3..0c2e44e18f463 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -362,7 +362,7 @@ let Predicates = [HasVSX, IsISAFuture] in {
"lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>;
def LXVPB32X
: XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp),
- (ins(memr $RA):$addr, g8rc:$RB),
+ (ins (memr $RA):$addr, g8rc:$RB),
"lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>;
}
>From 934e4fac8f7f706fa0f3a0fbac85c4e6e92e02e2 Mon Sep 17 00:00:00 2001
From: ahmed <ahmednour.mohamed2012 at gmail.com>
Date: Mon, 3 Nov 2025 21:32:45 +0200
Subject: [PATCH 6/6] refactor: PR Feedback
---
llvm/lib/Target/X86/X86InstrAVX512.td | 3 -
llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 98 ++++++++++++++------
2 files changed, 71 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 8a06296751f0d..45e556e7c13a8 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3191,9 +3191,6 @@ let Predicates = [HasDQI] in {
def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>;
def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>;
}
-let Predicates = [HasAVX512] in {
- def : Pat<(v16i1(bitconvert(i16 65535))), (KSET1W)>;
-}
let Predicates = [HasBWI] in {
def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>;
def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>;
diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
index 485ffe6ee07b6..c1ace37bc9ed2 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
@@ -1,44 +1,88 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQBW
declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
-; Test case 1: v16i1 with all bits set (should use kxnorw on all targets)
-define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
-; AVX512F-LABEL: gather_all:
+; Test case 1: Direct v8i1 all-ones mask (should use kxnorb on AVX512DQ)
+define <8 x float> @mask_v8i1_allones(ptr %ptr) {
+; AVX512F-LABEL: mask_v8i1_allones:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: kxnorw %k0, %k0, %k1
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
-; AVX512DQ-LABEL: gather_all:
+; AVX512DQ-LABEL: mask_v8i1_allones:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1
-; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
-; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: gather_all:
+; AVX512BW-LABEL: mask_v8i1_allones:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
-; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
-; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
-; AVX512DQBW-LABEL: gather_all:
+; AVX512DQBW-LABEL: mask_v8i1_allones:
; AVX512DQBW: # %bb.0:
-; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k1
-; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
-; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQBW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQBW-NEXT: retq
+ %res = call <8 x float> @llvm.masked.expandload.v8f32(ptr %ptr, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> zeroinitializer)
+ ret <8 x float> %res
+}
+
+; Test case 2: v16i1 with lower 8 bits set via bitconvert (should use kxnorb on AVX512DQ)
+define <16 x float> @mask_v16i1_lower8(ptr %ptr) {
+; AVX512F-LABEL: mask_v16i1_lower8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: mask_v16i1_lower8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: mask_v16i1_lower8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: mask_v16i1_lower8:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
; AVX512DQBW-NEXT: retq
+ %res = call <16 x float> @llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> zeroinitializer)
+ ret <16 x float> %res
+}
+
+; Test case 3: v16i1 with all bits set (should use kxnorw on all targets)
+define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512-LABEL: gather_all:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kxnorw %k0, %k0, %k1
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
%broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
@@ -47,7 +91,7 @@ define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
ret <16 x float> %res
}
-; Test case 2: v8i1 with lower 8 bits set (should use kxnorb on AVX512DQ targets)
+; Test case 4: v16i1 with lower 8 bits set in gather (should use kxnorb on AVX512DQ targets)
define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
; AVX512F-LABEL: gather_lower:
; AVX512F: # %bb.0:
More information about the llvm-commits mailing list