[llvm] [X86] Blocklist instructions that are unsafe for masked-load folding. (PR #178888)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 5 04:56:56 PST 2026
https://github.com/azwolski updated https://github.com/llvm/llvm-project/pull/178888
>From 27b945ab6af5aa3b99fb953a57adb0f92afb3951 Mon Sep 17 00:00:00 2001
From: Antoni Zwolski <antoni.zwolski at intel.com>
Date: Thu, 29 Jan 2026 16:03:35 +0100
Subject: [PATCH 1/6] [X86] Blocklist instructions that are unsafe for masked-load folding.
---
llvm/lib/Target/X86/X86InstrFoldTables.cpp | 8 ++
llvm/lib/Target/X86/X86InstrFoldTables.h | 4 +
llvm/lib/Target/X86/X86InstrInfo.cpp | 5 +
llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 59 +++++++++++
llvm/utils/TableGen/X86ManualFoldTables.def | 104 +++++++++++++++++++
5 files changed, 180 insertions(+)
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 560b8c378ead7..9b22f6bb767c5 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -143,6 +143,14 @@ const X86FoldTableEntry *llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
return lookupFoldTableImpl(FoldTable, RegOp);
}
+bool llvm::isNonFoldableWithSameMask(unsigned RegOp) {
+ // NonFoldableWithSameMask table stores instruction opcodes that are unsafe
+ // for masked-load folding when the same mask is used.
+ ArrayRef<unsigned> Table(NonFoldableWithSameMaskTable);
+ auto I = llvm::lower_bound(Table, RegOp);
+ return I != Table.end() && *I == RegOp;
+}
+
const X86FoldTableEntry *llvm::lookupBroadcastFoldTable(unsigned RegOp,
unsigned OpNum) {
ArrayRef<X86FoldTableEntry> FoldTable;
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.h b/llvm/lib/Target/X86/X86InstrFoldTables.h
index 9c5dea48d2273..35a3e993e3f96 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.h
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -44,6 +44,10 @@ const X86FoldTableEntry *lookupTwoAddrFoldTable(unsigned RegOp);
// operand OpNum.
const X86FoldTableEntry *lookupFoldTable(unsigned RegOp, unsigned OpNum);
+// Check if an instruction is unsafe for masked-load folding when the load
+// and instruction have the same mask.
+bool isNonFoldableWithSameMask(unsigned RegOp);
+
// Look up the broadcast folding table entry for folding a broadcast with
// operand OpNum.
const X86FoldTableEntry *lookupBroadcastFoldTable(unsigned RegOp,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index c99865cc2dfcd..2b6c21d48125a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -8155,6 +8155,11 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MaskReg = Op2.getReg();
if (MaskReg) {
+ // For some instructions, folding a masked load is invalid even with the same mask.
+ // Folding is unsafe if an active destination element may read from a
+ // source element that is masked off.
+ if (isNonFoldableWithSameMask(MI.getOpcode()))
+ return nullptr;
bool HasSameMask = false;
for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) {
const MachineOperand &Op = MI.getOperand(I);
diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
index cbb7f89bee679..3500f445d240b 100644
--- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -51,6 +51,48 @@ static const std::set<StringRef> NoFoldSet = {
#include "X86ManualFoldTables.def"
};
+static const std::set<StringRef> NoFoldSameMaskPrefixSet = {
+#define NOFOLD_SAME_MASK_PREFIX(PREFIX) #PREFIX,
+#include "X86ManualFoldTables.def"
+};
+
+static const std::set<StringRef> NoFoldSameMaskSet = {
+#define NOFOLD_SAME_MASK(INSN) #INSN,
+#include "X86ManualFoldTables.def"
+};
+
+// Check if instruction is unsafe for masked-load folding.
+static bool isNoFoldMaskedInstruction(const CodeGenInstruction *Inst) {
+ StringRef Name = Inst->getName();
+
+ // First check exact instruction name
+ if (NoFoldSameMaskSet.count(Name))
+ return true;
+
+ // Then strip suffixes to get base name for prefix matching
+ // Strip k-register suffix: kz or k
+ if (Name.ends_with("kz"))
+ Name = Name.drop_back(2);
+ else if (Name.ends_with("k"))
+ Name = Name.drop_back(1);
+ else
+ return false; // Not a k-register instruction
+
+ // Strip operand form suffix (check longer patterns first)
+ if (Name.ends_with("rri"))
+ Name = Name.drop_back(3);
+ else if (Name.ends_with("rr") || Name.ends_with("ri"))
+ Name = Name.drop_back(2);
+
+ // Strip vector size suffix: Z128, Z256, or Z
+ if (Name.ends_with("Z128") || Name.ends_with("Z256"))
+ Name = Name.drop_back(4);
+ else if (Name.ends_with("Z"))
+ Name = Name.drop_back(1);
+
+ return NoFoldSameMaskPrefixSet.count(Name);
+}
+
static bool isExplicitAlign(const CodeGenInstruction *Inst) {
return any_of(ExplicitAlign, [Inst](const char *InstStr) {
return Inst->getName().contains(InstStr);
@@ -195,6 +237,7 @@ class X86FoldTablesEmitter {
FoldTable BroadcastTable2;
FoldTable BroadcastTable3;
FoldTable BroadcastTable4;
+ std::vector<const CodeGenInstruction *> NonFoldableWithSameMaskTable;
public:
X86FoldTablesEmitter(const RecordKeeper &R) : Records(R), Target(R) {}
@@ -230,6 +273,14 @@ class X86FoldTablesEmitter {
OS << "};\n\n";
}
+
+ void printTable(const std::vector<const CodeGenInstruction *> &Instructions,
+ StringRef TableName, raw_ostream &OS) {
+ OS << "static const unsigned " << TableName << "[] = {\n";
+ for (auto Inst : Instructions)
+ OS << " X86::" << Inst->getName() << ",\n";
+ OS << "};\n\n";
+ }
};
} // namespace
@@ -644,6 +695,13 @@ void X86FoldTablesEmitter::run(raw_ostream &OS) {
if (hasRSTRegClass(Inst) || hasPtrTailcallRegClass(Inst))
continue;
+ // Check if this instruction has a prefix in NoFoldSameMaskPrefixSet or is
+ // in NoFoldSameMaskSet (problematic for masked-load folding) and add to
+ // NonFoldableWithSameMaskTable.
+ if (isNoFoldMaskedInstruction(Inst)) {
+ NonFoldableWithSameMaskTable.push_back(Inst);
+ }
+
// Add all the memory form instructions to MemInsts, and all the register
// form instructions to RegInsts[Opc], where Opc is the opcode of each
// instructions. this helps reducing the runtime of the backend.
@@ -749,6 +807,7 @@ void X86FoldTablesEmitter::run(raw_ostream &OS) {
PRINT_TABLE(BroadcastTable2)
PRINT_TABLE(BroadcastTable3)
PRINT_TABLE(BroadcastTable4)
+ PRINT_TABLE(NonFoldableWithSameMaskTable)
}
static TableGen::Emitter::OptClass<X86FoldTablesEmitter>
diff --git a/llvm/utils/TableGen/X86ManualFoldTables.def b/llvm/utils/TableGen/X86ManualFoldTables.def
index 003712ae124c7..537ef96bce8ac 100644
--- a/llvm/utils/TableGen/X86ManualFoldTables.def
+++ b/llvm/utils/TableGen/X86ManualFoldTables.def
@@ -295,3 +295,107 @@ ENTRY(MOVSDrr, MOVLPDrm, TB_NO_REVERSE)
ENTRY(VMOVSDZrr, VMOVLPDZ128rm, TB_NO_REVERSE)
ENTRY(VMOVSDrr, VMOVLPDrm, TB_NO_REVERSE)
#undef ENTRY
+// Prefixes for instructions that are unsafe for masked-load folding.
+// Folding with the same mask is safe only if every active destination
+// element reads exclusively from source elements that are active under the same mask.
+// These instructions perform element rearrangement/broadcasting that may cause
+// active destination elements to read from masked-off source elements.
+#ifndef NOFOLD_SAME_MASK_PREFIX
+#define NOFOLD_SAME_MASK_PREFIX(PREFIX)
+#endif
+NOFOLD_SAME_MASK_PREFIX(VALIGND)
+NOFOLD_SAME_MASK_PREFIX(VALIGNQ)
+NOFOLD_SAME_MASK_PREFIX(VBROADCASTF32X2)
+NOFOLD_SAME_MASK_PREFIX(VBROADCASTI32X2)
+NOFOLD_SAME_MASK_PREFIX(VBROADCASTSD)
+NOFOLD_SAME_MASK_PREFIX(VBROADCASTSS)
+NOFOLD_SAME_MASK_PREFIX(VDBPSADBW)
+NOFOLD_SAME_MASK_PREFIX(VEXPANDPD)
+NOFOLD_SAME_MASK_PREFIX(VEXPANDPS)
+NOFOLD_SAME_MASK_PREFIX(VGF2P8AFFINEINVQB)
+NOFOLD_SAME_MASK_PREFIX(VGF2P8AFFINEQB)
+NOFOLD_SAME_MASK_PREFIX(VINSERTF32X4)
+NOFOLD_SAME_MASK_PREFIX(VINSERTF32X8)
+NOFOLD_SAME_MASK_PREFIX(VINSERTF64X2)
+NOFOLD_SAME_MASK_PREFIX(VINSERTF64X4)
+NOFOLD_SAME_MASK_PREFIX(VINSERTI32X4)
+NOFOLD_SAME_MASK_PREFIX(VINSERTI32X8)
+NOFOLD_SAME_MASK_PREFIX(VINSERTI64X2)
+NOFOLD_SAME_MASK_PREFIX(VINSERTI64X4)
+NOFOLD_SAME_MASK_PREFIX(VMOVDDUP)
+NOFOLD_SAME_MASK_PREFIX(VMOVSHDUP)
+NOFOLD_SAME_MASK_PREFIX(VMOVSLDUP)
+NOFOLD_SAME_MASK_PREFIX(VMPSADBW)
+NOFOLD_SAME_MASK_PREFIX(VPACKSSDW)
+NOFOLD_SAME_MASK_PREFIX(VPACKSSWB)
+NOFOLD_SAME_MASK_PREFIX(VPACKUSDW)
+NOFOLD_SAME_MASK_PREFIX(VPACKUSWB)
+NOFOLD_SAME_MASK_PREFIX(VPALIGNR)
+NOFOLD_SAME_MASK_PREFIX(VPBROADCASTB)
+NOFOLD_SAME_MASK_PREFIX(VPBROADCASTD)
+NOFOLD_SAME_MASK_PREFIX(VPBROADCASTQ)
+NOFOLD_SAME_MASK_PREFIX(VPBROADCASTW)
+NOFOLD_SAME_MASK_PREFIX(VPCONFLICTD)
+NOFOLD_SAME_MASK_PREFIX(VPCONFLICTQ)
+NOFOLD_SAME_MASK_PREFIX(VPERMB)
+NOFOLD_SAME_MASK_PREFIX(VPERMD)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2B)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2D)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2PD)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2PS)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2Q)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2W)
+NOFOLD_SAME_MASK_PREFIX(VPERMPD)
+NOFOLD_SAME_MASK_PREFIX(VPERMPS)
+NOFOLD_SAME_MASK_PREFIX(VPERMQ)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2B)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2D)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2PD)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2PS)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2Q)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2W)
+NOFOLD_SAME_MASK_PREFIX(VPERMW)
+NOFOLD_SAME_MASK_PREFIX(VPEXPANDB)
+NOFOLD_SAME_MASK_PREFIX(VPEXPANDD)
+NOFOLD_SAME_MASK_PREFIX(VPEXPANDQ)
+NOFOLD_SAME_MASK_PREFIX(VPEXPANDW)
+NOFOLD_SAME_MASK_PREFIX(VPMULTISHIFTQB)
+NOFOLD_SAME_MASK_PREFIX(VPSHUFD)
+NOFOLD_SAME_MASK_PREFIX(VPSHUFHW)
+NOFOLD_SAME_MASK_PREFIX(VPSHUFLW)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKHBW)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKHDQ)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKHQDQ)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKHWD)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKLBW)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKLDQ)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKLQDQ)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKLWD)
+NOFOLD_SAME_MASK_PREFIX(VSHUFF32X4)
+NOFOLD_SAME_MASK_PREFIX(VSHUFF64X2)
+NOFOLD_SAME_MASK_PREFIX(VSHUFI32X4)
+NOFOLD_SAME_MASK_PREFIX(VSHUFI64X2)
+NOFOLD_SAME_MASK_PREFIX(VSHUFPD)
+NOFOLD_SAME_MASK_PREFIX(VSHUFPS)
+NOFOLD_SAME_MASK_PREFIX(VUNPCKHPD)
+NOFOLD_SAME_MASK_PREFIX(VUNPCKHPS)
+NOFOLD_SAME_MASK_PREFIX(VUNPCKLPD)
+NOFOLD_SAME_MASK_PREFIX(VUNPCKLPS)
+#undef NOFOLD_SAME_MASK_PREFIX
+#ifndef NOFOLD_SAME_MASK
+#define NOFOLD_SAME_MASK(INSN)
+#endif
+// VPERMILPD/VPERMILPS: only the rik (immediate) forms are listed here; the rrk forms are NOT blocked.
+NOFOLD_SAME_MASK(VPERMILPDZ128rik)
+NOFOLD_SAME_MASK(VPERMILPDZ128rikz)
+NOFOLD_SAME_MASK(VPERMILPDZ256rik)
+NOFOLD_SAME_MASK(VPERMILPDZ256rikz)
+NOFOLD_SAME_MASK(VPERMILPDZrik)
+NOFOLD_SAME_MASK(VPERMILPDZrikz)
+NOFOLD_SAME_MASK(VPERMILPSZ128rik)
+NOFOLD_SAME_MASK(VPERMILPSZ128rikz)
+NOFOLD_SAME_MASK(VPERMILPSZ256rik)
+NOFOLD_SAME_MASK(VPERMILPSZ256rikz)
+NOFOLD_SAME_MASK(VPERMILPSZrik)
+NOFOLD_SAME_MASK(VPERMILPSZrikz)
+#undef NOFOLD_SAME_MASK
\ No newline at end of file
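To make the hazard concrete, here is a minimal scalar model of a VPERMD-style masked permute (an illustrative sketch in plain C++, not LLVM code; all names are made up). With zero-masking, folding the load applies the mask before the permute, so an active destination lane that indexes a masked-off source lane reads 0 instead of the value in memory:

#include <array>
#include <cstdint>
#include <cstdio>

using Vec4 = std::array<uint32_t, 4>;

// Masked load with zero-masking ({z}): inactive lanes become 0.
static Vec4 maskedLoadZ(const Vec4 &Mem, uint8_t Mask) {
  Vec4 R{};
  for (int I = 0; I < 4; ++I)
    if (Mask & (1u << I))
      R[I] = Mem[I];
  return R;
}

// Masked permute: each active destination lane reads Src[Idx[lane]],
// which may be an inactive lane of Src.
static Vec4 maskedPermZ(const Vec4 &Idx, const Vec4 &Src, uint8_t Mask) {
  Vec4 R{};
  for (int I = 0; I < 4; ++I)
    if (Mask & (1u << I))
      R[I] = Src[Idx[I] & 3];
  return R;
}

int main() {
  Vec4 Mem = {10, 20, 30, 40};
  Vec4 Idx = {0, 0, 0, 0}; // every destination lane reads source lane 0
  uint8_t Mask = 0b0010;   // only lane 1 is active

  Vec4 Unfolded = maskedPermZ(Idx, Mem, Mask);                  // lane 1 = 10
  Vec4 Folded = maskedPermZ(Idx, maskedLoadZ(Mem, Mask), Mask); // lane 1 = 0
  std::printf("lane 1: unfolded=%u folded=%u\n", Unfolded[1], Folded[1]);
  return 0;
}

This is exactly the condition stated in the X86ManualFoldTables.def comment: the fold is valid only when every active destination element reads from source elements that are active under the same mask, which element-rearranging and broadcasting instructions do not guarantee.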
>From 775f7ed7b13155498d8ca3c08df6a4e917fa7d31 Mon Sep 17 00:00:00 2001
From: Antoni Zwolski <antoni.zwolski at intel.com>
Date: Thu, 29 Jan 2026 16:04:00 +0100
Subject: [PATCH 2/6] [X86] Update masked load folding tests
---
llvm/test/CodeGen/X86/interleave-load-fold.ll | 17 +-
.../X86/non-foldable-with-the-same-mask.mir | 147 +++---
llvm/test/TableGen/x86-fold-tables.inc | 446 ++++++++++++++++++
3 files changed, 544 insertions(+), 66 deletions(-)
diff --git a/llvm/test/CodeGen/X86/interleave-load-fold.ll b/llvm/test/CodeGen/X86/interleave-load-fold.ll
index 28f313bf6a0fa..e2430ff5e1c03 100644
--- a/llvm/test/CodeGen/X86/interleave-load-fold.ll
+++ b/llvm/test/CodeGen/X86/interleave-load-fold.ll
@@ -5,8 +5,10 @@ define <16 x i8> @interleave_masked_select(ptr %mask, ptr %src) nounwind {
; X64-LABEL: interleave_masked_select:
; X64: # %bb.0:
; X64-NEXT: kmovw (%rdi), %k1
-; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; X64-NEXT: vpunpcklbw {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; X64-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1}
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpunpcklbw {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT: retq
%mask_vec = load <16 x i1>, ptr %mask
%vec2 = load <16 x i8>, ptr %src
@@ -19,12 +21,13 @@ define <16 x i8> @interleave_masked_select(ptr %mask, ptr %src) nounwind {
define <16 x i1> @interleave_masked_blend(i16 %mask, ptr %src1, ptr %src2) nounwind {
; X64-LABEL: interleave_masked_blend:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa (%rsi), %xmm0
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT: vpunpcklbw {{.*#+}} xmm2 {%k1} {z} = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; X64-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; X64-NEXT: vmovdqa (%rsi), %xmm0
+; X64-NEXT: vpblendmb (%rdx), %xmm0, %xmm1 {%k1}
+; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpunpcklbw {{.*#+}} xmm1 {%k1} {z} = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
%mask_vec = bitcast i16 %mask to <16 x i1>
%vec1 = load <16 x i8>, ptr %src1
diff --git a/llvm/test/CodeGen/X86/non-foldable-with-the-same-mask.mir b/llvm/test/CodeGen/X86/non-foldable-with-the-same-mask.mir
index 3281218848c0a..95cb460495512 100644
--- a/llvm/test/CodeGen/X86/non-foldable-with-the-same-mask.mir
+++ b/llvm/test/CodeGen/X86/non-foldable-with-the-same-mask.mir
@@ -21,9 +21,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VALIGNDZ128rmik:%[0-9]+]]:vr128 = VALIGNDZ128rmik [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 1 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VALIGNDZ128rmik]]
+ ; CHECK-NEXT: [[VALIGNDZ128rrik:%[0-9]+]]:vr128 = VALIGNDZ128rrik [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQA32Z128rmkz]], 1
+ ; CHECK-NEXT: $xmm0 = COPY [[VALIGNDZ128rrik]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -53,9 +54,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPALIGNRZ128rmikz:%[0-9]+]]:vr128 = VPALIGNRZ128rmikz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 4 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPALIGNRZ128rmikz]]
+ ; CHECK-NEXT: [[VPALIGNRZ128rrikz:%[0-9]+]]:vr128 = VPALIGNRZ128rrikz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU8Z128rmkz]], 4
+ ; CHECK-NEXT: $xmm0 = COPY [[VPALIGNRZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -84,8 +86,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VBROADCASTSSZ128rmk:%[0-9]+]]:vr128 = VBROADCASTSSZ128rmk [[AVX512_128_SET0_]], [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VBROADCASTSSZ128rmk]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VBROADCASTSSZ128rrk:%[0-9]+]]:vr128 = VBROADCASTSSZ128rrk [[AVX512_128_SET0_]], [[COPY]], [[VMOVAPSZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VBROADCASTSSZ128rrk]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -113,8 +116,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk2wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VMOVDDUPZ128rmkz:%[0-9]+]]:vr128 = VMOVDDUPZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VMOVDDUPZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVAPDZ128rmkz:%[0-9]+]]:vr128x = VMOVAPDZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VMOVDDUPZ128rrkz:%[0-9]+]]:vr128 = VMOVDDUPZ128rrkz [[COPY]], [[VMOVAPDZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VMOVDDUPZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk2wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -142,8 +146,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VMOVSHDUPZ128rmkz:%[0-9]+]]:vr128 = VMOVSHDUPZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VMOVSHDUPZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VMOVSHDUPZ128rrkz:%[0-9]+]]:vr128 = VMOVSHDUPZ128rrkz [[COPY]], [[VMOVAPSZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VMOVSHDUPZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -171,8 +176,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPBROADCASTDZ128rmkz:%[0-9]+]]:vr128 = VPBROADCASTDZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPBROADCASTDZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPBROADCASTDZ128rrkz:%[0-9]+]]:vr128 = VPBROADCASTDZ128rrkz [[COPY]], [[VMOVDQA32Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPBROADCASTDZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -201,9 +207,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VDBPSADBWZ128rmikz:%[0-9]+]]:vr128 = VDBPSADBWZ128rmikz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 0 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VDBPSADBWZ128rmikz]]
+ ; CHECK-NEXT: [[VDBPSADBWZ128rrikz:%[0-9]+]]:vr128 = VDBPSADBWZ128rrikz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU16Z128rmkz]], 0
+ ; CHECK-NEXT: $xmm0 = COPY [[VDBPSADBWZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -233,9 +240,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VGF2P8AFFINEQBZ128rmikz:%[0-9]+]]:vr128 = VGF2P8AFFINEQBZ128rmikz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 0 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VGF2P8AFFINEQBZ128rmikz]]
+ ; CHECK-NEXT: [[VGF2P8AFFINEQBZ128rrikz:%[0-9]+]]:vr128 = VGF2P8AFFINEQBZ128rrikz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU8Z128rmkz]], 0
+ ; CHECK-NEXT: $xmm0 = COPY [[VGF2P8AFFINEQBZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -265,9 +273,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VMPSADBWZ128rmikz:%[0-9]+]]:vr128 = VMPSADBWZ128rmikz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 0 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VMPSADBWZ128rmikz]]
+ ; CHECK-NEXT: [[VMPSADBWZ128rrikz:%[0-9]+]]:vr128 = VMPSADBWZ128rrikz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU16Z128rmkz]], 0
+ ; CHECK-NEXT: $xmm0 = COPY [[VMPSADBWZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -296,8 +305,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPCONFLICTDZ128rmk:%[0-9]+]]:vr128 = VPCONFLICTDZ128rmk [[AVX512_128_SET0_]], [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPCONFLICTDZ128rmk]]
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPCONFLICTDZ128rrk:%[0-9]+]]:vr128 = VPCONFLICTDZ128rrk [[AVX512_128_SET0_]], [[COPY]], [[VMOVDQA32Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPCONFLICTDZ128rrk]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -326,9 +336,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPMULTISHIFTQBZ128rmkz:%[0-9]+]]:vr128 = VPMULTISHIFTQBZ128rmkz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPMULTISHIFTQBZ128rmkz]]
+ ; CHECK-NEXT: [[VPMULTISHIFTQBZ128rrkz:%[0-9]+]]:vr128 = VPMULTISHIFTQBZ128rrkz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU8Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPMULTISHIFTQBZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -357,8 +368,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VEXPANDPSZ128rmk:%[0-9]+]]:vr128 = VEXPANDPSZ128rmk [[AVX512_128_SET0_]], [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VEXPANDPSZ128rmk]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VEXPANDPSZ128rrk:%[0-9]+]]:vr128 = VEXPANDPSZ128rrk [[AVX512_128_SET0_]], [[COPY]], [[VMOVAPSZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VEXPANDPSZ128rrk]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -385,11 +397,12 @@ body: |
; CHECK-LABEL: name: test_vinserti32x4_same_mask
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_256_SET0_:%[0-9]+]]:vr256x = AVX512_256_SET0
; CHECK-NEXT: [[AVX512_256_SET0_1:%[0-9]+]]:vr256x = AVX512_256_SET0
- ; CHECK-NEXT: [[VINSERTI32X4Z256rmikz:%[0-9]+]]:vr256 = VINSERTI32X4Z256rmikz [[COPY]], [[AVX512_256_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 1 :: (load (s128))
- ; CHECK-NEXT: $ymm0 = COPY [[VINSERTI32X4Z256rmikz]]
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VINSERTI32X4Z256rrikz:%[0-9]+]]:vr256 = VINSERTI32X4Z256rrikz [[COPY]], [[AVX512_256_SET0_1]], [[VMOVDQA32Z128rmkz]], 1
+ ; CHECK-NEXT: $ymm0 = COPY [[VINSERTI32X4Z256rrikz]]
; CHECK-NEXT: RET 0, $ymm0
%0:vk4wm = COPY $k1
%1:vr256x = AVX512_256_SET0
@@ -419,9 +432,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPACKSSDWZ128rmk:%[0-9]+]]:vr128 = VPACKSSDWZ128rmk [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPACKSSDWZ128rmk]]
+ ; CHECK-NEXT: [[VPACKSSDWZ128rrk:%[0-9]+]]:vr128 = VPACKSSDWZ128rrk [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU16Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPACKSSDWZ128rrk]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -451,9 +465,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_256_SET0_:%[0-9]+]]:vr256x = AVX512_256_SET0
+ ; CHECK-NEXT: [[VMOVDQA32Z256rmkz:%[0-9]+]]:vr256x = VMOVDQA32Z256rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
; CHECK-NEXT: [[AVX512_256_SET0_1:%[0-9]+]]:vr256x = AVX512_256_SET0
- ; CHECK-NEXT: [[VPERMDZ256rmk:%[0-9]+]]:vr256 = VPERMDZ256rmk [[AVX512_256_SET0_]], [[COPY]], [[AVX512_256_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
- ; CHECK-NEXT: $ymm0 = COPY [[VPERMDZ256rmk]]
+ ; CHECK-NEXT: [[VPERMDZ256rrk:%[0-9]+]]:vr256 = VPERMDZ256rrk [[AVX512_256_SET0_]], [[COPY]], [[AVX512_256_SET0_1]], [[VMOVDQA32Z256rmkz]]
+ ; CHECK-NEXT: $ymm0 = COPY [[VPERMDZ256rrk]]
; CHECK-NEXT: RET 0, $ymm0
%0:vk8wm = COPY $k1
%1:vr256x = AVX512_256_SET0
@@ -483,9 +498,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPERMI2DZ128rmkz:%[0-9]+]]:vr128x = VPERMI2DZ128rmkz [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPERMI2DZ128rmkz]]
+ ; CHECK-NEXT: [[VPERMI2DZ128rrkz:%[0-9]+]]:vr128x = VPERMI2DZ128rrkz [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQA32Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPERMI2DZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -512,8 +528,9 @@ body: |
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
- ; CHECK-NEXT: [[VPERMILPSZ128mikz:%[0-9]+]]:vr128 = VPERMILPSZ128mikz [[COPY]], $rdi, 1, $noreg, 0, $noreg, 27 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPERMILPSZ128mikz]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPERMILPSZ128rikz:%[0-9]+]]:vr128 = VPERMILPSZ128rikz [[COPY]], [[VMOVAPSZ128rmkz]], 27
+ ; CHECK-NEXT: $xmm0 = COPY [[VPERMILPSZ128rikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = VMOVAPSZ128rmkz %0, $rdi, 1, $noreg, 0, $noreg :: (load (s128))
@@ -541,9 +558,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_256_SET0_:%[0-9]+]]:vr256x = AVX512_256_SET0
+ ; CHECK-NEXT: [[VMOVAPSZ256rmkz:%[0-9]+]]:vr256x = VMOVAPSZ256rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
; CHECK-NEXT: [[AVX512_256_SET0_1:%[0-9]+]]:vr256x = AVX512_256_SET0
- ; CHECK-NEXT: [[VPERMPSZ256rmkz:%[0-9]+]]:vr256 = VPERMPSZ256rmkz [[COPY]], [[AVX512_256_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
- ; CHECK-NEXT: $ymm0 = COPY [[VPERMPSZ256rmkz]]
+ ; CHECK-NEXT: [[VPERMPSZ256rrkz:%[0-9]+]]:vr256 = VPERMPSZ256rrkz [[COPY]], [[AVX512_256_SET0_1]], [[VMOVAPSZ256rmkz]]
+ ; CHECK-NEXT: $ymm0 = COPY [[VPERMPSZ256rrkz]]
; CHECK-NEXT: RET 0, $ymm0
%0:vk8wm = COPY $k1
%1:vr256x = AVX512_256_SET0
@@ -573,9 +591,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPERMT2DZ128rmkz:%[0-9]+]]:vr128x = VPERMT2DZ128rmkz [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPERMT2DZ128rmkz]]
+ ; CHECK-NEXT: [[VPERMT2DZ128rrkz:%[0-9]+]]:vr128x = VPERMT2DZ128rrkz [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQA32Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPERMT2DZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -605,9 +624,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPERMWZ128rmkz:%[0-9]+]]:vr128 = VPERMWZ128rmkz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPERMWZ128rmkz]]
+ ; CHECK-NEXT: [[VPERMWZ128rrkz:%[0-9]+]]:vr128 = VPERMWZ128rrkz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU16Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPERMWZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -634,8 +654,9 @@ body: |
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
- ; CHECK-NEXT: [[VPSHUFDZ128mikz:%[0-9]+]]:vr128 = VPSHUFDZ128mikz [[COPY]], $rdi, 1, $noreg, 0, $noreg, 27 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFDZ128mikz]]
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPSHUFDZ128rikz:%[0-9]+]]:vr128 = VPSHUFDZ128rikz [[COPY]], [[VMOVDQA32Z128rmkz]], 27
+ ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFDZ128rikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = VMOVDQA32Z128rmkz %0, $rdi, 1, $noreg, 0, $noreg :: (load (s128))
@@ -660,8 +681,9 @@ body: |
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
- ; CHECK-NEXT: [[VPSHUFHWZ128mikz:%[0-9]+]]:vr128 = VPSHUFHWZ128mikz [[COPY]], $rdi, 1, $noreg, 0, $noreg, 27 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFHWZ128mikz]]
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPSHUFHWZ128rikz:%[0-9]+]]:vr128 = VPSHUFHWZ128rikz [[COPY]], [[VMOVDQU16Z128rmkz]], 27
+ ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFHWZ128rikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = VMOVDQU16Z128rmkz %0, $rdi, 1, $noreg, 0, $noreg :: (load (s128))
@@ -686,8 +708,9 @@ body: |
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
- ; CHECK-NEXT: [[VPSHUFLWZ128mikz:%[0-9]+]]:vr128 = VPSHUFLWZ128mikz [[COPY]], $rdi, 1, $noreg, 0, $noreg, 27 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFLWZ128mikz]]
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPSHUFLWZ128rikz:%[0-9]+]]:vr128 = VPSHUFLWZ128rikz [[COPY]], [[VMOVDQU16Z128rmkz]], 27
+ ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFLWZ128rikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = VMOVDQU16Z128rmkz %0, $rdi, 1, $noreg, 0, $noreg :: (load (s128))
@@ -715,9 +738,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_256_SET0_:%[0-9]+]]:vr256x = AVX512_256_SET0
+ ; CHECK-NEXT: [[VMOVAPSZ256rmkz:%[0-9]+]]:vr256x = VMOVAPSZ256rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
; CHECK-NEXT: [[AVX512_256_SET0_1:%[0-9]+]]:vr256x = AVX512_256_SET0
- ; CHECK-NEXT: [[VSHUFF32X4Z256rmikz:%[0-9]+]]:vr256 = VSHUFF32X4Z256rmikz [[COPY]], [[AVX512_256_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 2 :: (load (s256))
- ; CHECK-NEXT: $ymm0 = COPY [[VSHUFF32X4Z256rmikz]]
+ ; CHECK-NEXT: [[VSHUFF32X4Z256rrikz:%[0-9]+]]:vr256 = VSHUFF32X4Z256rrikz [[COPY]], [[AVX512_256_SET0_1]], [[VMOVAPSZ256rmkz]], 2
+ ; CHECK-NEXT: $ymm0 = COPY [[VSHUFF32X4Z256rrikz]]
; CHECK-NEXT: RET 0, $ymm0
%0:vk8wm = COPY $k1
%1:vr256x = AVX512_256_SET0
@@ -746,8 +770,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk2wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VSHUFPDZ128rmikz:%[0-9]+]]:vr128 = VSHUFPDZ128rmikz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg, 1 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VSHUFPDZ128rmikz]]
+ ; CHECK-NEXT: [[VMOVAPDZ128rmkz:%[0-9]+]]:vr128x = VMOVAPDZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VSHUFPDZ128rrikz:%[0-9]+]]:vr128 = VSHUFPDZ128rrikz [[COPY]], [[AVX512_128_SET0_]], [[VMOVAPDZ128rmkz]], 1
+ ; CHECK-NEXT: $xmm0 = COPY [[VSHUFPDZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk2wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -775,8 +800,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VSHUFPSZ128rmikz:%[0-9]+]]:vr128 = VSHUFPSZ128rmikz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg, 68 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VSHUFPSZ128rmikz]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VSHUFPSZ128rrikz:%[0-9]+]]:vr128 = VSHUFPSZ128rrikz [[COPY]], [[AVX512_128_SET0_]], [[VMOVAPSZ128rmkz]], 68
+ ; CHECK-NEXT: $xmm0 = COPY [[VSHUFPSZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -804,8 +830,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPUNPCKHBWZ128rmkz:%[0-9]+]]:vr128 = VPUNPCKHBWZ128rmkz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPUNPCKHBWZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPUNPCKHBWZ128rrkz:%[0-9]+]]:vr128 = VPUNPCKHBWZ128rrkz [[COPY]], [[AVX512_128_SET0_]], [[VMOVDQU8Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPUNPCKHBWZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -833,8 +860,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPUNPCKLBWZ128rmkz:%[0-9]+]]:vr128 = VPUNPCKLBWZ128rmkz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPUNPCKLBWZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPUNPCKLBWZ128rrkz:%[0-9]+]]:vr128 = VPUNPCKLBWZ128rrkz [[COPY]], [[AVX512_128_SET0_]], [[VMOVDQU8Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPUNPCKLBWZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -862,8 +890,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VUNPCKLPSZ128rmkz:%[0-9]+]]:vr128 = VUNPCKLPSZ128rmkz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VUNPCKLPSZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VUNPCKLPSZ128rrkz:%[0-9]+]]:vr128 = VUNPCKLPSZ128rrkz [[COPY]], [[AVX512_128_SET0_]], [[VMOVAPSZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VUNPCKLPSZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index bafc98a69ddae..d8aec6d58c449 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -10749,3 +10749,449 @@ static const X86FoldTableEntry BroadcastTable4[] = {
{X86::VXORPSZrrk, X86::VXORPSZrmbk, TB_BCAST_SS},
};
+static const unsigned NonFoldableWithSameMaskTable[] = {
+ X86::VALIGNDZ128rrik,
+ X86::VALIGNDZ128rrikz,
+ X86::VALIGNDZ256rrik,
+ X86::VALIGNDZ256rrikz,
+ X86::VALIGNDZrrik,
+ X86::VALIGNDZrrikz,
+ X86::VALIGNQZ128rrik,
+ X86::VALIGNQZ128rrikz,
+ X86::VALIGNQZ256rrik,
+ X86::VALIGNQZ256rrikz,
+ X86::VALIGNQZrrik,
+ X86::VALIGNQZrrikz,
+ X86::VBROADCASTF32X2Z256rrk,
+ X86::VBROADCASTF32X2Z256rrkz,
+ X86::VBROADCASTF32X2Zrrk,
+ X86::VBROADCASTF32X2Zrrkz,
+ X86::VBROADCASTI32X2Z128rrk,
+ X86::VBROADCASTI32X2Z128rrkz,
+ X86::VBROADCASTI32X2Z256rrk,
+ X86::VBROADCASTI32X2Z256rrkz,
+ X86::VBROADCASTI32X2Zrrk,
+ X86::VBROADCASTI32X2Zrrkz,
+ X86::VBROADCASTSDZ256rrk,
+ X86::VBROADCASTSDZ256rrkz,
+ X86::VBROADCASTSDZrrk,
+ X86::VBROADCASTSDZrrkz,
+ X86::VBROADCASTSSZ128rrk,
+ X86::VBROADCASTSSZ128rrkz,
+ X86::VBROADCASTSSZ256rrk,
+ X86::VBROADCASTSSZ256rrkz,
+ X86::VBROADCASTSSZrrk,
+ X86::VBROADCASTSSZrrkz,
+ X86::VDBPSADBWZ128rrik,
+ X86::VDBPSADBWZ128rrikz,
+ X86::VDBPSADBWZ256rrik,
+ X86::VDBPSADBWZ256rrikz,
+ X86::VDBPSADBWZrrik,
+ X86::VDBPSADBWZrrikz,
+ X86::VEXPANDPDZ128rrk,
+ X86::VEXPANDPDZ128rrkz,
+ X86::VEXPANDPDZ256rrk,
+ X86::VEXPANDPDZ256rrkz,
+ X86::VEXPANDPDZrrk,
+ X86::VEXPANDPDZrrkz,
+ X86::VEXPANDPSZ128rrk,
+ X86::VEXPANDPSZ128rrkz,
+ X86::VEXPANDPSZ256rrk,
+ X86::VEXPANDPSZ256rrkz,
+ X86::VEXPANDPSZrrk,
+ X86::VEXPANDPSZrrkz,
+ X86::VGF2P8AFFINEINVQBZ128rrik,
+ X86::VGF2P8AFFINEINVQBZ128rrikz,
+ X86::VGF2P8AFFINEINVQBZ256rrik,
+ X86::VGF2P8AFFINEINVQBZ256rrikz,
+ X86::VGF2P8AFFINEINVQBZrrik,
+ X86::VGF2P8AFFINEINVQBZrrikz,
+ X86::VGF2P8AFFINEQBZ128rrik,
+ X86::VGF2P8AFFINEQBZ128rrikz,
+ X86::VGF2P8AFFINEQBZ256rrik,
+ X86::VGF2P8AFFINEQBZ256rrikz,
+ X86::VGF2P8AFFINEQBZrrik,
+ X86::VGF2P8AFFINEQBZrrikz,
+ X86::VINSERTF32X4Z256rrik,
+ X86::VINSERTF32X4Z256rrikz,
+ X86::VINSERTF32X4Zrrik,
+ X86::VINSERTF32X4Zrrikz,
+ X86::VINSERTF32X8Zrrik,
+ X86::VINSERTF32X8Zrrikz,
+ X86::VINSERTF64X2Z256rrik,
+ X86::VINSERTF64X2Z256rrikz,
+ X86::VINSERTF64X2Zrrik,
+ X86::VINSERTF64X2Zrrikz,
+ X86::VINSERTF64X4Zrrik,
+ X86::VINSERTF64X4Zrrikz,
+ X86::VINSERTI32X4Z256rrik,
+ X86::VINSERTI32X4Z256rrikz,
+ X86::VINSERTI32X4Zrrik,
+ X86::VINSERTI32X4Zrrikz,
+ X86::VINSERTI32X8Zrrik,
+ X86::VINSERTI32X8Zrrikz,
+ X86::VINSERTI64X2Z256rrik,
+ X86::VINSERTI64X2Z256rrikz,
+ X86::VINSERTI64X2Zrrik,
+ X86::VINSERTI64X2Zrrikz,
+ X86::VINSERTI64X4Zrrik,
+ X86::VINSERTI64X4Zrrikz,
+ X86::VMOVDDUPZ128rrk,
+ X86::VMOVDDUPZ128rrkz,
+ X86::VMOVDDUPZ256rrk,
+ X86::VMOVDDUPZ256rrkz,
+ X86::VMOVDDUPZrrk,
+ X86::VMOVDDUPZrrkz,
+ X86::VMOVSHDUPZ128rrk,
+ X86::VMOVSHDUPZ128rrkz,
+ X86::VMOVSHDUPZ256rrk,
+ X86::VMOVSHDUPZ256rrkz,
+ X86::VMOVSHDUPZrrk,
+ X86::VMOVSHDUPZrrkz,
+ X86::VMOVSLDUPZ128rrk,
+ X86::VMOVSLDUPZ128rrkz,
+ X86::VMOVSLDUPZ256rrk,
+ X86::VMOVSLDUPZ256rrkz,
+ X86::VMOVSLDUPZrrk,
+ X86::VMOVSLDUPZrrkz,
+ X86::VMPSADBWZ128rrik,
+ X86::VMPSADBWZ128rrikz,
+ X86::VMPSADBWZ256rrik,
+ X86::VMPSADBWZ256rrikz,
+ X86::VMPSADBWZrrik,
+ X86::VMPSADBWZrrikz,
+ X86::VPACKSSDWZ128rrk,
+ X86::VPACKSSDWZ128rrkz,
+ X86::VPACKSSDWZ256rrk,
+ X86::VPACKSSDWZ256rrkz,
+ X86::VPACKSSDWZrrk,
+ X86::VPACKSSDWZrrkz,
+ X86::VPACKSSWBZ128rrk,
+ X86::VPACKSSWBZ128rrkz,
+ X86::VPACKSSWBZ256rrk,
+ X86::VPACKSSWBZ256rrkz,
+ X86::VPACKSSWBZrrk,
+ X86::VPACKSSWBZrrkz,
+ X86::VPACKUSDWZ128rrk,
+ X86::VPACKUSDWZ128rrkz,
+ X86::VPACKUSDWZ256rrk,
+ X86::VPACKUSDWZ256rrkz,
+ X86::VPACKUSDWZrrk,
+ X86::VPACKUSDWZrrkz,
+ X86::VPACKUSWBZ128rrk,
+ X86::VPACKUSWBZ128rrkz,
+ X86::VPACKUSWBZ256rrk,
+ X86::VPACKUSWBZ256rrkz,
+ X86::VPACKUSWBZrrk,
+ X86::VPACKUSWBZrrkz,
+ X86::VPALIGNRZ128rrik,
+ X86::VPALIGNRZ128rrikz,
+ X86::VPALIGNRZ256rrik,
+ X86::VPALIGNRZ256rrikz,
+ X86::VPALIGNRZrrik,
+ X86::VPALIGNRZrrikz,
+ X86::VPBROADCASTBZ128rrk,
+ X86::VPBROADCASTBZ128rrkz,
+ X86::VPBROADCASTBZ256rrk,
+ X86::VPBROADCASTBZ256rrkz,
+ X86::VPBROADCASTBZrrk,
+ X86::VPBROADCASTBZrrkz,
+ X86::VPBROADCASTDZ128rrk,
+ X86::VPBROADCASTDZ128rrkz,
+ X86::VPBROADCASTDZ256rrk,
+ X86::VPBROADCASTDZ256rrkz,
+ X86::VPBROADCASTDZrrk,
+ X86::VPBROADCASTDZrrkz,
+ X86::VPBROADCASTQZ128rrk,
+ X86::VPBROADCASTQZ128rrkz,
+ X86::VPBROADCASTQZ256rrk,
+ X86::VPBROADCASTQZ256rrkz,
+ X86::VPBROADCASTQZrrk,
+ X86::VPBROADCASTQZrrkz,
+ X86::VPBROADCASTWZ128rrk,
+ X86::VPBROADCASTWZ128rrkz,
+ X86::VPBROADCASTWZ256rrk,
+ X86::VPBROADCASTWZ256rrkz,
+ X86::VPBROADCASTWZrrk,
+ X86::VPBROADCASTWZrrkz,
+ X86::VPCONFLICTDZ128rrk,
+ X86::VPCONFLICTDZ128rrkz,
+ X86::VPCONFLICTDZ256rrk,
+ X86::VPCONFLICTDZ256rrkz,
+ X86::VPCONFLICTDZrrk,
+ X86::VPCONFLICTDZrrkz,
+ X86::VPCONFLICTQZ128rrk,
+ X86::VPCONFLICTQZ128rrkz,
+ X86::VPCONFLICTQZ256rrk,
+ X86::VPCONFLICTQZ256rrkz,
+ X86::VPCONFLICTQZrrk,
+ X86::VPCONFLICTQZrrkz,
+ X86::VPERMBZ128rrk,
+ X86::VPERMBZ128rrkz,
+ X86::VPERMBZ256rrk,
+ X86::VPERMBZ256rrkz,
+ X86::VPERMBZrrk,
+ X86::VPERMBZrrkz,
+ X86::VPERMDZ256rrk,
+ X86::VPERMDZ256rrkz,
+ X86::VPERMDZrrk,
+ X86::VPERMDZrrkz,
+ X86::VPERMI2BZ128rrk,
+ X86::VPERMI2BZ128rrkz,
+ X86::VPERMI2BZ256rrk,
+ X86::VPERMI2BZ256rrkz,
+ X86::VPERMI2BZrrk,
+ X86::VPERMI2BZrrkz,
+ X86::VPERMI2DZ128rrk,
+ X86::VPERMI2DZ128rrkz,
+ X86::VPERMI2DZ256rrk,
+ X86::VPERMI2DZ256rrkz,
+ X86::VPERMI2DZrrk,
+ X86::VPERMI2DZrrkz,
+ X86::VPERMI2PDZ128rrk,
+ X86::VPERMI2PDZ128rrkz,
+ X86::VPERMI2PDZ256rrk,
+ X86::VPERMI2PDZ256rrkz,
+ X86::VPERMI2PDZrrk,
+ X86::VPERMI2PDZrrkz,
+ X86::VPERMI2PSZ128rrk,
+ X86::VPERMI2PSZ128rrkz,
+ X86::VPERMI2PSZ256rrk,
+ X86::VPERMI2PSZ256rrkz,
+ X86::VPERMI2PSZrrk,
+ X86::VPERMI2PSZrrkz,
+ X86::VPERMI2QZ128rrk,
+ X86::VPERMI2QZ128rrkz,
+ X86::VPERMI2QZ256rrk,
+ X86::VPERMI2QZ256rrkz,
+ X86::VPERMI2QZrrk,
+ X86::VPERMI2QZrrkz,
+ X86::VPERMI2WZ128rrk,
+ X86::VPERMI2WZ128rrkz,
+ X86::VPERMI2WZ256rrk,
+ X86::VPERMI2WZ256rrkz,
+ X86::VPERMI2WZrrk,
+ X86::VPERMI2WZrrkz,
+ X86::VPERMILPDZ128rik,
+ X86::VPERMILPDZ128rikz,
+ X86::VPERMILPDZ256rik,
+ X86::VPERMILPDZ256rikz,
+ X86::VPERMILPDZrik,
+ X86::VPERMILPDZrikz,
+ X86::VPERMILPSZ128rik,
+ X86::VPERMILPSZ128rikz,
+ X86::VPERMILPSZ256rik,
+ X86::VPERMILPSZ256rikz,
+ X86::VPERMILPSZrik,
+ X86::VPERMILPSZrikz,
+ X86::VPERMPDZ256rik,
+ X86::VPERMPDZ256rikz,
+ X86::VPERMPDZ256rrk,
+ X86::VPERMPDZ256rrkz,
+ X86::VPERMPDZrik,
+ X86::VPERMPDZrikz,
+ X86::VPERMPDZrrk,
+ X86::VPERMPDZrrkz,
+ X86::VPERMPSZ256rrk,
+ X86::VPERMPSZ256rrkz,
+ X86::VPERMPSZrrk,
+ X86::VPERMPSZrrkz,
+ X86::VPERMQZ256rik,
+ X86::VPERMQZ256rikz,
+ X86::VPERMQZ256rrk,
+ X86::VPERMQZ256rrkz,
+ X86::VPERMQZrik,
+ X86::VPERMQZrikz,
+ X86::VPERMQZrrk,
+ X86::VPERMQZrrkz,
+ X86::VPERMT2BZ128rrk,
+ X86::VPERMT2BZ128rrkz,
+ X86::VPERMT2BZ256rrk,
+ X86::VPERMT2BZ256rrkz,
+ X86::VPERMT2BZrrk,
+ X86::VPERMT2BZrrkz,
+ X86::VPERMT2DZ128rrk,
+ X86::VPERMT2DZ128rrkz,
+ X86::VPERMT2DZ256rrk,
+ X86::VPERMT2DZ256rrkz,
+ X86::VPERMT2DZrrk,
+ X86::VPERMT2DZrrkz,
+ X86::VPERMT2PDZ128rrk,
+ X86::VPERMT2PDZ128rrkz,
+ X86::VPERMT2PDZ256rrk,
+ X86::VPERMT2PDZ256rrkz,
+ X86::VPERMT2PDZrrk,
+ X86::VPERMT2PDZrrkz,
+ X86::VPERMT2PSZ128rrk,
+ X86::VPERMT2PSZ128rrkz,
+ X86::VPERMT2PSZ256rrk,
+ X86::VPERMT2PSZ256rrkz,
+ X86::VPERMT2PSZrrk,
+ X86::VPERMT2PSZrrkz,
+ X86::VPERMT2QZ128rrk,
+ X86::VPERMT2QZ128rrkz,
+ X86::VPERMT2QZ256rrk,
+ X86::VPERMT2QZ256rrkz,
+ X86::VPERMT2QZrrk,
+ X86::VPERMT2QZrrkz,
+ X86::VPERMT2WZ128rrk,
+ X86::VPERMT2WZ128rrkz,
+ X86::VPERMT2WZ256rrk,
+ X86::VPERMT2WZ256rrkz,
+ X86::VPERMT2WZrrk,
+ X86::VPERMT2WZrrkz,
+ X86::VPERMWZ128rrk,
+ X86::VPERMWZ128rrkz,
+ X86::VPERMWZ256rrk,
+ X86::VPERMWZ256rrkz,
+ X86::VPERMWZrrk,
+ X86::VPERMWZrrkz,
+ X86::VPEXPANDBZ128rrk,
+ X86::VPEXPANDBZ128rrkz,
+ X86::VPEXPANDBZ256rrk,
+ X86::VPEXPANDBZ256rrkz,
+ X86::VPEXPANDBZrrk,
+ X86::VPEXPANDBZrrkz,
+ X86::VPEXPANDDZ128rrk,
+ X86::VPEXPANDDZ128rrkz,
+ X86::VPEXPANDDZ256rrk,
+ X86::VPEXPANDDZ256rrkz,
+ X86::VPEXPANDDZrrk,
+ X86::VPEXPANDDZrrkz,
+ X86::VPEXPANDQZ128rrk,
+ X86::VPEXPANDQZ128rrkz,
+ X86::VPEXPANDQZ256rrk,
+ X86::VPEXPANDQZ256rrkz,
+ X86::VPEXPANDQZrrk,
+ X86::VPEXPANDQZrrkz,
+ X86::VPEXPANDWZ128rrk,
+ X86::VPEXPANDWZ128rrkz,
+ X86::VPEXPANDWZ256rrk,
+ X86::VPEXPANDWZ256rrkz,
+ X86::VPEXPANDWZrrk,
+ X86::VPEXPANDWZrrkz,
+ X86::VPMULTISHIFTQBZ128rrk,
+ X86::VPMULTISHIFTQBZ128rrkz,
+ X86::VPMULTISHIFTQBZ256rrk,
+ X86::VPMULTISHIFTQBZ256rrkz,
+ X86::VPMULTISHIFTQBZrrk,
+ X86::VPMULTISHIFTQBZrrkz,
+ X86::VPSHUFDZ128rik,
+ X86::VPSHUFDZ128rikz,
+ X86::VPSHUFDZ256rik,
+ X86::VPSHUFDZ256rikz,
+ X86::VPSHUFDZrik,
+ X86::VPSHUFDZrikz,
+ X86::VPSHUFHWZ128rik,
+ X86::VPSHUFHWZ128rikz,
+ X86::VPSHUFHWZ256rik,
+ X86::VPSHUFHWZ256rikz,
+ X86::VPSHUFHWZrik,
+ X86::VPSHUFHWZrikz,
+ X86::VPSHUFLWZ128rik,
+ X86::VPSHUFLWZ128rikz,
+ X86::VPSHUFLWZ256rik,
+ X86::VPSHUFLWZ256rikz,
+ X86::VPSHUFLWZrik,
+ X86::VPSHUFLWZrikz,
+ X86::VPUNPCKHBWZ128rrk,
+ X86::VPUNPCKHBWZ128rrkz,
+ X86::VPUNPCKHBWZ256rrk,
+ X86::VPUNPCKHBWZ256rrkz,
+ X86::VPUNPCKHBWZrrk,
+ X86::VPUNPCKHBWZrrkz,
+ X86::VPUNPCKHDQZ128rrk,
+ X86::VPUNPCKHDQZ128rrkz,
+ X86::VPUNPCKHDQZ256rrk,
+ X86::VPUNPCKHDQZ256rrkz,
+ X86::VPUNPCKHDQZrrk,
+ X86::VPUNPCKHDQZrrkz,
+ X86::VPUNPCKHQDQZ128rrk,
+ X86::VPUNPCKHQDQZ128rrkz,
+ X86::VPUNPCKHQDQZ256rrk,
+ X86::VPUNPCKHQDQZ256rrkz,
+ X86::VPUNPCKHQDQZrrk,
+ X86::VPUNPCKHQDQZrrkz,
+ X86::VPUNPCKHWDZ128rrk,
+ X86::VPUNPCKHWDZ128rrkz,
+ X86::VPUNPCKHWDZ256rrk,
+ X86::VPUNPCKHWDZ256rrkz,
+ X86::VPUNPCKHWDZrrk,
+ X86::VPUNPCKHWDZrrkz,
+ X86::VPUNPCKLBWZ128rrk,
+ X86::VPUNPCKLBWZ128rrkz,
+ X86::VPUNPCKLBWZ256rrk,
+ X86::VPUNPCKLBWZ256rrkz,
+ X86::VPUNPCKLBWZrrk,
+ X86::VPUNPCKLBWZrrkz,
+ X86::VPUNPCKLDQZ128rrk,
+ X86::VPUNPCKLDQZ128rrkz,
+ X86::VPUNPCKLDQZ256rrk,
+ X86::VPUNPCKLDQZ256rrkz,
+ X86::VPUNPCKLDQZrrk,
+ X86::VPUNPCKLDQZrrkz,
+ X86::VPUNPCKLQDQZ128rrk,
+ X86::VPUNPCKLQDQZ128rrkz,
+ X86::VPUNPCKLQDQZ256rrk,
+ X86::VPUNPCKLQDQZ256rrkz,
+ X86::VPUNPCKLQDQZrrk,
+ X86::VPUNPCKLQDQZrrkz,
+ X86::VPUNPCKLWDZ128rrk,
+ X86::VPUNPCKLWDZ128rrkz,
+ X86::VPUNPCKLWDZ256rrk,
+ X86::VPUNPCKLWDZ256rrkz,
+ X86::VPUNPCKLWDZrrk,
+ X86::VPUNPCKLWDZrrkz,
+ X86::VSHUFF32X4Z256rrik,
+ X86::VSHUFF32X4Z256rrikz,
+ X86::VSHUFF32X4Zrrik,
+ X86::VSHUFF32X4Zrrikz,
+ X86::VSHUFF64X2Z256rrik,
+ X86::VSHUFF64X2Z256rrikz,
+ X86::VSHUFF64X2Zrrik,
+ X86::VSHUFF64X2Zrrikz,
+ X86::VSHUFI32X4Z256rrik,
+ X86::VSHUFI32X4Z256rrikz,
+ X86::VSHUFI32X4Zrrik,
+ X86::VSHUFI32X4Zrrikz,
+ X86::VSHUFI64X2Z256rrik,
+ X86::VSHUFI64X2Z256rrikz,
+ X86::VSHUFI64X2Zrrik,
+ X86::VSHUFI64X2Zrrikz,
+ X86::VSHUFPDZ128rrik,
+ X86::VSHUFPDZ128rrikz,
+ X86::VSHUFPDZ256rrik,
+ X86::VSHUFPDZ256rrikz,
+ X86::VSHUFPDZrrik,
+ X86::VSHUFPDZrrikz,
+ X86::VSHUFPSZ128rrik,
+ X86::VSHUFPSZ128rrikz,
+ X86::VSHUFPSZ256rrik,
+ X86::VSHUFPSZ256rrikz,
+ X86::VSHUFPSZrrik,
+ X86::VSHUFPSZrrikz,
+ X86::VUNPCKHPDZ128rrk,
+ X86::VUNPCKHPDZ128rrkz,
+ X86::VUNPCKHPDZ256rrk,
+ X86::VUNPCKHPDZ256rrkz,
+ X86::VUNPCKHPDZrrk,
+ X86::VUNPCKHPDZrrkz,
+ X86::VUNPCKHPSZ128rrk,
+ X86::VUNPCKHPSZ128rrkz,
+ X86::VUNPCKHPSZ256rrk,
+ X86::VUNPCKHPSZ256rrkz,
+ X86::VUNPCKHPSZrrk,
+ X86::VUNPCKHPSZrrkz,
+ X86::VUNPCKLPDZ128rrk,
+ X86::VUNPCKLPDZ128rrkz,
+ X86::VUNPCKLPDZ256rrk,
+ X86::VUNPCKLPDZ256rrkz,
+ X86::VUNPCKLPDZrrk,
+ X86::VUNPCKLPDZrrkz,
+ X86::VUNPCKLPSZ128rrk,
+ X86::VUNPCKLPSZ128rrkz,
+ X86::VUNPCKLPSZ256rrk,
+ X86::VUNPCKLPSZ256rrkz,
+ X86::VUNPCKLPSZrrk,
+ X86::VUNPCKLPSZrrkz,
+};
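One invariant worth noting: isNonFoldableWithSameMask (patch 1) searches this array with llvm::lower_bound, which is only correct if the entries are sorted by opcode value; the emitter appends them in the order it visits instructions, which is assumed here to follow the opcode enumeration. A hypothetical standalone illustration of that lookup pattern ("Table" is a stand-in, not the real array):

#include <algorithm>
#include <cassert>
#include <iterator>

static const unsigned Table[] = {3, 17, 42}; // stand-in opcode values, sorted

static bool containsOpcode(unsigned Op) {
  // Binary search is only valid on a sorted table; a debug assertion like
  // this would catch an out-of-order emission.
  assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
         "table must be sorted by opcode value");
  const unsigned *I = std::lower_bound(std::begin(Table), std::end(Table), Op);
  return I != std::end(Table) && *I == Op;
}

int main() { return containsOpcode(17) ? 0 : 1; }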
>From 771419dfed88c24b663e6a2ab211ec48a126aded Mon Sep 17 00:00:00 2001
From: Antoni Zwolski <antoni.zwolski at intel.com>
Date: Fri, 30 Jan 2026 15:00:07 +0100
Subject: [PATCH 3/6] [X86] Add comment to clarify NOFOLD_SAME_MASK_PREFIX
usage
---
llvm/utils/TableGen/X86ManualFoldTables.def | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/utils/TableGen/X86ManualFoldTables.def b/llvm/utils/TableGen/X86ManualFoldTables.def
index 537ef96bce8ac..d1cd3e9d46260 100644
--- a/llvm/utils/TableGen/X86ManualFoldTables.def
+++ b/llvm/utils/TableGen/X86ManualFoldTables.def
@@ -300,6 +300,7 @@ ENTRY(VMOVSDrr, VMOVLPDrm, TB_NO_REVERSE)
// element reads only from source elements that are also active under the same mask.
// These instructions perform element rearrangement/broadcasting that may cause
// active destination elements to read from masked-off source elements.
+// Matches names of the form PREFIX[Z|Z128|Z256][rr|ri|rri](k|kz).
#ifndef NOFOLD_SAME_MASK_PREFIX
#define NOFOLD_SAME_MASK_PREFIX(PREFIX)
#endif
>From 69b348e4846370e6e7ef4c1ac5a352e4e925de61 Mon Sep 17 00:00:00 2001
From: azwolski <antoni.zwolski at intel.com>
Date: Fri, 30 Jan 2026 15:09:28 +0100
Subject: [PATCH 4/6] Update llvm/utils/TableGen/X86ManualFoldTables.def
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Mészáros Gergely <maetveis at gmail.com>
---
llvm/utils/TableGen/X86ManualFoldTables.def | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/utils/TableGen/X86ManualFoldTables.def b/llvm/utils/TableGen/X86ManualFoldTables.def
index d1cd3e9d46260..693b3de69ae46 100644
--- a/llvm/utils/TableGen/X86ManualFoldTables.def
+++ b/llvm/utils/TableGen/X86ManualFoldTables.def
@@ -399,4 +399,4 @@ NOFOLD_SAME_MASK(VPERMILPSZ256rik)
NOFOLD_SAME_MASK(VPERMILPSZ256rikz)
NOFOLD_SAME_MASK(VPERMILPSZrik)
NOFOLD_SAME_MASK(VPERMILPSZrikz)
-#undef NOFOLD_SAME_MASK
\ No newline at end of file
+#undef NOFOLD_SAME_MASK
>From 4ac9357c3794e0c3ffa55f4cd1b5d84b6fe35c5d Mon Sep 17 00:00:00 2001
From: Antoni Zwolski <antoni.zwolski at intel.com>
Date: Fri, 30 Jan 2026 16:28:42 +0100
Subject: [PATCH 5/6] [X86] Add missing line to x86-fold-tables.inc
---
llvm/test/TableGen/x86-fold-tables.inc | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index d8aec6d58c449..0b49ba59a7943 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -11195,3 +11195,4 @@ static const unsigned NonFoldableWithSameMaskTable[] = {
X86::VUNPCKLPSZrrk,
X86::VUNPCKLPSZrrkz,
};
+
>From 9f242e4634a2ef78edd535c0f6eb4eeaa6597130 Mon Sep 17 00:00:00 2001
From: azwolski <antoni.zwolski at intel.com>
Date: Thu, 5 Feb 2026 13:56:45 +0100
Subject: [PATCH 6/6] Update llvm/utils/TableGen/X86FoldTablesEmitter.cpp
Co-authored-by: Phoebe Wang <phoebe.wang at intel.com>
---
llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
index 3500f445d240b..a909d4f0378b8 100644
--- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -89,6 +89,8 @@ static bool isNoFoldMaskedInstruction(const CodeGenInstruction *Inst) {
Name = Name.drop_back(4);
else if (Name.ends_with("Z"))
Name = Name.drop_back(1);
+ else
+ return false; // Not an AVX512 instruction
return NoFoldSameMaskPrefixSet.count(Name);
}
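With this final change, a name only qualifies for prefix matching if it decomposes as PREFIX + vector width (Z, Z128, or Z256) + optional operand form (rr, ri, or rri) + mask suffix (k or kz). A standalone sketch of that decomposition in plain C++ (illustrative only; the emitter itself uses llvm::StringRef, and the exact-name NoFoldSameMaskSet check that precedes the stripping is omitted here):

#include <iostream>
#include <string>

// Strip suffixes in the same order as isNoFoldMaskedInstruction: mask
// suffix first, then operand form, then vector width.
static std::string stripToPrefix(std::string Name) {
  auto drop = [&Name](const std::string &Suf) {
    if (Name.size() >= Suf.size() &&
        Name.compare(Name.size() - Suf.size(), Suf.size(), Suf) == 0) {
      Name.resize(Name.size() - Suf.size());
      return true;
    }
    return false;
  };
  if (!drop("kz") && !drop("k"))
    return ""; // not a masked (k-register) form
  if (!drop("rri") && !drop("rr"))
    drop("ri");
  if (!drop("Z128") && !drop("Z256") && !drop("Z"))
    return ""; // not an AVX512 (Z-suffixed) name
  return Name;
}

int main() {
  std::cout << stripToPrefix("VPERMDZ256rrkz") << '\n';  // VPERMD
  std::cout << stripToPrefix("VALIGNDZ128rrik") << '\n'; // VALIGND
  std::cout << stripToPrefix("VPSHUFDZrik") << '\n';     // VPSHUFD
}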