[llvm] [X86][CodeGen] Support folding memory broadcast in X86InstrInfo::foldMemoryOperandImpl (PR #79761)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 28 09:13:25 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Shengchen Kan (KanRobert)
---
Patch is 360.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/79761.diff
9 Files Affected:
- (modified) llvm/lib/Target/X86/X86FixupVectorConstants.cpp (+2-2)
- (modified) llvm/lib/Target/X86/X86InstrAVX512.td (+1-1)
- (modified) llvm/lib/Target/X86/X86InstrFoldTables.cpp (+20-3)
- (modified) llvm/lib/Target/X86/X86InstrFoldTables.h (+7-1)
- (modified) llvm/lib/Target/X86/X86InstrInfo.cpp (+152-4)
- (modified) llvm/lib/Target/X86/X86InstrInfo.h (+7)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll (+179-185)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll (+778-792)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll (+644-636)
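For context before the diff: "folding a memory broadcast" here means rewriting a standalone EVEX broadcast load that feeds a vector instruction into that instruction's embedded-broadcast memory form, so the separate load (and the register it occupied) disappears. A minimal before/after sketch of the intended effect; the registers and address operand are illustrative, not taken from the patch:

```asm
# before: the broadcast is a separate instruction and ties up a register
vpbroadcastd (%rdi), %zmm1            # splat a 32-bit element into zmm1
vpaddd       %zmm1, %zmm0, %zmm0      # consume the splatted value
# after: the load is folded as an EVEX embedded-broadcast memory operand
vpaddd       (%rdi){1to16}, %zmm0, %zmm0
```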
``````````diff
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index d4af94c7f92ee7..037a745d632fbc 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -406,14 +406,14 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned OpNoBcst32 = 0, OpNoBcst64 = 0;
if (OpSrc32) {
if (const X86FoldTableEntry *Mem2Bcst =
- llvm::lookupBroadcastFoldTable(OpSrc32, 32)) {
+ llvm::lookupBroadcastFoldTableBySize(OpSrc32, 32)) {
OpBcst32 = Mem2Bcst->DstOp;
OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
}
}
if (OpSrc64) {
if (const X86FoldTableEntry *Mem2Bcst =
- llvm::lookupBroadcastFoldTable(OpSrc64, 64)) {
+ llvm::lookupBroadcastFoldTableBySize(OpSrc64, 64)) {
OpBcst64 = Mem2Bcst->DstOp;
OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
}
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index bb5e22c7142793..b588f660e2744e 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1067,7 +1067,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
MaskInfo.RC:$src0))],
DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_K, Sched<[SchedRR]>;
- let hasSideEffects = 0, mayLoad = 1 in
+ let hasSideEffects = 0, mayLoad = 1, isReMaterializable = 1, canFoldAsLoad = 1 in
def rm : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
(ins SrcInfo.ScalarMemOp:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 63136af2295f4b..b27936d381b6ee 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -145,6 +145,23 @@ llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
return lookupFoldTableImpl(FoldTable, RegOp);
}
+const X86FoldTableEntry *
+llvm::lookupBroadcastFoldTable(unsigned RegOp, unsigned OpNum) {
+ ArrayRef<X86FoldTableEntry> FoldTable;
+ if (OpNum == 1)
+ FoldTable = ArrayRef(BroadcastTable1);
+ else if (OpNum == 2)
+ FoldTable = ArrayRef(BroadcastTable2);
+ else if (OpNum == 3)
+ FoldTable = ArrayRef(BroadcastTable3);
+ else if (OpNum == 4)
+ FoldTable = ArrayRef(BroadcastTable4);
+ else
+ return nullptr;
+
+ return lookupFoldTableImpl(FoldTable, RegOp);
+}
+
namespace {
// This class stores the memory unfolding tables. It is instantiated as a
@@ -288,8 +305,8 @@ struct X86BroadcastFoldTable {
};
} // namespace
-static bool matchBroadcastSize(const X86FoldTableEntry &Entry,
- unsigned BroadcastBits) {
+bool llvm::matchBroadcastSize(const X86FoldTableEntry &Entry,
+ unsigned BroadcastBits) {
switch (Entry.Flags & TB_BCAST_MASK) {
case TB_BCAST_W:
case TB_BCAST_SH:
@@ -305,7 +322,7 @@ static bool matchBroadcastSize(const X86FoldTableEntry &Entry,
}
const X86FoldTableEntry *
-llvm::lookupBroadcastFoldTable(unsigned MemOp, unsigned BroadcastBits) {
+llvm::lookupBroadcastFoldTableBySize(unsigned MemOp, unsigned BroadcastBits) {
static X86BroadcastFoldTable BroadcastFoldTable;
auto &Table = BroadcastFoldTable.Table;
for (auto I = llvm::lower_bound(Table, MemOp);
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.h b/llvm/lib/Target/X86/X86InstrFoldTables.h
index e3890d6aa8eb0f..5fb5b17ef6125a 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.h
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -44,14 +44,20 @@ const X86FoldTableEntry *lookupTwoAddrFoldTable(unsigned RegOp);
// operand OpNum.
const X86FoldTableEntry *lookupFoldTable(unsigned RegOp, unsigned OpNum);
+// Look up the broadcast folding table entry for folding a broadcast with
+// operand OpNum.
+const X86FoldTableEntry *lookupBroadcastFoldTable(unsigned RegOp,
+ unsigned OpNum);
+
// Look up the memory unfolding table entry for this instruction.
const X86FoldTableEntry *lookupUnfoldTable(unsigned MemOp);
// Look up the broadcast folding table entry for this instruction from
// the regular memory instruction.
-const X86FoldTableEntry *lookupBroadcastFoldTable(unsigned MemOp,
+const X86FoldTableEntry *lookupBroadcastFoldTableBySize(unsigned MemOp,
unsigned BroadcastBits);
+bool matchBroadcastSize(const X86FoldTableEntry &Entry, unsigned BroadcastBits);
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 9a95464287c5dc..e71407b727b644 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -862,6 +862,28 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
// AVX-512
+ case X86::VPBROADCASTBZ128rm:
+ case X86::VPBROADCASTBZ256rm:
+ case X86::VPBROADCASTBZrm:
+ case X86::VBROADCASTF32X2Z256rm:
+ case X86::VBROADCASTF32X2Zrm:
+ case X86::VBROADCASTI32X2Z128rm:
+ case X86::VBROADCASTI32X2Z256rm:
+ case X86::VBROADCASTI32X2Zrm:
+ case X86::VPBROADCASTWZ128rm:
+ case X86::VPBROADCASTWZ256rm:
+ case X86::VPBROADCASTWZrm:
+ case X86::VPBROADCASTDZ128rm:
+ case X86::VPBROADCASTDZ256rm:
+ case X86::VPBROADCASTDZrm:
+ case X86::VBROADCASTSSZ128rm:
+ case X86::VBROADCASTSSZ256rm:
+ case X86::VBROADCASTSSZrm:
+ case X86::VPBROADCASTQZ128rm:
+ case X86::VPBROADCASTQZ256rm:
+ case X86::VPBROADCASTQZrm:
+ case X86::VBROADCASTSDZ256rm:
+ case X86::VBROADCASTSDZrm:
case X86::VMOVSSZrm:
case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
@@ -8063,6 +8085,39 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MOs.push_back(MachineOperand::CreateReg(0, false));
break;
}
+ case X86::VPBROADCASTBZ128rm:
+ case X86::VPBROADCASTBZ256rm:
+ case X86::VPBROADCASTBZrm:
+ case X86::VBROADCASTF32X2Z256rm:
+ case X86::VBROADCASTF32X2Zrm:
+ case X86::VBROADCASTI32X2Z128rm:
+ case X86::VBROADCASTI32X2Z256rm:
+ case X86::VBROADCASTI32X2Zrm:
+ // No instructions currently fuse with 8bits or 32bits x 2.
+ return nullptr;
+
+#define FOLD_BROADCAST(SIZE) \
+ MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \
+ LoadMI.operands_begin() + NumOps); \
+ return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \
+ Alignment, /*AllowCommute=*/true);
+ case X86::VPBROADCASTWZ128rm:
+ case X86::VPBROADCASTWZ256rm:
+ case X86::VPBROADCASTWZrm:
+ FOLD_BROADCAST(16);
+ case X86::VPBROADCASTDZ128rm:
+ case X86::VPBROADCASTDZ256rm:
+ case X86::VPBROADCASTDZrm:
+ case X86::VBROADCASTSSZ128rm:
+ case X86::VBROADCASTSSZ256rm:
+ case X86::VBROADCASTSSZrm:
+ FOLD_BROADCAST(32);
+ case X86::VPBROADCASTQZ128rm:
+ case X86::VPBROADCASTQZ256rm:
+ case X86::VPBROADCASTQZrm:
+ case X86::VBROADCASTSDZ256rm:
+ case X86::VBROADCASTSDZrm:
+ FOLD_BROADCAST(64);
default: {
if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
return nullptr;
@@ -8077,6 +8132,80 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
/*Size=*/0, Alignment, /*AllowCommute=*/true);
}
+MachineInstr *X86InstrInfo::foldMemoryBroadcast(
+ MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned BitsSize, Align Alignment, bool AllowCommute) const {
+
+ const X86FoldTableEntry *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum);
+
+ if (I)
+ return matchBroadcastSize(*I, BitsSize)
+ ? FuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
+ : nullptr;
+
+ // TODO: Share code with foldMemoryOperandImpl for the commute
+ if (AllowCommute) {
+ unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
+ if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+ bool HasDef = MI.getDesc().getNumDefs();
+ Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
+ Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+ Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
+ bool Tied1 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+ bool Tied2 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+ // If either of the commutable operands are tied to the destination
+ // then we can not commute + fold.
+ if ((HasDef && Reg0 == Reg1 && Tied1) ||
+ (HasDef && Reg0 == Reg2 && Tied2))
+ return nullptr;
+
+ MachineInstr *CommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!CommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (CommutedMI != &MI) {
+ // New instruction. We can't fold from this.
+ CommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Attempt to fold with the commuted version of the instruction.
+ MachineInstr *NewMI = foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs,
+ InsertPt, BitsSize, Alignment,
+ /*AllowCommute=*/false);
+ if (NewMI)
+ return NewMI;
+
+ // Folding failed again - undo the commute before returning.
+ MachineInstr *UncommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!UncommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (UncommutedMI != &MI) {
+ // New instruction. It doesn't need to be kept.
+ UncommutedMI->eraseFromParent();
+ return nullptr;
+ }
+
+ // Return here to prevent duplicate fuse failure report.
+ return nullptr;
+ }
+ }
+
+ // No fusion
+ if (PrintFailedFusing && !MI.isCopy())
+ dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
+ return nullptr;
+}
+
static SmallVector<MachineMemOperand *, 2>
extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
SmallVector<MachineMemOperand *, 2> LoadMMOs;
@@ -8130,6 +8259,18 @@ static unsigned getBroadcastOpcode(const X86FoldTableEntry *I,
switch (I->Flags & TB_BCAST_MASK) {
default:
llvm_unreachable("Unexpected broadcast type!");
+ case TB_BCAST_W:
+ switch (SpillSize) {
+ default:
+ llvm_unreachable("Unknown spill size");
+ case 16:
+ return X86::VPBROADCASTWZ128rm;
+ case 32:
+ return X86::VPBROADCASTWZ256rm;
+ case 64:
+ return X86::VPBROADCASTWZrm;
+ }
+ break;
case TB_BCAST_D:
switch (SpillSize) {
default:
@@ -8191,7 +8332,11 @@ bool X86InstrInfo::unfoldMemoryOperand(
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
- bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
+ unsigned BCastType = I->Flags & TB_FOLDED_BCAST;
+ // FIXME: Support TB_BCAST_SH in getBroadcastOpcode?
+ if (BCastType == TB_BCAST_SH)
+ return false;
+
if (UnfoldLoad && !FoldedLoad)
return false;
UnfoldLoad &= FoldedLoad;
@@ -8231,7 +8376,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
unsigned Opc;
- if (FoldedBCast) {
+ if (BCastType) {
Opc = getBroadcastOpcode(I, RC, Subtarget);
} else {
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
@@ -8341,7 +8486,10 @@ bool X86InstrInfo::unfoldMemoryOperand(
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
- bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
+ unsigned BCastType = I->Flags & TB_FOLDED_BCAST;
+ // FIXME: Support TB_BCAST_SH in getBroadcastOpcode?
+ if (BCastType == TB_BCAST_SH)
+ return false;
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -8377,7 +8525,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
// memory access is slow above.
unsigned Opc;
- if (FoldedBCast) {
+ if (BCastType) {
Opc = getBroadcastOpcode(I, RC, Subtarget);
} else {
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 0cb69050656109..3a1f98a005ca3a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -643,6 +643,13 @@ class X86InstrInfo final : public X86GenInstrInfo {
MachineBasicBlock::iterator InsertPt,
unsigned Size, Align Alignment) const;
+ MachineInstr *foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
+ unsigned OpNum,
+ ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
+ unsigned BitsSize, Align Alignment,
+ bool AllowCommute) const;
+
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index c562e9d9a32808..85dd0dcd0d4daf 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -14967,14 +14967,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
; AVX512-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm29
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26
; AVX512-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm20
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22
; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm5
; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -14985,11 +14985,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %xmm7
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm27
+; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm20
; AVX512-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm30 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm28
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm28[0,1,0,2]
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,0,2]
; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
@@ -15006,8 +15006,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm15
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18
; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19
@@ -15017,146 +15017,145 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
; AVX512-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm11[2],ymm3[3,4,5],ymm11[6],ymm3[7]
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm7
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm13
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm22
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpor %ymm1, %ymm8, %ymm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512-FCP-NEXT: ...
[truncated]
``````````
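The new `foldMemoryBroadcast` helper also retries through the commute path when the broadcast feeds an operand position that has no direct fold-table entry. A sketch of that case, again with illustrative registers and a hypothetical address, assuming a commutable user instruction:

```asm
# before: the broadcast feeds the first source, which has no direct fold entry
vpbroadcastd (%rdi), %zmm1
vpaddd       %zmm0, %zmm1, %zmm2
# vpaddd is commutative, so the operands are swapped first and the fold
# then applies to the commuted operand:
vpaddd       (%rdi){1to16}, %zmm0, %zmm2
```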
https://github.com/llvm/llvm-project/pull/79761