[llvm] [X86][AMX] Check also AMX register live out for copy lowering (PR #93692)
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Wed May 29 07:31:40 PDT 2024
https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/93692
Another bug fix for #83628.
From c64a5d9851e91adbdb5a6d662a0f880401e07245 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang at intel.com>
Date: Wed, 29 May 2024 22:17:10 +0800
Subject: [PATCH] [X86][AMX] Check also AMX register live out for copy lowering
Another bug fix for #83628.
---
llvm/lib/Target/X86/X86LowerTileCopy.cpp | 14 +++++--
llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll | 42 ++++++++++++++++-----
2 files changed, 44 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index 60c024556ff13..f27676a27e86c 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -81,7 +81,7 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
- // There won't be a tile copy if no tile register live in.
+ // There won't be a tile copy if no tile register is live in or live out.
bool HasTileCopy = false;
for (const auto &LI : MBB.liveins()) {
if (TILERegs.test(LI.PhysReg)) {
@@ -89,10 +89,18 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
break;
}
}
- if (!HasTileCopy)
- continue;
LiveRegUnits UsedRegs(*TRI);
UsedRegs.addLiveOuts(MBB);
+ if (!HasTileCopy) {
+ for (auto RegT : TILERegs.set_bits()) {
+ if (UsedRegs.available(RegT)) {
+ HasTileCopy = true;
+ break;
+ }
+ }
+ }
+ if (!HasTileCopy)
+ continue;
for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
UsedRegs.stepBackward(MI);
if (!MI.isCopy())
diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
index 7511e5953dac1..15e7136f4a503 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -52,14 +52,11 @@ declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
-define void @PR90954(ptr %0, ptr %1, i32 %2) {
+define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
; CHECK-LABEL: PR90954:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %r13
@@ -67,11 +64,6 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400
-; CHECK-NEXT: .cfi_offset %rbx, -56
-; CHECK-NEXT: .cfi_offset %r12, -48
-; CHECK-NEXT: .cfi_offset %r13, -40
-; CHECK-NEXT: .cfi_offset %r14, -32
-; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
@@ -202,5 +194,37 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
br label %6
}
+define void @multi_use() nounwind {
+; CHECK-LABEL: multi_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $64, %ax
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $16, %cx
+; CHECK-NEXT: tilezero %tmm0
+; CHECK-NEXT: movabsq $64, %rbp
+; CHECK-NEXT: tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
+; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload
+; CHECK-NEXT: tdpbf16ps %tmm0, %tmm0, %tmm1
+; CHECK-NEXT: tdpbf16ps %tmm0, %tmm0, %tmm0
+; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
+ %2 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
+ %3 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
+ ret void
+}
+
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)
More information about the llvm-commits
mailing list