[llvm] [X86][AMX] Check also AMX register live out for copy lowering (PR #93692)

Phoebe Wang via llvm-commits llvm-commits at lists.llvm.org
Wed May 29 07:31:40 PDT 2024


https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/93692

Another bug fix for #83628: X86LowerTileCopy's fast path skipped any basic block with no AMX tile register live in, which could leave tile copies in such blocks unlowered. Also check the block's live-out tile registers before skipping it.
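
For context: the pass scans each block for COPYs of tile registers, and the fast path decides whether that scan can be skipped. Below is a minimal sketch of a live-in/live-out test written with the same LiveRegUnits API the pass already uses; the helper name mayHaveTileCopy is hypothetical and not part of the patch:

    #include "llvm/ADT/BitVector.h"
    #include "llvm/CodeGen/LiveRegUnits.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;

    // Hypothetical helper (illustration only). TILERegs is the allocatable
    // AMX tile register set, built as in X86LowerTileCopy.cpp.
    static bool mayHaveTileCopy(const MachineBasicBlock &MBB,
                                const BitVector &TILERegs,
                                const TargetRegisterInfo &TRI) {
      // A tile value entering the block can be the source of a COPY.
      for (const auto &LI : MBB.liveins())
        if (TILERegs.test(LI.PhysReg))
          return true;
      // A tile value leaving the block can be the result of a COPY. Note
      // that LiveRegUnits::available(Reg) returns true when no unit of Reg
      // is live, so "live out" is the negation.
      LiveRegUnits LiveOuts(TRI);
      LiveOuts.addLiveOuts(MBB);
      for (unsigned RegT : TILERegs.set_bits())
        if (!LiveOuts.available(RegT))
          return true;
      return false;
    }

The patch itself folds this test into the existing HasTileCopy flag rather than introducing a helper.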

From c64a5d9851e91adbdb5a6d662a0f880401e07245 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang at intel.com>
Date: Wed, 29 May 2024 22:17:10 +0800
Subject: [PATCH] [X86][AMX] Check also AMX register live out for copy lowering

Another bug fix for #83628.
---
 llvm/lib/Target/X86/X86LowerTileCopy.cpp    | 14 +++++--
 llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll | 42 ++++++++++++++++-----
 2 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index 60c024556ff13..f27676a27e86c 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -81,7 +81,7 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
   for (MachineBasicBlock &MBB : MF) {
-    // There won't be a tile copy if no tile register live in.
+    // There won't be a tile copy if no tile register is live in or live out.
     bool HasTileCopy = false;
     for (const auto &LI : MBB.liveins()) {
       if (TILERegs.test(LI.PhysReg)) {
@@ -89,10 +89,18 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
         break;
       }
     }
-    if (!HasTileCopy)
-      continue;
     LiveRegUnits UsedRegs(*TRI);
     UsedRegs.addLiveOuts(MBB);
+    if (!HasTileCopy) {
+      for (auto RegT : TILERegs.set_bits()) {
+        if (UsedRegs.available(RegT)) {
+          HasTileCopy = true;
+          break;
+        }
+      }
+    }
+    if (!HasTileCopy)
+      continue;
     for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
       UsedRegs.stepBackward(MI);
       if (!MI.isCopy())
diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
index 7511e5953dac1..15e7136f4a503 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -52,14 +52,11 @@ declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_
 declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
 
-define void @PR90954(ptr %0, ptr %1, i32 %2) {
+define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
 ; CHECK-LABEL: PR90954:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushq %rbp
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbp, -16
 ; CHECK-NEXT:    movq %rsp, %rbp
-; CHECK-NEXT:    .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    pushq %r13
@@ -67,11 +64,6 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    andq $-1024, %rsp # imm = 0xFC00
 ; CHECK-NEXT:    subq $5120, %rsp # imm = 0x1400
-; CHECK-NEXT:    .cfi_offset %rbx, -56
-; CHECK-NEXT:    .cfi_offset %r12, -48
-; CHECK-NEXT:    .cfi_offset %r13, -40
-; CHECK-NEXT:    .cfi_offset %r14, -32
-; CHECK-NEXT:    .cfi_offset %r15, -24
 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
@@ -202,5 +194,37 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
   br label %6
 }
 
+define void @multi_use() nounwind {
+; CHECK-LABEL: multi_use:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    subq $2928, %rsp # imm = 0xB70
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $64, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $64, %ax
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $16, %cx
+; CHECK-NEXT:    tilezero %tmm0
+; CHECK-NEXT:    movabsq $64, %rbp
+; CHECK-NEXT:    tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
+; CHECK-NEXT:    tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload
+; CHECK-NEXT:    tdpbf16ps %tmm0, %tmm0, %tmm1
+; CHECK-NEXT:    tdpbf16ps %tmm0, %tmm0, %tmm0
+; CHECK-NEXT:    addq $2928, %rsp # imm = 0xB70
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
+  %2 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
+  %3 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
+  ret void
+}
+
 declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
 declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)


