[llvm] [AMX][PreTileConfig] Ensure that the PLDTILECFGV instruction is sunk closer to the tile use instruction. (PR #155673)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 3 11:04:03 PDT 2025


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/155673

From 0662a06bb682d81e109b04ea48ce0b62f199ffdc Mon Sep 17 00:00:00 2001
From: Karthik Senthil <karthik.senthil at intel.com>
Date: Wed, 27 Aug 2025 11:25:51 -0700
Subject: [PATCH] [AMX][PreTileConfig] Ensure that the PLDTILECFGV instruction
 is sunk closer to the tile use instruction.

According to the AMX ABI, tile registers (including the tile config
register) are volatile, so the caller must save and restore the config
register. This is done in X86's FastPreTileConfig pass. Currently the
PLDTILECFGV instruction is emitted immediately after the call, which can
be problematic if the call returns a value in, say, RAX and the AMX tile
is configured using the same register. This PR addresses the issue by
ensuring that PLDTILECFGV is sunk closer to the first instruction after
the call that uses a tile.
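
For illustration, the new placement decision amounts to a forward scan
from the call to the first tile-touching instruction. Below is a minimal
sketch in LLVM's MachineIR C++ API; the helper name findTileConfigPoint
is ours, not the patch's, and HasTileOperand is an existing helper in
X86FastPreTileConfig.cpp (sketched after the hunk further below):

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"

  using namespace llvm;

  // Existing helper in X86FastPreTileConfig.cpp (forward declaration).
  static bool HasTileOperand(MachineRegisterInfo *MRI, MachineInstr &MI);

  // Starting at the call, walk forward to the first instruction that
  // touches a tile register. Inserting PLDTILECFGV there leaves room for
  // the reloads and immediates that rebuild values the call may have
  // clobbered (e.g. a return value in RAX) before the config takes effect.
  static MachineBasicBlock::iterator
  findTileConfigPoint(MachineRegisterInfo *MRI, MachineInstr &CallMI) {
    MachineBasicBlock &MBB = *CallMI.getParent();
    MachineBasicBlock::iterator UseIt = CallMI.getIterator();
    while (UseIt != MBB.end() && !HasTileOperand(MRI, *UseIt))
      ++UseIt;
    return UseIt; // Falls back to MBB.end() if no tile use follows.
  }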
---
 llvm/lib/Target/X86/X86FastPreTileConfig.cpp  |  13 +-
 llvm/test/CodeGen/X86/AMX/amx-across-func.ll  |   4 +-
 .../X86/AMX/amx-sink-config-after-calls.mir   | 151 ++++++++++++++++++
 3 files changed, 164 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/AMX/amx-sink-config-after-calls.mir

diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index d3c239250943e..787b71d425cb3 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -564,8 +564,17 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
       MachineBasicBlock::iterator I;
       if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
         I = ++LastShapeMI->getIterator();
-      else
-        I = ++MI.getIterator();
+      else {
+        // A call can overwrite registers such as RAX, so make sure the
+        // tile config instruction is sunk closer to the first tile use.
+        auto UseIt = MI.getIterator();
+        while (UseIt != MBB.end()) {
+          if (HasTileOperand(MRI, *UseIt))
+            break;
+          ++UseIt;
+        }
+        I = UseIt;
+      }
       Config(*I);
       HasUnconfigTile = false;
       continue;
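
For context, the two helpers this hunk relies on, HasTileOperand and
dominates, are defined earlier in X86FastPreTileConfig.cpp. Rough sketches
of their behavior, inferred from the call sites rather than copied
verbatim:

  // Sketch of HasTileOperand: an instruction "has a tile operand" if any
  // register operand is a virtual register of the TILE register class
  // (X86::TILERegClassID comes from the generated X86 register info).
  static bool HasTileOperand(MachineRegisterInfo *MRI, MachineInstr &MI) {
    for (const MachineOperand &MO : MI.operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (Reg.isVirtual() &&
          MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
        return true;
    }
    return false;
  }

  // Sketch of dominates: within a single basic block, "A dominates B"
  // reduces to "A appears before B", decidable with one forward scan.
  static bool dominates(MachineBasicBlock &MBB,
                        MachineBasicBlock::const_iterator A,
                        MachineBasicBlock::const_iterator B) {
    MachineBasicBlock::const_iterator It = MBB.begin();
    while (It != MBB.end() && It != A && It != B)
      ++It;
    return It == A;
  }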
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
index 320c96535abba..2bda8db040296 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -139,12 +139,12 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; O0-NEXT:    callq foo
 ; O0-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
 ; O0-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; O0-NEXT:    movl $32, %esi
+; O0-NEXT:    movl $buf+2048, %edx
 ; O0-NEXT:    # implicit-def: $al
 ; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
 ; O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
 ; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; O0-NEXT:    movl $32, %esi
-; O0-NEXT:    movl $buf+2048, %edx
 ; O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
 ; O0-NEXT:    movl $64, %esi
 ; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
diff --git a/llvm/test/CodeGen/X86/AMX/amx-sink-config-after-calls.mir b/llvm/test/CodeGen/X86/AMX/amx-sink-config-after-calls.mir
new file mode 100644
index 0000000000000..b76faf6cbb88b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-sink-config-after-calls.mir
@@ -0,0 +1,151 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=x86_64-- -mattr=+amx-int8,avx512f -run-pass="fastpretileconfig,regallocfast,fasttileconfig" -o - %s | FileCheck %s
+
+# Test to verify that ldtilecfg instructions are sunk closer to tile-defining
+# instructions after a call. This ensures the call does not overwrite values
+# in registers used to configure the AMX tile.
+
+...
+---
+name:            test_api
+alignment:       16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 1, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 2, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 3, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: tile, preferred-register: '', flags: [  ] }
+  - { id: 5, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 6, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 9, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 10, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 13, class: tile, preferred-register: '', flags: [  ] }
+  - { id: 14, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 15, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 18, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 19, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 22, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 23, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 24, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 25, class: tile, preferred-register: '', flags: [  ] }
+  - { id: 26, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 29, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 30, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 33, class: tile, preferred-register: '', flags: [  ] }
+  - { id: 34, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 35, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 38, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 39, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 40, class: gr16, preferred-register: '', flags: [  ] }
+  - { id: 41, class: gr16, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%2' }
+frameInfo:
+  maxAlignment:    1024
+stack:
+  - { id: 0, size: 1024, alignment: 1024 }
+  - { id: 1, size: 1024, alignment: 1024 }
+  - { id: 2, size: 32, alignment: 32 }
+  - { id: 3, size: 32, alignment: 32 }
+  - { id: 4, size: 8, alignment: 8 }
+machineFunctionInfo:
+  amxProgModel:    ManagedRA
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi
+
+    ; CHECK-LABEL: name: test_api
+    ; CHECK: liveins: $rdi, $rsi
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0
+    ; CHECK-NEXT: VMOVUPSZmr %stack.5, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.5, align 4)
+    ; CHECK-NEXT: MOV8mi %stack.5, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.5, align 4)
+    ; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, $rsi :: (store (s64) into %stack.8)
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 16
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.2, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: renamable $cx = MOV16ri 16
+    ; CHECK-NEXT: MOV16mr %stack.7, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.7)
+    ; CHECK-NEXT: renamable $ax = MOV16ri 2
+    ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $ax :: (store (s16) into %stack.6)
+    ; CHECK-NEXT: $al = IMPLICIT_DEF
+    ; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
+    ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
+    ; CHECK-NEXT: $al = IMPLICIT_DEF
+    ; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
+    ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
+    ; CHECK-NEXT: PLDTILECFGV %stack.5, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.5, align 4)
+    ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 64
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.1, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 64
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.1, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+    ; CHECK-NEXT: renamable $rdx = MOV32ri64 16
+    ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, killed renamable $rdi, 1, killed renamable $rdx, 0, $noreg, killed renamable $tmm0
+    ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def $rsp, implicit-def dead $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
+    ; CHECK-NEXT: CALL64pcrel32 &foo, csr_64, implicit $rsp, implicit $ssp, implicit-def $rax
+    ; CHECK-NEXT: $rsi = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8)
+    ; CHECK-NEXT: $cx = MOV16rm %stack.7, 1, $noreg, 0, $noreg :: (load (s16) from %stack.7)
+    ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def $rsp, implicit-def dead $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
+    ; CHECK-NEXT: renamable $rdx = COPY $rax
+    ; CHECK-NEXT: $ax = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
+    ; CHECK-NEXT: MOV64mr killed renamable $rsi, 1, $noreg, 0, $noreg, killed renamable $rdx
+    ; CHECK-NEXT: renamable $rdx = MOV64rm %stack.4, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 16
+    ; CHECK-NEXT: $al = IMPLICIT_DEF
+    ; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
+    ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
+    ; CHECK-NEXT: $al = IMPLICIT_DEF
+    ; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
+    ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
+    ; CHECK-NEXT: PLDTILECFGV %stack.5, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.5, align 4)
+    ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 64
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 64
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 16
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.4, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+    ; CHECK-NEXT: RET64
+    %2:gr64 = COPY $rsi
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY killed %0
+    %3:gr64 = COPY killed %2
+    %38:gr64_nosp = MOV32ri64 16
+    %39:gr64 = LEA64r %stack.2, 1, $noreg, 0, $noreg
+    %40:gr16 = MOV16ri 16
+    %41:gr16 = MOV16ri 2
+    %33:tile = PTILELOADDV %41:gr16, %40:gr16, killed %39, 1, killed %38, 0, $noreg
+    %34:gr64_nosp = MOV32ri64 64
+    %35:gr64 = LEA64r %stack.1, 1, $noreg, 0, $noreg
+    PTILESTOREDV %41:gr16, %40:gr16, killed %35, 1, killed %34, 0, $noreg, %33
+    %29:gr64_nosp = MOV32ri64 64
+    %30:gr64 = LEA64r %stack.1, 1, $noreg, 0, $noreg
+    %25:tile = PTILELOADDV %41:gr16, %40:gr16, killed %30, 1, killed %29, 0, $noreg
+    %26:gr64_nosp = MOV32ri64 16
+    PTILESTOREDV %41:gr16, %40:gr16, %1, 1, killed %26, 0, $noreg, %25
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def $rsp, implicit-def $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 &foo, csr_64, implicit $rsp, implicit $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def $rsp, implicit-def $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
+    %24:gr64 = COPY $rax
+    MOV64mr %3, 1, $noreg, 0, $noreg, %24
+    %22:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg
+    %19:gr64_nosp = MOV32ri64 16
+    %13:tile = PTILELOADDV %41:gr16, %40:gr16, %22, 1, killed %19, 0, $noreg
+    %14:gr64_nosp = MOV32ri64 64
+    %15:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    PTILESTOREDV %41:gr16, %40:gr16, killed %15, 1, killed %14, 0, $noreg, %13
+    %9:gr64_nosp = MOV32ri64 64
+    %10:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    %4:tile = PTILELOADDV %41:gr16, %40:gr16, killed %10, 1, killed %9, 0, $noreg
+    %5:gr64_nosp = MOV32ri64 16
+    %6:gr64 = LEA64r %stack.4, 1, $noreg, 0, $noreg
+    PTILESTOREDV %41:gr16, %40:gr16, killed %6, 1, killed %5, 0, $noreg, %4
+    RET64
+...
