[llvm] ad9091c - [X86] Allow PTILEZEROV and PTILELOADDV to be rematerializable
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 26 05:56:11 PST 2021
Author: Wang, Pengfei
Date: 2021-02-26T21:55:59+08:00
New Revision: ad9091c5fa9f3749496d1a8ce8481f0011ed40da
URL: https://github.com/llvm/llvm-project/commit/ad9091c5fa9f3749496d1a8ce8481f0011ed40da
DIFF: https://github.com/llvm/llvm-project/commit/ad9091c5fa9f3749496d1a8ce8481f0011ed40da.diff
LOG: [X86] Allow PTILEZEROV and PTILELOADDV to be rematerializable
Spilling and reloading AMX registers are expensive. We allow PTILEZEROV
and PTILELOADDV to be rematerializable to avoid the register spilling.
Reviewed By: LuoYuanke
Differential Revision: https://reviews.llvm.org/D97453
Added:
Modified:
llvm/lib/Target/X86/X86InstrAMX.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/test/CodeGen/X86/AMX/amx-across-func.ll
llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 6731599b909f..cc31c322c12b 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -48,13 +48,15 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
VEX, T8XD;
// Pseduo instruction for RA.
- def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ let isReMaterializable = 1, canFoldAsLoad = 1 in
+ def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3), []>;
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
GR16:$src2, opaquemem:$src3,
TILE:$src4), []>;
- def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+ def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
GR16:$src2), []>;
let usesCustomInserter = 1 in {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b1a14febe4ba..745bf435b0c2 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1006,6 +1006,8 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::MOV64ri:
case X86::MOV64ri32:
case X86::MOV8ri:
+ case X86::PTILEZEROV:
+ case X86::PTILELOADDV:
return true;
case X86::MOV8rm:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
index b687d03f92ba..fa097164fdc1 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -131,13 +131,10 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0
-; CHECK-NEXT: movabsq $64, %rax
-; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movabsq $64, %rax
-; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
+; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0
; CHECK-NEXT: tilestored %tmm0, (%r13,%r12)
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
index 5fcfa2f295e7..1273eb248668 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
@@ -95,7 +95,7 @@ define dso_local void @test2(i8 *%buf) nounwind {
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
@@ -105,11 +105,9 @@ define dso_local void @test2(i8 *%buf) nounwind {
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, %r14w
-; CHECK-NEXT: tilezero %tmm3
+; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_3
@@ -120,28 +118,20 @@ define dso_local void @test2(i8 *%buf) nounwind {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movabsq $64, %rax
-; CHECK-NEXT: tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movabsq $64, %rax
-; CHECK-NEXT: tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT: tilezero %tmm2
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
-; CHECK-NEXT: # implicit-def: $rax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movabsq $64, %rax
-; CHECK-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
-; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
; CHECK-NEXT: incl %ebp
; CHECK-NEXT: cmpw $100, %bp
; CHECK-NEXT: jl .LBB1_2
; CHECK-NEXT: .LBB1_3: # %exit
-; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
index 57b67c456b36..6964cc4c2d21 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -120,7 +120,7 @@ define dso_local void @test3(i8 *%buf) nounwind {
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $3032, %rsp # imm = 0xBD8
+; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
@@ -143,10 +143,9 @@ define dso_local void @test3(i8 *%buf) nounwind {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movabsq $64, %rax
-; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
@@ -155,13 +154,12 @@ define dso_local void @test3(i8 *%buf) nounwind {
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
-; CHECK-NEXT: movabsq $64, %rax
-; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
+; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: incl %ebp
; CHECK-NEXT: cmpw $100, %bp
; CHECK-NEXT: jl .LBB1_2
; CHECK-NEXT: .LBB1_3: # %exit
-; CHECK-NEXT: addq $3032, %rsp # imm = 0xBD8
+; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
More information about the llvm-commits
mailing list