[llvm] [WIP][PowerPC] Add phony subregisters to cover the high half of the VSX registers. (PR #94628)

Thu Jun 6 08:15:40 PDT 2024

https://github.com/stefanp-ibm created https://github.com/llvm/llvm-project/pull/94628

On PowerPC we have VSX registers which overlap with floating point registers.
However, the floating point registers only overlap with half of each VSX
register while the other half is never used alone. This patch adds phony
registers for the other half of the VSX registers in order to fully cover them
and to make sure that the lane masks are not the same for the VSX and the
floating point register.

Note: This patch is still Work in Progress as there are a number of LIT
failures that need to be investigated.


>From eaaad1691cc732c4c4e53197d2d629c1bb5dcb74 Mon Sep 17 00:00:00 2001
From: Stefan Pintilie <stefanp at ca.ibm.com>
Date: Thu, 6 Jun 2024 10:07:00 -0500
Subject: [PATCH] [WIP][PowerPC] Add phony subregisters to cover the high half
 of the VSX registers.

On PowerPC we have VSX registers which overlap with floating point registers.
However, the floating point registers only overlap with half of each VSX
register while the other half is never used alone. This patch adds phony
registers for the other half of the VSX registers in order to fully cover them
and to make sure that the lane masks are not the same for the VSX and the
floating point register.

Note: This patch is still Work in Progress as there are a number of LIT
failures that need to be investigated.
---
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp   | 86 ++++++++++++++++++-
 llvm/lib/Target/PowerPC/PPCRegisterInfo.td    | 31 +++++--
 llvm/test/CodeGen/PowerPC/frem.ll             |  5 --
 .../test/CodeGen/PowerPC/subreg-lanemasks.mir | 63 ++++++++++++++
 4 files changed, 170 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir

diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9e8da59615dfb..a2d089f32cabf 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -435,7 +435,91 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     }
   }
 
-  assert(checkAllSuperRegsMarked(Reserved));
+  // Mark phony regsiters for the VSR high bits as reserved so that they are
+  // not used.
+  Reserved.set(PPC::FH0);
+  Reserved.set(PPC::FH1);
+  Reserved.set(PPC::FH2);
+  Reserved.set(PPC::FH3);
+  Reserved.set(PPC::FH4);
+  Reserved.set(PPC::FH5);
+  Reserved.set(PPC::FH6);
+  Reserved.set(PPC::FH7);
+  Reserved.set(PPC::FH8);
+  Reserved.set(PPC::FH9);
+  Reserved.set(PPC::FH10);
+  Reserved.set(PPC::FH11);
+  Reserved.set(PPC::FH12);
+  Reserved.set(PPC::FH13);
+  Reserved.set(PPC::FH14);
+  Reserved.set(PPC::FH15);
+  Reserved.set(PPC::FH16);
+  Reserved.set(PPC::FH17);
+  Reserved.set(PPC::FH18);
+  Reserved.set(PPC::FH19);
+  Reserved.set(PPC::FH20);
+  Reserved.set(PPC::FH21);
+  Reserved.set(PPC::FH22);
+  Reserved.set(PPC::FH23);
+  Reserved.set(PPC::FH24);
+  Reserved.set(PPC::FH25);
+  Reserved.set(PPC::FH26);
+  Reserved.set(PPC::FH27);
+  Reserved.set(PPC::FH28);
+  Reserved.set(PPC::FH29);
+  Reserved.set(PPC::FH30);
+  Reserved.set(PPC::FH31);
+
+  Reserved.set(PPC::VFH0);
+  Reserved.set(PPC::VFH1);
+  Reserved.set(PPC::VFH2);
+  Reserved.set(PPC::VFH3);
+  Reserved.set(PPC::VFH4);
+  Reserved.set(PPC::VFH5);
+  Reserved.set(PPC::VFH6);
+  Reserved.set(PPC::VFH7);
+  Reserved.set(PPC::VFH8);
+  Reserved.set(PPC::VFH9);
+  Reserved.set(PPC::VFH10);
+  Reserved.set(PPC::VFH11);
+  Reserved.set(PPC::VFH12);
+  Reserved.set(PPC::VFH13);
+  Reserved.set(PPC::VFH14);
+  Reserved.set(PPC::VFH15);
+  Reserved.set(PPC::VFH16);
+  Reserved.set(PPC::VFH17);
+  Reserved.set(PPC::VFH18);
+  Reserved.set(PPC::VFH19);
+  Reserved.set(PPC::VFH20);
+  Reserved.set(PPC::VFH21);
+  Reserved.set(PPC::VFH22);
+  Reserved.set(PPC::VFH23);
+  Reserved.set(PPC::VFH24);
+  Reserved.set(PPC::VFH25);
+  Reserved.set(PPC::VFH26);
+  Reserved.set(PPC::VFH27);
+  Reserved.set(PPC::VFH28);
+  Reserved.set(PPC::VFH29);
+  Reserved.set(PPC::VFH30);
+  Reserved.set(PPC::VFH31);
+
+  assert(checkAllSuperRegsMarked(Reserved,
+			         {PPC::FH0,  PPC::FH1,  PPC::FH2,  PPC::FH3,
+				  PPC::FH4,  PPC::FH5,  PPC::FH6,  PPC::FH7,
+				  PPC::FH8,  PPC::FH9,  PPC::FH10, PPC::FH11,
+				  PPC::FH12, PPC::FH13, PPC::FH14, PPC::FH15, 
+				  PPC::FH16, PPC::FH17, PPC::FH18, PPC::FH19,
+				  PPC::FH20, PPC::FH21, PPC::FH22, PPC::FH23,
+				  PPC::FH24, PPC::FH25, PPC::FH26, PPC::FH27,
+				  PPC::FH28, PPC::FH29, PPC::FH30, PPC::FH31, 
+				  PPC::VFH0,  PPC::VFH1,  PPC::VFH2,  PPC::VFH3,
+				  PPC::VFH4,  PPC::VFH5,  PPC::VFH6,  PPC::VFH7,
+				  PPC::VFH8,  PPC::VFH9,  PPC::VFH10, PPC::VFH11,
+				  PPC::VFH12, PPC::VFH13, PPC::VFH14, PPC::VFH15, 
+				  PPC::VFH16, PPC::VFH17, PPC::VFH18, PPC::VFH19,
+				  PPC::VFH20, PPC::VFH21, PPC::VFH22, PPC::VFH23,
+				  PPC::VFH24, PPC::VFH25, PPC::VFH26, PPC::VFH27,
+				  PPC::VFH28, PPC::VFH29, PPC::VFH30, PPC::VFH31}));
   return Reserved;
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index 8a37e40414eee..30e936a157e01 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -17,6 +17,7 @@ def sub_un : SubRegIndex<1, 3>;
 def sub_32 : SubRegIndex<32>;
 def sub_32_hi_phony : SubRegIndex<32,32>;
 def sub_64 : SubRegIndex<64>;
+def sub_64_hi_phony : SubRegIndex<64,64>;
 def sub_vsx0 : SubRegIndex<128>;
 def sub_vsx1 : SubRegIndex<128, 128>;
 def sub_gp8_x0 : SubRegIndex<64>;
@@ -77,19 +78,19 @@ class VF<bits<5> num, string n> : PPCReg<n> {
 }
 
 // VR - One of the 32 128-bit vector registers
-class VR<VF SubReg, string n> : PPCReg<n> {
+class VR<VF SubReg, VF SubRegH, string n> : PPCReg<n> {
   let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
   let HWEncoding{5} = 0;
-  let SubRegs = [SubReg];
-  let SubRegIndices = [sub_64];
+  let SubRegs = [SubReg, SubRegH];
+  let SubRegIndices = [sub_64, sub_64_hi_phony];
 }
 
 // VSRL - One of the 32 128-bit VSX registers that overlap with the scalar
 // floating-point registers.
-class VSRL<FPR SubReg, string n> : PPCReg<n> {
+class VSRL<FPR SubReg, FPR SubRegH, string n> : PPCReg<n> {
   let HWEncoding = SubReg.HWEncoding;
-  let SubRegs = [SubReg];
-  let SubRegIndices = [sub_64];
+  let SubRegs = [SubReg, SubRegH];
+  let SubRegIndices = [sub_64, sub_64_hi_phony];
 }
 
 // VSXReg - One of the VSX registers in the range vs32-vs63 with numbering
@@ -155,6 +156,12 @@ foreach Index = 0-31 in {
                 DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
 }
 
+let isArtificial = 1 in {
+  foreach Index = 0-31 in {
+    def FH#Index : FPR<-1, "">;
+  }
+}
+
 // Floating-point pair registers
 foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in {
   def Fpair#Index : FPPair<"fp"#Index, Index>;
@@ -168,15 +175,21 @@ foreach Index = 0-31 in {
                  DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
 }
 
+let isArtificial = 1 in {
+  foreach Index = 0-31 in {
+    def VFH#Index : VF<-1, "">;
+  }
+}
+
 // Vector registers
 foreach Index = 0-31 in {
-  def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
-                DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
+  def V#Index : VR<!cast<VF>("VF"#Index), !cast<VF>("VFH"#Index), "v"#Index>,
+                DwarfRegNum<[!add(Index, 77)]>;
 }
 
 // VSX registers
 foreach Index = 0-31 in {
-  def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
+  def VSL#Index : VSRL<!cast<FPR>("F"#Index), !cast<FPR>("FH"#Index), "vs"#Index>,
                   DwarfRegAlias<!cast<FPR>("F"#Index)>;
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/frem.ll b/llvm/test/CodeGen/PowerPC/frem.ll
index 8cb68e60f7f9b..19b4b1c9cdf95 100644
--- a/llvm/test/CodeGen/PowerPC/frem.ll
+++ b/llvm/test/CodeGen/PowerPC/frem.ll
@@ -70,7 +70,6 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    xscvspdpn 2, 0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxmrghd 0, 1, 61
 ; CHECK-NEXT:    xscvspdpn 1, 62
 ; CHECK-NEXT:    xscvspdpn 2, 63
@@ -84,7 +83,6 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    xscvspdpn 2, 0
 ; CHECK-NEXT:    bl fmodf
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxmrghd 0, 61, 1
 ; CHECK-NEXT:    lxv 63, 80(1) # 16-byte Folded Reload
 ; CHECK-NEXT:    lxv 62, 64(1) # 16-byte Folded Reload
@@ -124,11 +122,8 @@ define <2 x double> @frem2x64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-NEXT:    xscpsgndp 61, 1, 1
 ; CHECK-NEXT:    xxswapd 1, 62
 ; CHECK-NEXT:    xxswapd 2, 63
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; CHECK-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
 ; CHECK-NEXT:    bl fmod
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
 ; CHECK-NEXT:    xxmrghd 34, 61, 1
 ; CHECK-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
 ; CHECK-NEXT:    lxv 62, 48(1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir b/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
new file mode 100644
index 0000000000000..28a4d0347f105
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
@@ -0,0 +1,63 @@
+# RUN: llc -mcpu=pwr10 -O3 -ppc-track-subreg-liveness -verify-machineinstrs \
+# RUN:   -mtriple=powerpc64le-unknown-linux-gnu -run-pass=greedy,virtregrewriter \
+# RUN:   -debug-only=regalloc -o - %s 2>&1 >/dev/null | FileCheck %s
+
+# Keep track of all of the lanemasks for various subregsiters.
+#
+# CHECK: %3 [80r,80d:0) 0 at 80r  L000000000000000C [80r,80d:0) 0 at 80r  weight:0.000000e+00
+# CHECK: %4 [96r,96d:0) 0 at 96r  L0000000000003000 [96r,96d:0) 0 at 96r  weight:0.000000e+00
+# CHECK: %5 [112r,112d:0) 0 at 112r  L000000000000000C [112r,112d:0) 0 at 112r  weight:0.000000e+00
+# CHECK: %6 [128r,128d:0) 0 at 128r  L0000000000003000 [128r,128d:0) 0 at 128r  weight:0.000000e+00
+# CHECK: %7 [144r,144d:0) 0 at 144r  L0000000000000004 [144r,144d:0) 0 at 144r  weight:0.000000e+00
+# CHECK: %8 [160r,160d:0) 0 at 160r  L0000000000001000 [160r,160d:0) 0 at 160r  weight:0.000000e+00
+# CHECK: %9 [176r,176d:0) 0 at 176r  L0000000000000004 [176r,176d:0) 0 at 176r  weight:0.000000e+00
+# CHECK: %10 [192r,192d:0) 0 at 192r  L0000000000001000 [192r,192d:0) 0 at 192r  weight:0.000000e+00
+# CHECK: %11 [208r,208d:0) 0 at 208r  L0000000000004000 [208r,208d:0) 0 at 208r  weight:0.000000e+00
+# CHECK: %12 [224r,224d:0) 0 at 224r  L0000000000010000 [224r,224d:0) 0 at 224r  weight:0.000000e+00
+# CHECK: %13 [240r,240d:0) 0 at 240r  L000000000000300C [240r,240d:0) 0 at 240r  weight:0.000000e+00
+# CHECK: %14 [256r,256d:0) 0 at 256r  L000000000003C000 [256r,256d:0) 0 at 256r  weight:0.000000e+00
+
+
+# CHECK:       0B bb.0
+# CHECK-NEXT:    liveins
+# CHECK-NEXT:  16B  %0:vsrc = COPY $v2
+# CHECK-NEXT:  32B  %float:fprc = COPY %0.sub_64:vsrc
+# CHECK-NEXT:  48B  dead undef %pair.sub_vsx0:vsrprc = COPY $v2
+# CHECK-NEXT:  64B  undef %15.sub_vsx1:vsrprc = COPY $v3
+# CHECK-NEXT:  80B  dead undef %3.sub_vsx0:vsrprc = COPY %0:vsrc
+# CHECK-NEXT:  96B  dead undef %4.sub_vsx1:vsrprc = COPY %0:vsrc
+# CHECK-NEXT:  112B  dead undef %5.sub_vsx0:accrc = COPY %0:vsrc
+# CHECK-NEXT:  128B  dead undef %6.sub_vsx1:accrc = COPY %0:vsrc
+# CHECK-NEXT:  144B  dead undef %7.sub_64:vsrprc = COPY %float:fprc
+# CHECK-NEXT:  160B  dead undef %8.sub_vsx1_then_sub_64:vsrprc = COPY %float:fprc
+# CHECK-NEXT:  176B  dead undef %9.sub_64:accrc = COPY %float:fprc
+# CHECK-NEXT:  192B  dead undef %10.sub_vsx1_then_sub_64:accrc = COPY %float:fprc
+# CHECK-NEXT:  208B  dead undef %11.sub_pair1_then_sub_64:accrc = COPY %float:fprc
+# CHECK-NEXT:  224B  dead undef %12.sub_pair1_then_sub_vsx1_then_sub_64:accrc = COPY %float:fprc
+# CHECK-NEXT:  240B  dead undef %13.sub_pair0:accrc = COPY %15:vsrprc
+# CHECK-NEXT:  256B  dead undef %14.sub_pair1:accrc = COPY %15:vsrprc
+
+
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $v2, $v3
+    %0:vsrc = COPY $v2
+    %float:fprc = COPY %0.sub_64
+    undef %pair.sub_vsx0:vsrprc = COPY $v2
+    undef %pair.sub_vsx1:vsrprc = COPY $v3
+    undef %1.sub_vsx0:vsrprc = COPY %0
+    undef %2.sub_vsx1:vsrprc = COPY %0
+    undef %3.sub_vsx0:accrc = COPY %0
+    undef %4.sub_vsx1:accrc = COPY %0
+    undef %5.sub_64:vsrprc = COPY %float
+    undef %6.sub_vsx1_then_sub_64:vsrprc = COPY %float
+    undef %7.sub_64:accrc = COPY %float
+    undef %8.sub_vsx1_then_sub_64:accrc = COPY %float
+    undef %9.sub_pair1_then_sub_64:accrc = COPY %float
+    undef %10.sub_pair1_then_sub_vsx1_then_sub_64:accrc = COPY %float
+    undef %11.sub_pair0:accrc = COPY %pair
+    undef %12.sub_pair1:accrc = COPY %pair
+...