[llvm] [PowerPC][AIX] Support ByVals with greater alignment then pointer size (PR #93341)

Fri May 24 13:09:45 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-powerpc

Author: Zarko Todorovski (ZarkoT)

<details>
<summary>Changes</summary>

Implementation is NOT compatible with XL. Instead it handles all ByVals
with greater alignment then pointer width the same way XL handles Byvals
that have vector members. For overaligned objects that do not contain
vectors XL does not align them properly if they are passed in the GPR
argument registers.

This patch was originally written by Sean Fertile @mandlebug. 

Previously on Phabricator https://reviews.llvm.org/D105659

---
Full diff: https://github.com/llvm/llvm-project/pull/93341.diff


4 Files Affected:

- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+28-17) 
- (modified) llvm/test/CodeGen/PowerPC/aix-cc-byval-limitation3.ll (+1-1) 
- (added) llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll (+50) 
- (added) llvm/test/CodeGen/PowerPC/aix-vector-byval.ll (+49) 


``````````diff

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8450ce9e0e3b3..0b628ea2b3f6a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -6855,7 +6855,8 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
   const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
       State.getMachineFunction().getSubtarget());
   const bool IsPPC64 = Subtarget.isPPC64();
-  const Align PtrAlign = IsPPC64 ? Align(8) : Align(4);
+  const Align PtrAlign(PtrSize);
+  const Align StackAlign(16);
   const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
 
   if (ValVT == MVT::f128)
@@ -6876,12 +6877,16 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
                                  PPC::V6,  PPC::V7,  PPC::V8,  PPC::V9,
                                  PPC::V10, PPC::V11, PPC::V12, PPC::V13};
 
+  const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
+
   if (ArgFlags.isByVal()) {
+    const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
     if (ArgFlags.getNonZeroByValAlign() > PtrAlign)
       report_fatal_error("Pass-by-value arguments with alignment greater than "
-                         "register width are not supported.");
+                         "16 are not supported.");
 
     const unsigned ByValSize = ArgFlags.getByValSize();
+    const Align ObjAlign = ByValAlign > PtrAlign ? ByValAlign : PtrAlign;
 
     // An empty aggregate parameter takes up no storage and no registers,
     // but needs a MemLoc for a stack slot for the formal arguments side.
@@ -6891,11 +6896,23 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
       return false;
     }
 
-    const unsigned StackSize = alignTo(ByValSize, PtrAlign);
-    unsigned Offset = State.AllocateStack(StackSize, PtrAlign);
-    for (const unsigned E = Offset + StackSize; Offset < E;
-         Offset += PtrAlign.value()) {
-      if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
+    // Shadow allocate any registers that are not properly aligned.
+    unsigned NextReg = State.getFirstUnallocated(GPRs);
+    while (NextReg != GPRs.size() &&
+           !isGPRShadowAligned(GPRs[NextReg], ObjAlign)) {
+      // Shadow allocate next registers since its aligment is not strict enough.
+      unsigned Reg = State.AllocateReg(GPRs);
+      // Allocate the stack space shadowed by said register.
+      State.AllocateStack(PtrSize, PtrAlign);
+      assert(Reg && "Alocating register unexpectedly failed.");
+      (void)Reg;
+      NextReg = State.getFirstUnallocated(GPRs);
+    }
+
+    const unsigned StackSize = alignTo(ByValSize, ObjAlign);
+    unsigned Offset = State.AllocateStack(StackSize, ObjAlign);
+    for (const unsigned E = Offset + StackSize; Offset < E; Offset += PtrSize) {
+      if (unsigned Reg = State.AllocateReg(GPRs))
         State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
       else {
         State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
@@ -6917,12 +6934,12 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
     [[fallthrough]];
   case MVT::i1:
   case MVT::i32: {
-    const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign);
+    const unsigned Offset = State.AllocateStack(PtrSize, PtrAlign);
     // AIX integer arguments are always passed in register width.
     if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits())
       LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
                                   : CCValAssign::LocInfo::ZExt;
-    if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
+    if (unsigned Reg = State.AllocateReg(GPRs))
       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
     else
       State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));
@@ -6942,8 +6959,8 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
       State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));
 
     // Reserve and initialize GPRs or initialize the PSA as required.
-    for (unsigned I = 0; I < StoreSize; I += PtrAlign.value()) {
-      if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
+    for (unsigned I = 0; I < StoreSize; I += PtrSize) {
+      if (unsigned Reg = State.AllocateReg(GPRs)) {
         assert(FReg && "An FPR should be available when a GPR is reserved.");
         if (State.isVarArg()) {
           // Successfully reserved GPRs are only initialized for vararg calls.
@@ -6995,9 +7012,6 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
       return false;
     }
 
-    const unsigned PtrSize = IsPPC64 ? 8 : 4;
-    ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
-
     unsigned NextRegIndex = State.getFirstUnallocated(GPRs);
     // Burn any underaligned registers and their shadowed stack space until
     // we reach the required alignment.
@@ -7347,9 +7361,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
       const MCPhysReg ArgReg = VA.getLocReg();
       const PPCFrameLowering *FL = Subtarget.getFrameLowering();
 
-      if (Flags.getNonZeroByValAlign() > PtrByteSize)
-        report_fatal_error("Over aligned byvals not supported yet.");
-
       const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
       const int FI = MF.getFrameInfo().CreateFixedObject(
           StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-limitation3.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-limitation3.ll
index bb0231dbf5417..d4d9d67061fcb 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-byval-limitation3.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-limitation3.ll
@@ -13,4 +13,4 @@ entry:
 
 declare void @foo(i32, i32, i32, i32, i32, i32, i32, i32, ptr byval(%struct.S) align 32)
 
-; CHECK: LLVM ERROR: Pass-by-value arguments with alignment greater than register width are not supported.
+; CHECK: LLVM ERROR: Pass-by-value arguments with alignment greater than 16 are not supported.
diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
new file mode 100644
index 0000000000000..3491779b1002c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
@@ -0,0 +1,50 @@
+TE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr7 \
+; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=32BIT %s
+
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr7 \
+; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=64BIT %s
+
+%struct.vec_struct = type { <4 x i32> }
+
+; Function Attrs: norecurse nounwind readonly
+define i32 @vec_struct_test(i32 %i, %struct.vec_struct* nocapture readonly byval(%struct.vec_struct) align 16 %vs) {
+  ; 32BIT-LABEL: name: vec_struct_test
+
+  ; 32BIT: fixedStack:
+  ; 32BIT: - { id: 0, type: default, offset: 32, size: 16, alignment: 16, stack-id: default,
+  ; 32BIT:     isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
+  ; 32BIT:     debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+
+  ; 32BIT: bb.0.entry:
+  ; 32BIT:   liveins: $r3, $r5, $r6, $r7, $r8
+  ; 32BIT:   renamable $r3 = nsw ADD4 renamable $r5, killed renamable $r3
+  ; 32BIT:   STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
+  ; 32BIT:   STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+  ; 32BIT:   STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
+  ; 32BIT:   STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12)
+  ; 32BIT:   BLR implicit $lr, implicit $rm, implicit $r3
+
+
+  ; 64BIT-LABEL: name: vec_struct_test
+  ; 64BIT: fixedStack:
+  ; 64BIT:  - { id: 0, type: default, offset: 64, size: 16, alignment: 16, stack-id: default,
+  ; 64BIT:      isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
+  ; 64BIT:      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+
+  ; 64BIT: bb.0.entry:
+  ; 64BIT:   liveins: $x3, $x5, $x6
+  ; 64BIT:   STD killed renamable $x5, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+  ; 64BIT:   renamable $r4 = LWZ 0, %fixed-stack.0 :: (dereferenceable load (s32) from %ir.vsi1, align 16)
+  ; 64BIT:   renamable $r3 = nsw ADD4 killed renamable $r4, renamable $r3, implicit killed $x3, implicit-def $x3
+  ; 64BIT:   STD killed renamable $x6, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
+  ; 64BIT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %vsi = getelementptr inbounds %struct.vec_struct, %struct.vec_struct* %vs, i32 0, i32 0
+  %0 = load <4 x i32>, <4 x i32>* %vsi, align 16
+  %vecext = extractelement <4 x i32> %0, i32 0
+  %add = add nsw i32 %vecext, %i
+  ret i32 %add
+}
diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-byval.ll b/llvm/test/CodeGen/PowerPC/aix-vector-byval.ll
new file mode 100644
index 0000000000000..9e4139cdf0c6c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-byval.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr7 \
+; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=32BIT %s
+
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr7 \
+; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=64BIT %s
+
+%struct.B = type { <8 x i16>, i32, i32, [8 x i8] }
+
+; Function Attrs: nounwind
+define i32 @caller() {
+  ; 32BIT-LABEL: name: caller
+  ; 32BIT: bb.0.entry:
+  ; 32BIT:   renamable $r3 = LWZ 28, %stack.0.vs :: (load (s32) from unknown-address + 4)
+  ; 32BIT:   STW killed renamable $r3, 60, $r1 :: (store (s32) into unknown-address + 4, basealign 16)
+  ; 32BIT:   renamable $r3 = LWZ 24, %stack.0.vs :: (load (s32) from %stack.0.vs + 24, align 8, basealign 16)
+  ; 32BIT:   STW killed renamable $r3, 56, $r1 :: (store (s32), align 16)
+  ; 32BIT:   ADJCALLSTACKDOWN 64, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT:   renamable $r10 = LWZ 20, %stack.0.vs :: (load (s32) from %stack.0.vs + 20)
+  ; 32BIT:   renamable $r9 = LWZ 16, %stack.0.vs :: (load (s32) from %stack.0.vs + 16, align 16)
+  ; 32BIT:   renamable $r8 = LWZ 12, %stack.0.vs :: (load (s32) from %stack.0.vs + 12)
+  ; 32BIT:   renamable $r7 = LWZ 8, %stack.0.vs :: (load (s32) from %stack.0.vs + 8, align 8)
+  ; 32BIT:   renamable $r6 = LWZ 4, %stack.0.vs :: (load (s32) from %stack.0.vs + 4)
+  ; 32BIT:   renamable $r5 = LWZ 0, %stack.0.vs :: (load (s32) from %stack.0.vs, align 16)
+  ; 32BIT:   $r3 = LI 0
+  ; 32BIT:   BL_NOP <mcsymbol .vec_struct_test[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def $r3
+  ; 32BIT:   ADJCALLSTACKUP 64, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT:   BLR implicit $lr, implicit $rm, implicit $r3
+
+  ; 64BIT-LABEL: name: caller
+  ; 64BIT: bb.0.entry:
+  ; 64BIT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT:   renamable $x8 = LD 24, %stack.0.vs :: (load (s64) from %stack.0.vs + 24)
+  ; 64BIT:   renamable $x7 = LD 16, %stack.0.vs :: (load (s64) from %stack.0.vs + 16, align 16)
+  ; 64BIT:   renamable $x6 = LD 8, %stack.0.vs :: (load (s64) from %stack.0.vs + 8)
+  ; 64BIT:   renamable $x5 = LD 0, %stack.0.vs :: (load (s64) from %stack.0.vs, align 16)
+  ; 64BIT:   $x3 = LI8 0
+  ; 64BIT:   BL8_NOP <mcsymbol .vec_struct_test[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x2, implicit-def $r1, implicit-def $x3
+  ; 64BIT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+  entry:
+  %vs = alloca %struct.B, align 16
+  %call = tail call i32 @vec_struct_test(i32 0, %struct.B* nonnull byval(%struct.B) align 16 %vs)
+  ret i32 %call
+}
+
+declare i32 @vec_struct_test(i32, %struct.B* byval(%struct.B) align 16)

``````````

</details>


https://github.com/llvm/llvm-project/pull/93341