[llvm-branch-commits] [llvm] GlobalISel: Fix mishandling vector-as-scalar in return values (PR #175780)

Tue Jan 13 07:29:37 PST 2026

https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/175780

This fixes 2 cases when the AMDGPU ABI is fixed to pass <2 x i16>
values as packed on gfx6/gfx7. The ABI does not pack values
currently; this is a pre-fix for that change.

Insert a bitcast if there is a single part with a different size.
Previously this would miscompile by going through the scalarization
and extend path, dropping the high element.

Also fix assertions in odd cases, like <3 x i16> -> i32. This needs
to unmerge with excess elements from the widened source vector.

All of this code is in need of a cleanup; this should look more
like the DAG version using getVectorTypeBreakdown.

>From 186d52070cb20200b9c0165ff756db9ecbb3ae29 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 13 Jan 2026 08:52:36 +0100
Subject: [PATCH] GlobalISel: Fix mishandling vector-as-scalar in return values

This fixes 2 cases when the AMDGPU ABI is fixed to pass <2 x i16>
values as packed on gfx6/gfx7. The ABI does not pack values
currently; this is a pre-fix for that change.

Insert a bitcast if there is a single part with a different size.
Previously this would miscompile by going through the scalarization
and extend path, dropping the high element.

Also fix assertions in odd cases, like <3 x i16> -> i32. This needs
to unmerge with excess elements from the widened source vector.

All of this code is in need of a cleanup; this should look more
like the DAG version using getVectorTypeBreakdown.
---
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 26 ++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index e2ed45eec0ecd..0da360d8038b6 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -568,6 +568,13 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
 
   const TypeSize PartSize = PartTy.getSizeInBits();
 
+  if (PartSize == SrcTy.getSizeInBits() && DstRegs.size() == 1) {
+    // TODO: Handle int<->ptr casts. It just happens the ABI lowering
+    // assignments are not pointer aware.
+    B.buildBitcast(DstRegs[0], SrcReg);
+    return;
+  }
+
   if (PartTy.isVector() == SrcTy.isVector() &&
       PartTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits()) {
     assert(DstRegs.size() == 1);
@@ -576,9 +583,11 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
   }
 
   if (SrcTy.isVector() && !PartTy.isVector() &&
-      TypeSize::isKnownGT(PartSize, SrcTy.getElementType().getSizeInBits())) {
+      TypeSize::isKnownGT(PartSize, SrcTy.getElementType().getSizeInBits()) &&
+      SrcTy.getElementCount() == ElementCount::getFixed(DstRegs.size())) {
     // Vector was scalarized, and the elements extended.
     auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
+
     for (int i = 0, e = DstRegs.size(); i != e; ++i)
       B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
     return;
@@ -645,9 +654,22 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
     }
   }
 
-  if (LCMTy.isVector() && CoveringSize != SrcSize)
+  if (LCMTy.isVector() && CoveringSize != SrcSize) {
     UnmergeSrc = B.buildPadVectorWithUndefElements(LCMTy, SrcReg).getReg(0);
 
+    unsigned ExcessBits = CoveringSize - DstSize * DstRegs.size();
+    if (ExcessBits != 0) {
+      SmallVector<Register, 8> PaddedDstRegs(DstRegs.begin(), DstRegs.end());
+
+      MachineRegisterInfo &MRI = *B.getMRI();
+      for (unsigned I = 0; I != ExcessBits; I += PartSize)
+        PaddedDstRegs.push_back(MRI.createGenericVirtualRegister(PartTy));
+
+      B.buildUnmerge(PaddedDstRegs, UnmergeSrc);
+      return;
+    }
+  }
+
   B.buildUnmerge(DstRegs, UnmergeSrc);
 }