[llvm] [BPF] i128 direct return support (PR #183258)

Sat Mar 7 13:00:32 PST 2026

https://github.com/clairechingching updated https://github.com/llvm/llvm-project/pull/183258

>From 49328907ac24f979cd0c4bffd171469d5561dfc9 Mon Sep 17 00:00:00 2001
From: Claire Fan <fanyungching at gmail.com>
Date: Sun, 8 Feb 2026 09:07:52 +0100
Subject: [PATCH] [BPF] i128 direct return support

---
 llvm/lib/Target/BPF/BPF.td              |  3 +++
 llvm/lib/Target/BPF/BPFCallingConv.td   |  6 ++++--
 llvm/lib/Target/BPF/BPFISelLowering.cpp | 20 ++++++++++++++----
 llvm/lib/Target/BPF/BPFISelLowering.h   |  3 +++
 llvm/lib/Target/BPF/BPFSubtarget.cpp    |  1 +
 llvm/lib/Target/BPF/BPFSubtarget.h      |  4 ++++
 llvm/test/CodeGen/BPF/arr_ret1.ll       | 27 +++++++++++++++++++++++++
 llvm/test/CodeGen/BPF/i128-bpf64.ll     | 25 +++++++++++++++++++++++
 llvm/test/CodeGen/BPF/struct_ret2.ll    |  7 ++++---
 llvm/test/CodeGen/BPF/vec_ret1.ll       | 13 ++++++++++++
 10 files changed, 100 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/BPF/arr_ret1.ll
 create mode 100644 llvm/test/CodeGen/BPF/i128-bpf64.ll
 create mode 100644 llvm/test/CodeGen/BPF/vec_ret1.ll

diff --git a/llvm/lib/Target/BPF/BPF.td b/llvm/lib/Target/BPF/BPF.td
index 1fc364dad9988..84ecb90a98009 100644
--- a/llvm/lib/Target/BPF/BPF.td
+++ b/llvm/lib/Target/BPF/BPF.td
@@ -34,6 +34,9 @@ def MisalignedMemAccess : SubtargetFeature<"allows-misaligned-mem-access",
                                            "AllowsMisalignedMemAccess", "true",
                                            "Allows misaligned memory access">;
 
+def i128DirectReturn : SubtargetFeature<"has-i128-direct-return", "Hasi128DirectReturn", 
+                                        "true", "Enable i128 direct return">;
+
 def : Proc<"generic", []>;
 def : Proc<"v1", []>;
 def : Proc<"v2", []>;
diff --git a/llvm/lib/Target/BPF/BPFCallingConv.td b/llvm/lib/Target/BPF/BPFCallingConv.td
index a557211437e95..d92cc959bdeb6 100644
--- a/llvm/lib/Target/BPF/BPFCallingConv.td
+++ b/llvm/lib/Target/BPF/BPFCallingConv.td
@@ -11,7 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 // BPF 64-bit C return-value convention.
-def RetCC_BPF64 : CallingConv<[CCIfType<[i64], CCAssignToReg<[R0]>>]>;
+def RetCC_BPF64 : CallingConv<[CCIfType<[i64], CCAssignToReg<[R0, R1]>>]>;
 
 // BPF 64-bit C Calling convention.
 def CC_BPF64 : CallingConv<[
@@ -28,7 +28,9 @@ def CC_BPF64 : CallingConv<[
 // Return-value convention when -mattr=+alu32 enabled
 def RetCC_BPF32 : CallingConv<[
   CCIfType<[i32], CCAssignToRegWithShadow<[W0], [R0]>>,
-  CCIfType<[i64], CCAssignToRegWithShadow<[R0], [W0]>>
+  CCIfType<[i32], CCAssignToRegWithShadow<[W1], [R1]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0], [W0]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R1], [W1]>>,
 ]>;
 
 // Calling convention when -mattr=+alu32 enabled
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index fd26345920a71..fcca9228e9ba3 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -207,6 +207,8 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
   HasJmpExt = STI.getHasJmpExt();
   HasMovsx = STI.hasMovsx();
 
+  Hasi128DirectReturn = STI.getHasi128DirectReturn();
+
   AllowsMisalignedMemAccess = STI.getAllowsMisalignedMemAccess();
 }
 
@@ -633,9 +635,18 @@ BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   // CCState - Info about the registers and stack slot.
   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
 
-  if (MF.getFunction().getReturnType()->isAggregateType()) {
-    fail(DL, DAG, "aggregate returns are not supported");
-    return DAG.getNode(Opc, DL, MVT::Other, Chain);
+  const Function &F = MF.getFunction();
+  Type *retTy = F.getReturnType();
+
+  if (retTy->isAggregateType() || retTy->isVectorTy()) {
+    // BPF calling convention:
+    // 1. A return value may never occupy more than 2 registers.
+    // 2. When the target doesn't support i128 direct return through R0/R1,
+    //    the return value must fit in a single register.
+    if (Outs.size() > 2 || (!Hasi128DirectReturn && Outs.size() > 1)) {
+      fail(DL, DAG, "aggregate returns are not supported");
+      return DAG.getNode(Opc, DL, MVT::Other, Chain);
+    }
   }
 
   // Analize return values.
@@ -677,7 +688,8 @@ SDValue BPFTargetLowering::LowerCallResult(
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
 
-  if (Ins.size() > 1) {
+  // BPF returns at most 2 registers (only 1 without i128 direct return).
+  if (Ins.size() > 2 || (!Hasi128DirectReturn && Ins.size() > 1)) {
     fail(DL, DAG, "only small returns supported");
     for (auto &In : Ins)
       InVals.push_back(DAG.getConstant(0, DL, In.VT));
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
index 8607e4f8c9e69..e828f6a050d29 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -65,6 +65,9 @@ class BPFTargetLowering : public TargetLowering {
   bool HasJmpExt;
   bool HasMovsx;
 
+  // Supports i128 Direct Return
+  bool Hasi128DirectReturn;
+
   // Allows Misalignment
   bool AllowsMisalignedMemAccess;
 
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp
index 726f8f4b39827..1efbae3c84793 100644
--- a/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -69,6 +69,7 @@ void BPFSubtarget::initializeEnvironment() {
   HasStoreImm = false;
   HasLoadAcqStoreRel = false;
   HasGotox = false;
+  Hasi128DirectReturn = false;
   AllowsMisalignedMemAccess = false;
 }
 
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.h b/llvm/lib/Target/BPF/BPFSubtarget.h
index 24eff862224b0..7f0307d8da5ec 100644
--- a/llvm/lib/Target/BPF/BPFSubtarget.h
+++ b/llvm/lib/Target/BPF/BPFSubtarget.h
@@ -60,6 +60,9 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
   // whether the cpu supports alu32 instructions.
   bool HasAlu32;
 
+  // whether target supports i128 direct return.
+  bool Hasi128DirectReturn;
+
   // whether we should enable MCAsmInfo DwarfUsesRelocationsAcrossSections
   bool UseDwarfRIS;
 
@@ -89,6 +92,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
   bool getHasJmpExt() const { return HasJmpExt; }
   bool getHasJmp32() const { return HasJmp32; }
   bool getHasAlu32() const { return HasAlu32; }
+  bool getHasi128DirectReturn() const { return Hasi128DirectReturn; }
   bool getUseDwarfRIS() const { return UseDwarfRIS; }
   bool getAllowsMisalignedMemAccess() const {
     return AllowsMisalignedMemAccess;
diff --git a/llvm/test/CodeGen/BPF/arr_ret1.ll b/llvm/test/CodeGen/BPF/arr_ret1.ll
new file mode 100644
index 0000000000000..da36a1713b67e
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/arr_ret1.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=bpf -mattr=+has-i128-direct-return < %s | FileCheck %s
+
+; Source code:
+; typedef struct {
+;     long long v[2];
+; } arr2_i64;
+; 
+; arr2_i64 bar(int a, int b, int c, int d, int e);
+; 
+; arr2_i64 foo(int a, int b, int c) {
+;     return bar(a, b, c, 1, 2);
+; }
+; 
+; Compile with:
+; 	clang -target bpf -O2 -S -emit-llvm foo.c
+
+; Function Attrs: nounwind uwtable
+define [2 x i64] @foo(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: foo:
+; CHECK: w4 = 1
+; CHECK-NEXT: w5 = 2
+entry:
+  %call = tail call [2 x i64]  @bar(i32 %a, i32 %b, i32 %c, i32 1, i32 2) #3
+  ret [2 x i64] %call
+}
+
+declare [2 x i64] @bar(i32, i32, i32, i32, i32) #1
diff --git a/llvm/test/CodeGen/BPF/i128-bpf64.ll b/llvm/test/CodeGen/BPF/i128-bpf64.ll
new file mode 100644
index 0000000000000..d089b30276ca6
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/i128-bpf64.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=bpf -mcpu=generic -mattr=+has-i128-direct-return < %s | FileCheck %s
+
+; Source code:
+; void test(__int128 *a) {
+;     __int128 tmp = __atomic_load_n(a, __ATOMIC_RELAXED);
+;     __atomic_store_n(a, tmp, __ATOMIC_RELAXED);
+; }
+; 
+; Compile with:
+; 	clang -target bpf -O2 -S -emit-llvm test.c
+
+define void @test(ptr %a) nounwind {
+; CHECK-LABEL: test:
+; CHECK: r6 = r1
+; CHECK-NEXT: r2 = 0
+; CHECK-NEXT: call __atomic_load_16
+; CHECK-NEXT: r3 = r1
+; CHECK-NEXT: r1 = r6
+; CHECK-NEXT: r2 = r0
+; CHECK-NEXT: r4 = 0
+; CHECK-NEXT: call __atomic_store_16
+  %1 = load atomic i128, ptr %a monotonic, align 16
+  store atomic i128 %1, ptr %a monotonic, align 16
+  ret void
+}
diff --git a/llvm/test/CodeGen/BPF/struct_ret2.ll b/llvm/test/CodeGen/BPF/struct_ret2.ll
index 170d55cc29df0..9b0d7d4fe2f81 100644
--- a/llvm/test/CodeGen/BPF/struct_ret2.ll
+++ b/llvm/test/CodeGen/BPF/struct_ret2.ll
@@ -1,9 +1,10 @@
-; RUN: not llc -mtriple=bpf < %s 2> %t1
-; RUN: FileCheck %s < %t1
-; CHECK: only small returns
+; RUN: llc -mtriple=bpf  -mattr=+has-i128-direct-return < %s | FileCheck %s
 
 ; Function Attrs: nounwind uwtable
 define { i64, i32 } @foo(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: foo:
+; CHECK: w4 = 1
+; CHECK-NEXT: w5 = 2
 entry:
   %call = tail call { i64, i32 } @bar(i32 %a, i32 %b, i32 %c, i32 1, i32 2) #3
   ret { i64, i32 } %call
diff --git a/llvm/test/CodeGen/BPF/vec_ret1.ll b/llvm/test/CodeGen/BPF/vec_ret1.ll
new file mode 100644
index 0000000000000..f7e0d4ab95494
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/vec_ret1.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=bpf -mattr=+has-i128-direct-return < %s | FileCheck %s
+
+; Function Attrs: nounwind uwtable
+define <2 x i64> @foo(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: foo:
+; CHECK: w4 = 1
+; CHECK-NEXT: w5 = 2
+entry:
+  %call = tail call <2 x i64> @bar(i32 %a, i32 %b, i32 %c, i32 1, i32 2) #3
+  ret <2 x i64> %call
+}
+
+declare <2 x i64> @bar(i32, i32, i32, i32, i32) #1