[clang] [llvm] [LoongArch] Add support for half-precision floating-point type (PR #141564)

Tue May 27 01:44:55 PDT 2025

llvmbot wrote:



@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-loongarch

Author: None (Ami-zhang)

<details>
<summary>Changes</summary>

This PR contains 3 commits:
1. Updated the FP16 implementation to pass arguments in FPRs instead of GPRs, as was done originally.
2. Added support for the `_Float16` type and fixed 2 related issues.
3. Added support for the `__bf16` type.

---

Patch is 219.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141564.diff


15 Files Affected:

- (modified) clang/docs/LanguageExtensions.rst (+2) 
- (modified) clang/lib/Basic/Targets/LoongArch.h (+8) 
- (modified) clang/lib/CodeGen/Targets/LoongArch.cpp (+3-4) 
- (added) clang/test/CodeGen/LoongArch/__fp16-convert.c (+30) 
- (modified) clang/test/CodeGen/LoongArch/abi-lp64d.c (+71) 
- (added) clang/test/CodeGen/LoongArch/bfloat-abi.c (+611) 
- (added) clang/test/CodeGen/LoongArch/bfloat-mangle.cpp (+19) 
- (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+181-3) 
- (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.h (+26) 
- (added) llvm/test/CodeGen/LoongArch/bf16-promote.ll (+172) 
- (added) llvm/test/CodeGen/LoongArch/bf16.ll (+1048) 
- (added) llvm/test/CodeGen/LoongArch/calling-conv-half.ll (+1626) 
- (modified) llvm/test/CodeGen/LoongArch/fp16-promote.ll (+131-71) 
- (added) llvm/test/CodeGen/LoongArch/issue97975.ll (+438) 
- (added) llvm/test/CodeGen/LoongArch/issue97981.ll (+127) 


``````````diff

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index a40dd4d1a1673..4fa91b95c45e0 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1001,6 +1001,7 @@ to ``float``; see below for more information on this emulation.
   * X86 (if SSE2 is available; natively if AVX512-FP16 is also available)
   * RISC-V (natively if Zfh or Zhinx is available)
   * SystemZ (emulated)
+  * LoongArch
 
 * ``__bf16`` is supported on the following targets (currently never natively):
 
@@ -1008,6 +1009,7 @@ to ``float``; see below for more information on this emulation.
   * 64-bit ARM (AArch64)
   * RISC-V
   * X86 (when SSE2 is available)
+  * LoongArch
 
 (For X86, SSE2 is available on 64-bit and all recent 32-bit processors.)
 
diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h
index 4c7b53abfef9b..7e9affc98ac0f 100644
--- a/clang/lib/Basic/Targets/LoongArch.h
+++ b/clang/lib/Basic/Targets/LoongArch.h
@@ -49,10 +49,14 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo {
     HasFeatureLD_SEQ_SA = false;
     HasFeatureDiv32 = false;
     HasFeatureSCQ = false;
+    BFloat16Width = 16;
+    BFloat16Align = 16;
+    BFloat16Format = &llvm::APFloat::BFloat();
     LongDoubleWidth = 128;
     LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::IEEEquad();
     MCountName = "_mcount";
+    HasFloat16 = true;
     SuitableAlign = 128;
     WCharType = SignedInt;
     WIntType = UnsignedInt;
@@ -98,6 +102,10 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo {
 
   bool hasBitIntType() const override { return true; }
 
+  bool hasBFloat16Type() const override { return true; }
+
+  bool useFP16ConversionIntrinsics() const override { return false; }
+
   bool handleTargetFeatures(std::vector<std::string> &Features,
                             DiagnosticsEngine &Diags) override;
 
diff --git a/clang/lib/CodeGen/Targets/LoongArch.cpp b/clang/lib/CodeGen/Targets/LoongArch.cpp
index 0f689371a60db..7640f3779816a 100644
--- a/clang/lib/CodeGen/Targets/LoongArch.cpp
+++ b/clang/lib/CodeGen/Targets/LoongArch.cpp
@@ -110,10 +110,9 @@ bool LoongArchABIInfo::detectFARsEligibleStructHelper(
     uint64_t Size = getContext().getTypeSize(Ty);
     if (IsInt && Size > GRLen)
       return false;
-    // Can't be eligible if larger than the FP registers. Half precision isn't
-    // currently supported on LoongArch and the ABI hasn't been confirmed, so
-    // default to the integer ABI in that case.
-    if (IsFloat && (Size > FRLen || Size < 32))
+    // Can't be eligible if larger than the FP registers. Handling of half
+    // precision values has been specified in the ABI, so don't block those.
+    if (IsFloat && Size > FRLen)
       return false;
     // Can't be eligible if an integer type was already found (int+int pairs
     // are not eligible).
diff --git a/clang/test/CodeGen/LoongArch/__fp16-convert.c b/clang/test/CodeGen/LoongArch/__fp16-convert.c
new file mode 100644
index 0000000000000..84ef5de960b47
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/__fp16-convert.c
@@ -0,0 +1,30 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple loongarch64 -emit-llvm %s -o - \
+// RUN:   | FileCheck %s
+
+__fp16 y;
+short z;
+// CHECK-LABEL: define dso_local void @bar1(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr @y, align 2
+// CHECK-NEXT:    [[CONV:%.*]] = fpext half [[TMP0]] to float
+// CHECK-NEXT:    [[CONV1:%.*]] = fptosi float [[CONV]] to i16
+// CHECK-NEXT:    store i16 [[CONV1]], ptr @z, align 2
+// CHECK-NEXT:    ret void
+//
+void bar1(){
+    z = y;
+}
+// CHECK-LABEL: define dso_local void @bar2(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @z, align 2
+// CHECK-NEXT:    [[CONV:%.*]] = sitofp i16 [[TMP0]] to float
+// CHECK-NEXT:    [[CONV1:%.*]] = fptrunc float [[CONV]] to half
+// CHECK-NEXT:    store half [[CONV1]], ptr @y, align 2
+// CHECK-NEXT:    ret void
+//
+void bar2(){
+    y = z;
+}
diff --git a/clang/test/CodeGen/LoongArch/abi-lp64d.c b/clang/test/CodeGen/LoongArch/abi-lp64d.c
index fc7f1eada586b..9f64cfd662e5f 100644
--- a/clang/test/CodeGen/LoongArch/abi-lp64d.c
+++ b/clang/test/CodeGen/LoongArch/abi-lp64d.c
@@ -48,6 +48,9 @@ unsigned long check_ulong() { return 0; }
 // CHECK-LABEL: define{{.*}} i64 @check_ulonglong()
 unsigned long long check_ulonglong() { return 0; }
 
+// CHECK-LABEL: define{{.*}}  half @check_float16()
+_Float16 check_float16() { return 0; }
+
 // CHECK-LABEL: define{{.*}} float @check_float()
 float check_float() { return 0; }
 
@@ -127,6 +130,14 @@ struct i16x4_s f_i16x4_s(struct i16x4_s x) {
 /// available, the value is passed in a GAR; if no GAR is available, the value
 /// is passed on the stack.
 
+struct f16x1_s {
+  __fp16 a;
+};
+
+struct float16x1_s {
+  _Float16 a;
+};
+
 struct f32x1_s {
   float a;
 };
@@ -135,6 +146,16 @@ struct f64x1_s {
   double a;
 };
 
+// CHECK-LABEL: define{{.*}} half @f_f16x1_s(half %0)
+struct f16x1_s f_f16x1_s(struct f16x1_s x) {
+  return x;
+}
+
+// CHECK-LABEL: define{{.*}} half @f_float16x1_s(half %0)
+struct float16x1_s f_float16x1_s(struct float16x1_s x) {
+  return x;
+}
+
 // CHECK-LABEL: define{{.*}} float @f_f32x1_s(float %0)
 struct f32x1_s f_f32x1_s(struct f32x1_s x) {
   return x;
@@ -151,10 +172,20 @@ struct f64x1_s f_f64x1_s(struct f64x1_s x) {
 /// number of available FAR is less than 2, it’s passed in a GAR, and passed on
 /// the stack if no GAR is available.
 
+struct f16x2_s {
+  __fp16 a;
+  _Float16 b;
+};
+
 struct f32x2_s {
   float a, b;
 };
 
+// CHECK-LABEL: define{{.*}} { half, half } @f_f16x2_s(half %0, half %1)
+struct f16x2_s f_f16x2_s(struct f16x2_s x) {
+  return x;
+}
+
 // CHECK-LABEL: define{{.*}} { float, float } @f_f32x2_s(float %0, float %1)
 struct f32x2_s f_f32x2_s(struct f32x2_s x) {
   return x;
@@ -165,11 +196,21 @@ struct f32x2_s f_f32x2_s(struct f32x2_s x) {
 /// i. Multiple fixed-point members. If there are available GAR, the structure
 /// is passed in a GAR, and passed on the stack if no GAR is available.
 
+struct f16x1_i16x2_s {
+  _Float16 a;
+  int16_t b, c;
+};
+
 struct f32x1_i16x2_s {
   float a;
   int16_t b, c;
 };
 
+// CHECK-LABEL: define{{.*}} i64 @f_f16x1_i16x2_s(i64 %x.coerce)
+struct f16x1_i16x2_s f_f16x1_i16x2_s(struct f16x1_i16x2_s x) {
+  return x;
+}
+
 // CHECK-LABEL: define{{.*}} i64 @f_f32x1_i16x2_s(i64 %x.coerce)
 struct f32x1_i16x2_s f_f32x1_i16x2_s(struct f32x1_i16x2_s x) {
   return x;
@@ -181,11 +222,21 @@ struct f32x1_i16x2_s f_f32x1_i16x2_s(struct f32x1_i16x2_s x) {
 /// but one GAR is available, it’s passed in GAR; If no GAR is available, it’s
 /// passed on the stack.
 
+struct f16x1_i32x1_s {
+  _Float16 a;
+  int32_t b;
+};
+
 struct f32x1_i32x1_s {
   float a;
   int32_t b;
 };
 
+// CHECK-LABEL: define{{.*}} { half, i32 } @f_f16x1_i32x1_s(half %0, i32 %1)
+struct f16x1_i32x1_s f_f16x1_i32x1_s(struct f16x1_i32x1_s x) {
+  return x;
+}
+
 // CHECK-LABEL: define{{.*}} { float, i32 } @f_f32x1_i32x1_s(float %0, i32 %1)
 struct f32x1_i32x1_s f_f32x1_i32x1_s(struct f32x1_i32x1_s x) {
   return x;
@@ -253,6 +304,16 @@ struct f32x4_s f_f32x4_s(struct f32x4_s x) {
   return x;
 }
 
+struct f16x5_s {
+  _Float16 a, b, c, d;
+  __fp16 e;
+};
+
+// CHECK-LABEL: define{{.*}} [2 x i64] @f_f16x5_s([2 x i64] %x.coerce)
+struct f16x5_s f_f16x5_s(struct f16x5_s x) {
+  return x;
+}
+
 /// ii. The structure with two double members is passed in a pair of available
 /// FARs. If no a pair of available FARs, it’s passed in GARs. A structure with
 /// one double member and one float member is same.
@@ -312,6 +373,16 @@ struct f32x2_i32x2_s f_f32x2_i32x2_s(struct f32x2_i32x2_s x) {
   return x;
 }
 
+struct f16x4_i32x2_s {
+  _Float16 a, b, c, d;
+  int32_t e, f;
+};
+
+// CHECK-LABEL: define{{.*}} [2 x i64] @f_f16x4_i32x2_s([2 x i64] %x.coerce)
+struct f16x4_i32x2_s f_f16x4_i32x2_s(struct f16x4_i32x2_s x) {
+  return x;
+}
+
 /// 3. WOA > 2 × GRLEN
 /// a. It’s passed by reference and are replaced in the argument list with the
 /// address. If there is an available GAR, the reference is passed in the GAR,
diff --git a/clang/test/CodeGen/LoongArch/bfloat-abi.c b/clang/test/CodeGen/LoongArch/bfloat-abi.c
new file mode 100644
index 0000000000000..9f0e25c17cc74
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/bfloat-abi.c
@@ -0,0 +1,611 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// RUN: %clang_cc1 -triple loongarch64 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK-LA64
+// RUN: %clang_cc1 -triple loongarch32 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK-LA32
+
+struct bfloat1 {
+  __bf16 a;
+};
+
+// CHECK-LA64-LABEL: define dso_local bfloat @h1
+// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-LA64-NEXT:  entry:
+// CHECK-LA64-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT1:%.*]], align 2
+// CHECK-LA64-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT1]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA64-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { bfloat }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT:    [[TMP2:%.*]] = load bfloat, ptr [[TMP1]], align 2
+// CHECK-LA64-NEXT:    ret bfloat [[TMP2]]
+//
+// CHECK-LA32-LABEL: define dso_local bfloat @h1
+// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-LA32-NEXT:  entry:
+// CHECK-LA32-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT1:%.*]], align 2
+// CHECK-LA32-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT1]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA32-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { bfloat }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT:    [[TMP2:%.*]] = load bfloat, ptr [[TMP1]], align 2
+// CHECK-LA32-NEXT:    ret bfloat [[TMP2]]
+//
+struct bfloat1 h1(__bf16 a) {
+  struct bfloat1 x;
+  x.a = a;
+  return x;
+}
+
+struct bfloat2 {
+  __bf16 a;
+  __bf16 b;
+};
+
+// CHECK-LA64-LABEL: define dso_local { bfloat, bfloat } @h2
+// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT:  entry:
+// CHECK-LA64-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2:%.*]], align 2
+// CHECK-LA64-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA64-NEXT:    [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT:    store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT:    [[TMP3:%.*]] = load bfloat, ptr [[TMP2]], align 2
+// CHECK-LA64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT:    [[TMP5:%.*]] = load bfloat, ptr [[TMP4]], align 2
+// CHECK-LA64-NEXT:    [[TMP6:%.*]] = insertvalue { bfloat, bfloat } poison, bfloat [[TMP3]], 0
+// CHECK-LA64-NEXT:    [[TMP7:%.*]] = insertvalue { bfloat, bfloat } [[TMP6]], bfloat [[TMP5]], 1
+// CHECK-LA64-NEXT:    ret { bfloat, bfloat } [[TMP7]]
+//
+// CHECK-LA32-LABEL: define dso_local { bfloat, bfloat } @h2
+// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT:  entry:
+// CHECK-LA32-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2:%.*]], align 2
+// CHECK-LA32-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT:    store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA32-NEXT:    [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA32-NEXT:    store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT:    [[TMP3:%.*]] = load bfloat, ptr [[TMP2]], align 2
+// CHECK-LA32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA32-NEXT:    [[TMP5:%.*]] = load bfloat, ptr [[TMP4]], align 2
+// CHECK-LA32-NEXT:    [[TMP6:%.*]] = insertvalue { bfloat, bfloat } poison, bfloat [[TMP3]], 0
+// CHECK-LA32-NEXT:    [[TMP7:%.*]] = insertvalue { bfloat, bfloat } [[TMP6]], bfloat [[TMP5]], 1
+// CHECK-LA32-NEXT:    ret { bfloat, bfloat } [[TMP7]]
+//
+struct bfloat2 h2(__bf16 a, __bf16 b) {
+  struct bfloat2 x;
+  x.a = a;
+  x.b = b;
+  return x;
+}
+
+struct bfloat3 {
+  __bf16 a;
+  __bf16 b;
+  __bf16 c;
+};
+
+// CHECK-LA64-LABEL: define dso_local i64 @h3
+// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT:  entry:
+// CHECK-LA64-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT3:%.*]], align 2
+// CHECK-LA64-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca i64, align 8
+// CHECK-LA64-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT:    store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA64-NEXT:    [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT:    store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA64-NEXT:    [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA64-NEXT:    store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i64 6, i1 false)
+// CHECK-LA64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[RETVAL_COERCE]], align 8
+// CHECK-LA64-NEXT:    ret i64 [[TMP3]]
+//
+// CHECK-LA32-LABEL: define dso_local [2 x i32] @h3
+// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT:  entry:
+// CHECK-LA32-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT3:%.*]], align 2
+// CHECK-LA32-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[RETVAL_COERCE:%.*]] = alloca [2 x i32], align 4
+// CHECK-LA32-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT:    store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT:    store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA32-NEXT:    [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA32-NEXT:    store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA32-NEXT:    [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT:    [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA32-NEXT:    store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i32 6, i1 false)
+// CHECK-LA32-NEXT:    [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL_COERCE]], align 4
+// CHECK-LA32-NEXT:    ret [2 x i32] [[TMP3]]
+//
+struct bfloat3 h3(__bf16 a, __bf16 b, __bf16 c) {
+  struct bfloat3 x;
+  x.a = a;
+  x.b = b;
+  x.c = c;
+  return x;
+}
+
+struct bfloat4 {
+  __bf16 a;
+  __bf16 b;
+  __bf16 c;
+  __bf16 d;
+};
+
+// CHECK-LA64-LABEL: define dso_local i64 @h4
+// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT:  entry:
+// CHECK-LA64-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT4:%.*]], align 2
+// CHECK-LA64-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT:    store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT:    store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT:    store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT:    store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA64-NEXT:    [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT:    store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA64-NEXT:    [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA64-NEXT:    store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA64-NEXT:    [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-LA64-NEXT:    [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 3
+// CHECK-LA64-NEXT:    store bfloat [[TMP3]], ptr [[D4]], align 2
+// CHECK-LA64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[RETVAL]], align 2
+// CHECK-LA64-NEXT:    ret i64 [[TMP4]]
+//
+// CHECK-LA32-LABEL: define dso_local [2 x i32] @h4
+// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT:  entry:
+// CHECK-LA32-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT4:%.*]], align 2
+// CHECK-LA32-NEXT:    [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT:    [[D_ADDR:%.*]...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/141564