[clang] 563cc3f - [Clang][CSKY] Add support about CSKYABIInfo

Mon May 30 19:55:54 PDT 2022

Author: Zi Xuan Wu (Zeson)
Date: 2022-05-31T10:53:30+08:00
New Revision: 563cc3fda9a2a35582d274e1d2d66687ecf2fc77

URL: https://github.com/llvm/llvm-project/commit/563cc3fda9a2a35582d274e1d2d66687ecf2fc77
DIFF: https://github.com/llvm/llvm-project/commit/563cc3fda9a2a35582d274e1d2d66687ecf2fc77.diff

LOG: [Clang][CSKY] Add support about CSKYABIInfo

According to the CSKY ABIv2 document (https://github.com/c-sky/csky-doc/blob/master/C-SKY_V2_CPU_Applications_Binary_Interface_Standards_Manual.pdf),
construct the ABIInfo to handle argument passing and return values for Clang data types. It also covers how to emit and expand the VAArg intrinsic.

Differential Revision: https://reviews.llvm.org/D126451

Added: 
    clang/test/CodeGen/CSKY/csky-abi.c
    clang/test/CodeGen/CSKY/csky-hard-abi.c
    clang/test/CodeGen/CSKY/csky-soft-abi.c

Modified: 
    clang/lib/CodeGen/TargetInfo.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index ecbb3505bb91c..4b7b301594d77 100644

--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -11325,6 +11325,165 @@ class VETargetCodeGenInfo : public TargetCodeGenInfo {
 };
 } // end anonymous namespace
 
+//===----------------------------------------------------------------------===//
+// CSKY ABI Implementation
+//===----------------------------------------------------------------------===//
+namespace {
+class CSKYABIInfo : public DefaultABIInfo {
+  static const int NumArgGPRs = 4;
+  static const int NumArgFPRs = 4;
+
+  static const unsigned XLen = 32;
+  unsigned FLen;
+
+public:
+  CSKYABIInfo(CodeGen::CodeGenTypes &CGT, unsigned FLen)
+      : DefaultABIInfo(CGT), FLen(FLen) {}
+
+  void computeInfo(CGFunctionInfo &FI) const override;
+  ABIArgInfo classifyArgumentType(QualType Ty, int &ArgGPRsLeft,
+                                  int &ArgFPRsLeft,
+                                  bool isReturnType = false) const;
+  ABIArgInfo classifyReturnType(QualType RetTy) const;
+
+  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                    QualType Ty) const override;
+};
+
+} // end anonymous namespace
+
+void CSKYABIInfo::computeInfo(CGFunctionInfo &FI) const {
+  QualType RetTy = FI.getReturnType();
+  if (!getCXXABI().classifyReturnType(FI))
+    FI.getReturnInfo() = classifyReturnType(RetTy);
+
+  bool IsRetIndirect = FI.getReturnInfo().getKind() == ABIArgInfo::Indirect;
+
+  // We must track the number of GPRs used in order to conform to the CSKY
+  // ABI, as integer scalars passed in registers should have signext/zeroext
+  // when promoted.
+  int ArgGPRsLeft = IsRetIndirect ? NumArgGPRs - 1 : NumArgGPRs;
+  int ArgFPRsLeft = FLen ? NumArgFPRs : 0;
+
+  for (auto &ArgInfo : FI.arguments()) {
+    ArgInfo.info = classifyArgumentType(ArgInfo.type, ArgGPRsLeft, ArgFPRsLeft);
+  }
+}
+
+Address CSKYABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                               QualType Ty) const {
+  CharUnits SlotSize = CharUnits::fromQuantity(XLen / 8);
+
+  // Empty records are ignored for parameter passing purposes.
+  if (isEmptyRecord(getContext(), Ty, true)) {
+    Address Addr = Address(CGF.Builder.CreateLoad(VAListAddr),
+                           getVAListElementType(CGF), SlotSize);
+    Addr = CGF.Builder.CreateElementBitCast(Addr, CGF.ConvertTypeForMem(Ty));
+    return Addr;
+  }
+
+  auto TInfo = getContext().getTypeInfoInChars(Ty);
+
+  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, false, TInfo, SlotSize,
+                          /*AllowHigherAlign=*/true);
+}
+
+ABIArgInfo CSKYABIInfo::classifyArgumentType(QualType Ty, int &ArgGPRsLeft,
+                                             int &ArgFPRsLeft,
+                                             bool isReturnType) const {
+  assert(ArgGPRsLeft <= NumArgGPRs && "Arg GPR tracking underflow");
+  Ty = useFirstFieldIfTransparentUnion(Ty);
+
+  // Structures with either a non-trivial destructor or a non-trivial
+  // copy constructor are always passed indirectly.
+  if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) {
+    if (ArgGPRsLeft)
+      ArgGPRsLeft -= 1;
+    return getNaturalAlignIndirect(Ty, /*ByVal=*/RAA ==
+                                           CGCXXABI::RAA_DirectInMemory);
+  }
+
+  // Ignore empty structs/unions.
+  if (isEmptyRecord(getContext(), Ty, true))
+    return ABIArgInfo::getIgnore();
+
+  if (!Ty->getAsUnionType())
+    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
+      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
+
+  uint64_t Size = getContext().getTypeSize(Ty);
+  // Pass floating point values via FPRs if possible.
+  if (Ty->isFloatingType() && !Ty->isComplexType() && FLen >= Size &&
+      ArgFPRsLeft) {
+    ArgFPRsLeft--;
+    return ABIArgInfo::getDirect();
+  }
+
+  // Complex types for the hard float ABI must be passed direct rather than
+  // using CoerceAndExpand.
+  if (Ty->isComplexType() && FLen && !isReturnType) {
+    QualType EltTy = Ty->castAs<ComplexType>()->getElementType();
+    if (getContext().getTypeSize(EltTy) <= FLen) {
+      ArgFPRsLeft -= 2;
+      return ABIArgInfo::getDirect();
+    }
+  }
+
+  if (!isAggregateTypeForABI(Ty)) {
+    // Treat an enum type as its underlying type.
+    if (const EnumType *EnumTy = Ty->getAs<EnumType>())
+      Ty = EnumTy->getDecl()->getIntegerType();
+
+    // All integral types are promoted to XLen width, unless passed on the
+    // stack.
+    if (Size < XLen && Ty->isIntegralOrEnumerationType())
+      return ABIArgInfo::getExtend(Ty);
+
+    if (const auto *EIT = Ty->getAs<BitIntType>()) {
+      if (EIT->getNumBits() < XLen)
+        return ABIArgInfo::getExtend(Ty);
+    }
+
+    return ABIArgInfo::getDirect();
+  }
+
+  // For an argument, the first 4*XLen bits of an aggregate are passed in
+  // registers, and the rest is passed on the stack.
+  // So we can coerce to integers directly and let the backend handle it
+  // correctly. For a return value, an aggregate of size <= 2*XLen is returned
+  // in registers. Otherwise, the aggregate is returned indirectly.
+  if (!isReturnType || (isReturnType && Size <= 2 * XLen)) {
+    if (Size <= XLen) {
+      return ABIArgInfo::getDirect(
+          llvm::IntegerType::get(getVMContext(), XLen));
+    } else {
+      return ABIArgInfo::getDirect(llvm::ArrayType::get(
+          llvm::IntegerType::get(getVMContext(), XLen), (Size + 31) / XLen));
+    }
+  }
+  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
+}
+
+ABIArgInfo CSKYABIInfo::classifyReturnType(QualType RetTy) const {
+  if (RetTy->isVoidType())
+    return ABIArgInfo::getIgnore();
+
+  int ArgGPRsLeft = 2;
+  int ArgFPRsLeft = FLen ? 1 : 0;
+
+  // The rules for return and argument types are the same, so defer to
+  // classifyArgumentType.
+  return classifyArgumentType(RetTy, ArgGPRsLeft, ArgFPRsLeft, true);
+}
+
+namespace {
+class CSKYTargetCodeGenInfo : public TargetCodeGenInfo {
+public:
+  CSKYTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, unsigned FLen)
+      : TargetCodeGenInfo(std::make_unique<CSKYABIInfo>(CGT, FLen)) {}
+};
+} // end anonymous namespace
+
 //===----------------------------------------------------------------------===//
 // Driver code
 //===----------------------------------------------------------------------===//
@@ -11545,6 +11704,14 @@ const TargetCodeGenInfo &CodeGenModule::getTargetCodeGenInfo() {
     return SetCGInfo(new SPIRVTargetCodeGenInfo(Types));
   case llvm::Triple::ve:
     return SetCGInfo(new VETargetCodeGenInfo(Types));
+  case llvm::Triple::csky: {
+    bool IsSoftFloat = !getTarget().hasFeature("hard-float-abi");
+    bool hasFP64 = getTarget().hasFeature("fpuv2_df") ||
+                   getTarget().hasFeature("fpuv3_df");
+    return SetCGInfo(new CSKYTargetCodeGenInfo(Types, IsSoftFloat ? 0
+                                                      : hasFP64   ? 64
+                                                                  : 32));
+  }
   }
 }
 

diff  --git a/clang/test/CodeGen/CSKY/csky-abi.c b/clang/test/CodeGen/CSKY/csky-abi.c
new file mode 100644
index 0000000000000..b32d637d17154
--- /dev/null
+++ b/clang/test/CodeGen/CSKY/csky-abi.c
@@ -0,0 +1,347 @@
+// RUN: %clang_cc1 -no-opaque-pointers -triple csky -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -no-opaque-pointers -triple csky -target-feature +fpuv2_df -target-feature +fpuv2_sf \
+// RUN:   -target-feature +hard-float -target-feature +hard-float-abi -emit-llvm %s -o -   | FileCheck %s
+
+// This file contains test cases that will have the same output for the hard-float
+// and soft-float ABIs.
+
+#include <stddef.h>
+#include <stdint.h>
+
+// CHECK-LABEL: define{{.*}} void @f_void()
+void f_void(void) {}
+
+// Scalar arguments and return values smaller than the word size are extended
+// according to the sign of their type, up to 32 bits
+
+// CHECK-LABEL: define{{.*}} zeroext i1 @f_scalar_0(i1 noundef zeroext %x)
+_Bool f_scalar_0(_Bool x) { return x; }
+
+// CHECK-LABEL: define{{.*}} signext i8 @f_scalar_1(i8 noundef signext %x)
+int8_t f_scalar_1(int8_t x) { return x; }
+
+// CHECK-LABEL: define{{.*}} zeroext i8 @f_scalar_2(i8 noundef zeroext %x)
+uint8_t f_scalar_2(uint8_t x) { return x; }
+
+// CHECK-LABEL: define{{.*}} i32 @f_scalar_3(i32 noundef %x)
+int32_t f_scalar_3(int32_t x) { return x; }
+
+// CHECK-LABEL: define{{.*}} i64 @f_scalar_4(i64 noundef %x)
+int64_t f_scalar_4(int64_t x) { return x; }
+
+// CHECK-LABEL: define{{.*}} float @f_fp_scalar_1(float noundef %x)
+float f_fp_scalar_1(float x) { return x; }
+
+// CHECK-LABEL: define{{.*}} double @f_fp_scalar_2(double noundef %x)
+double f_fp_scalar_2(double x) { return x; }
+
+// CHECK-LABEL: define{{.*}} double @f_fp_scalar_3(double noundef %x)
+long double f_fp_scalar_3(long double x) { return x; }
+
+// Empty structs or unions are ignored.
+
+struct empty_s {};
+
+// CHECK-LABEL: define{{.*}} void @f_agg_empty_struct()
+struct empty_s f_agg_empty_struct(struct empty_s x) {
+  return x;
+}
+
+union empty_u {};
+
+// CHECK-LABEL: define{{.*}} void @f_agg_empty_union()
+union empty_u f_agg_empty_union(union empty_u x) {
+  return x;
+}
+
+// Aggregates <= 4*xlen may be passed in registers, so will be coerced to
+// integer arguments. The rules for return are <= 2*xlen.
+
+struct tiny {
+  uint8_t a, b, c, d;
+};
+
+// CHECK-LABEL: define{{.*}} void @f_agg_tiny(i32 %x.coerce)
+void f_agg_tiny(struct tiny x) {
+  x.a += x.b;
+  x.c += x.d;
+}
+
+// CHECK-LABEL: define{{.*}} i32 @f_agg_tiny_ret()
+struct tiny f_agg_tiny_ret(void) {
+  return (struct tiny){1, 2, 3, 4};
+}
+
+struct small {
+  int32_t a, *b;
+};
+
+// CHECK-LABEL: define{{.*}} void @f_agg_small([2 x i32] %x.coerce)
+void f_agg_small(struct small x) {
+  x.a += *x.b;
+  x.b = &x.a;
+}
+
+// CHECK-LABEL: define{{.*}} [2 x i32] @f_agg_small_ret()
+struct small f_agg_small_ret(void) {
+  return (struct small){1, 0};
+}
+
+struct small_aligned {
+  int64_t a;
+};
+
+// CHECK-LABEL: define{{.*}} void @f_agg_small_aligned(i64 %x.coerce)
+void f_agg_small_aligned(struct small_aligned x) {
+  x.a += x.a;
+}
+
+// CHECK-LABEL: define{{.*}} i64 @f_agg_small_aligned_ret(i64 %x.coerce)
+struct small_aligned f_agg_small_aligned_ret(struct small_aligned x) {
+  return (struct small_aligned){10};
+}
+
+// For an argument, the first 4*XLen bits of an aggregate are passed in
+// registers, and the rest is passed on the stack.
+// So we can coerce to integers directly and let the backend handle it
+// correctly. For a return value, an aggregate of size <= 2*XLen is returned
+// in registers. Otherwise, the aggregate is returned indirectly.
+struct large {
+  int32_t a, b, c, d;
+};
+
+// CHECK-LABEL: define{{.*}} void @f_agg_large([4 x i32] %x.coerce)
+void f_agg_large(struct large x) {
+  x.a = x.b + x.c + x.d;
+}
+
+// The address where the struct should be written to will be the first
+// argument
+// CHECK-LABEL: define{{.*}} void @f_agg_large_ret(%struct.large* noalias sret(%struct.large) align 4 %agg.result, i32 noundef %i, i8 noundef signext %j)
+struct large f_agg_large_ret(int32_t i, int8_t j) {
+  return (struct large){1, 2, 3, 4};
+}
+
+typedef unsigned char v16i8 __attribute__((vector_size(16)));
+
+// CHECK-LABEL: define{{.*}} void @f_vec_large_v16i8(<16 x i8> noundef %x)
+void f_vec_large_v16i8(v16i8 x) {
+  x[0] = x[7];
+}
+
+// CHECK-LABEL: define{{.*}} <16 x i8> @f_vec_large_v16i8_ret()
+v16i8 f_vec_large_v16i8_ret(void) {
+  return (v16i8){1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+// CHECK-LABEL: define{{.*}} i32 @f_scalar_stack_1(i32 %a.coerce, [2 x i32] %b.coerce, i64 %c.coerce, [4 x i32] %d.coerce, i8 noundef zeroext %e, i8 noundef signext %f, i8 noundef zeroext %g, i8 noundef signext %h)
+int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c,
+                     struct large d, uint8_t e, int8_t f, uint8_t g, int8_t h) {
+  return g + h;
+}
+
+// Ensure that scalars passed on the stack are still determined correctly in
+// the presence of large return values that consume a register due to the need
+// to pass a pointer.
+
+// CHECK-LABEL: define{{.*}} void @f_scalar_stack_2(%struct.large* noalias sret(%struct.large) align 4 %agg.result, i32 noundef %a, i64 noundef %b, i64 noundef %c, double noundef %d, i8 noundef zeroext %e, i8 noundef signext %f, i8 noundef zeroext %g)
+struct large f_scalar_stack_2(int32_t a, int64_t b, int64_t c, long double d,
+                              uint8_t e, int8_t f, uint8_t g) {
+  return (struct large){a, e, f, g};
+}
+
+// CHECK-LABEL: define{{.*}} double @f_scalar_stack_4(i32 noundef %a, i64 noundef %b, i64 noundef %c, double noundef %d, i8 noundef zeroext %e, i8 noundef signext %f, i8 noundef zeroext %g)
+long double f_scalar_stack_4(int32_t a, int64_t b, int64_t c, long double d,
+                             uint8_t e, int8_t f, uint8_t g) {
+  return d;
+}
+
+// Aggregates should be coerced to an integer array.
+
+// CHECK-LABEL: define{{.*}} void @f_scalar_stack_5(double noundef %a, i64 noundef %b, double noundef %c, i64 noundef %d, i32 noundef %e, i64 noundef %f, float noundef %g, double noundef %h, double noundef %i)
+void f_scalar_stack_5(double a, int64_t b, double c, int64_t d, int e,
+                      int64_t f, float g, double h, long double i) {}
+
+// CHECK-LABEL: define{{.*}} void @f_agg_stack(double noundef %a, i64 noundef %b, double noundef %c, i64 noundef %d, i32 %e.coerce, [2 x i32] %f.coerce, i64 %g.coerce, [4 x i32] %h.coerce)
+void f_agg_stack(double a, int64_t b, double c, int64_t d, struct tiny e,
+                 struct small f, struct small_aligned g, struct large h) {}
+
+// Ensure that ABI lowering happens as expected for vararg calls. For CSKY
+// with the base integer calling convention there will be no observable
+// differences in the lowered IR for a call with varargs vs without.
+
+int f_va_callee(int, ...);
+
+// CHECK-LABEL: define{{.*}} void @f_va_caller()
+// CHECK: call i32 (i32, ...) @f_va_callee(i32 noundef 1, i32 noundef 2, i64 noundef 3, double noundef 4.000000e+00, double noundef 5.000000e+00, i32 {{%.*}}, [2 x i32] {{%.*}}, i64 {{%.*}}, [4 x i32] {{%.*}})
+void f_va_caller(void) {
+  f_va_callee(1, 2, 3LL, 4.0f, 5.0, (struct tiny){6, 7, 8, 9},
+              (struct small){10, NULL}, (struct small_aligned){11},
+              (struct large){12, 13, 14, 15});
+}
+
+// CHECK-LABEL: define{{.*}} i32 @f_va_1(i8* noundef %fmt, ...) {{.*}} {
+// CHECK:   [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK:   [[VA:%.*]] = alloca i8*, align 4
+// CHECK:   [[V:%.*]] = alloca i32, align 4
+// CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 4
+// CHECK:   [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_start(i8* [[VA1]])
+// CHECK:   [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK:   [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 4
+// CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK:   [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK:   [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK:   store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK:   [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_end(i8* [[VA2]])
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[V]], align 4
+// CHECK:   ret i32 [[TMP2]]
+// CHECK: }
+int f_va_1(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  int v = __builtin_va_arg(va, int);
+  __builtin_va_end(va);
+
+  return v;
+}
+
+// CHECK-LABEL: @f_va_2(
+// CHECK:         [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[VA:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca double, align 4
+// CHECK-NEXT:    store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 4
+// CHECK-NEXT:    [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT:    [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR]] to double*
+// CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[TMP3]], align 4
+// CHECK-NEXT:    store double [[TMP4]], double* [[V]], align 4
+// CHECK-NEXT:    [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_end(i8* [[VA2]])
+// CHECK-NEXT:    [[TMP5:%.*]] = load double, double* [[V]], align 4
+// CHECK-NEXT:    ret double [[TMP5]]
+double f_va_2(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  double v = __builtin_va_arg(va, double);
+  __builtin_va_end(va);
+
+  return v;
+}
+
+// CHECK-LABEL: @f_va_3(
+// CHECK:         [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[VA:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca double, align 4
+// CHECK-NEXT:    [[W:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[X:%.*]] = alloca double, align 4
+// CHECK-NEXT:    store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 4
+// CHECK-NEXT:    [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT:    [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR]] to double*
+// CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[TMP3]], align 4
+// CHECK-NEXT:    store double [[TMP4]], double* [[V]], align 4
+// CHECK-NEXT:    [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i32 4
+// CHECK-NEXT:    store i8* [[ARGP_NEXT3]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR2]] to i32*
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-NEXT:    store i32 [[TMP6]], i32* [[W]], align 4
+// CHECK-NEXT:    [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4]], i32 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT5]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[ARGP_CUR4]] to double*
+// CHECK-NEXT:    [[TMP11:%.*]] = load double, double* [[TMP10]], align 4
+// CHECK-NEXT:    store double [[TMP11]], double* [[X]], align 4
+// CHECK-NEXT:    [[VA6:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_end(i8* [[VA6]])
+// CHECK-NEXT:    [[TMP12:%.*]] = load double, double* [[V]], align 4
+// CHECK-NEXT:    [[TMP13:%.*]] = load double, double* [[X]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP12]], [[TMP13]]
+// CHECK-NEXT:    ret double [[ADD]]
+double f_va_3(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  double v = __builtin_va_arg(va, double);
+  int w = __builtin_va_arg(va, int);
+  double x = __builtin_va_arg(va, double);
+  __builtin_va_end(va);
+
+  return v + x;
+}
+
+// CHECK-LABEL: define{{.*}} i32 @f_va_4(i8* noundef %fmt, ...) {{.*}} {
+// CHECK:         [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[VA:%.*]] = alloca i8*, align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[LD:%.*]] = alloca double, align 4
+// CHECK-NEXT:    [[TS:%.*]] = alloca [[STRUCT_TINY:%.*]], align 1
+// CHECK-NEXT:    [[SS:%.*]] = alloca [[STRUCT_SMALL:%.*]], align 4
+// CHECK-NEXT:    [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4
+// CHECK-NEXT:    [[RET:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 4
+// CHECK-NEXT:    [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT:    [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 4
+// CHECK-NEXT:    store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK-NEXT:    [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i32 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT3]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ARGP_CUR2]] to double*
+// CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[TMP2]], align 4
+// CHECK-NEXT:    store double [[TMP4]], double* [[LD]], align 4
+// CHECK-NEXT:    [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4]], i32 4
+// CHECK-NEXT:    store i8* [[ARGP_NEXT5]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR4]] to %struct.tiny*
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.tiny* [[TS]] to i8*
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.tiny* [[TMP5]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[TMP6]], i8* align 4 [[TMP7]], i32 4, i1 false)
+// CHECK-NEXT:    [[ARGP_CUR6:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT7:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR6]], i32 8
+// CHECK-NEXT:    store i8* [[ARGP_NEXT7]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[ARGP_CUR6]] to %struct.small*
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.small* [[SS]] to i8*
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast %struct.small* [[TMP8]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP9]], i8* align 4 [[TMP10]], i32 8, i1 false)
+// CHECK-NEXT:    [[ARGP_CUR8:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT:    [[ARGP_NEXT9:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR8]], i32 16
+// CHECK-NEXT:    store i8* [[ARGP_NEXT9]], i8** [[VA]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[ARGP_CUR8]] to %struct.large*
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast %struct.large* [[LS]] to i8*
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.large* [[TMP11]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 16, i1 false)
+// CHECK-NEXT:    [[VA10:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT:    call void @llvm.va_end(i8* [[VA10]])
+int f_va_4(char *fmt, ...) {
+  __builtin_va_list va;
+
+  __builtin_va_start(va, fmt);
+  int v = __builtin_va_arg(va, int);
+  long double ld = __builtin_va_arg(va, long double);
+  struct tiny ts = __builtin_va_arg(va, struct tiny);
+  struct small ss = __builtin_va_arg(va, struct small);
+  struct large ls = __builtin_va_arg(va, struct large);
+  __builtin_va_end(va);
+
+  int ret = (int)((long double)v + ld);
+  ret = ret + ts.a + ts.b + ts.c + ts.d;
+  ret = ret + ss.a + (int)ss.b;
+  ret = ret + ls.a + ls.b + ls.c + ls.d;
+
+  return ret;
+}

diff  --git a/clang/test/CodeGen/CSKY/csky-hard-abi.c b/clang/test/CodeGen/CSKY/csky-hard-abi.c
new file mode 100644
index 0000000000000..d5ed00e4a0755
--- /dev/null
+++ b/clang/test/CodeGen/CSKY/csky-hard-abi.c
@@ -0,0 +1,394 @@
+// RUN: %clang_cc1 -no-opaque-pointers -triple csky -target-feature +fpuv2_sf -target-feature +fpuv2_df -target-feature +hard-float-abi -target-feature +hard-float -emit-llvm %s -o - | FileCheck %s
+
+#include <stdint.h>
+
+// Verify that the tracking of used GPRs and FPRs works correctly by checking
+// that small integers are sign/zero extended when passed in registers.
+
+// Doubles are passed in FPRs, so argument 'i' will be passed zero-extended
+// because it will be passed in a GPR.
+
+// CHECK: define{{.*}} void @f_fpr_tracking(double noundef %a, double noundef %b, double noundef %c, double noundef %d, i8 noundef zeroext %i)
+void f_fpr_tracking(double a, double b, double c, double d, uint8_t i) {}
+
+// A struct containing just one floating-point real is passed as though it
+// were a standalone floating-point real.
+struct double_s {
+  double f;
+};
+
+// CHECK: define{{.*}} void @f_double_s_arg(double %a.coerce)
+void f_double_s_arg(struct double_s a) {}
+
+// CHECK: define{{.*}} double @f_ret_double_s()
+struct double_s f_ret_double_s(void) {
+  return (struct double_s){1.0};
+}
+
+// A struct containing a double and any number of zero-width bitfields is
+// passed as though it were a standalone floating-point real.
+
+struct zbf_double_s {
+  int : 0;
+  double f;
+};
+struct zbf_double_zbf_s {
+  int : 0;
+  double f;
+  int : 0;
+};
+
+// CHECK: define{{.*}} void @f_zbf_double_s_arg(double %a.coerce)
+void f_zbf_double_s_arg(struct zbf_double_s a) {}
+
+// CHECK: define{{.*}} double @f_ret_zbf_double_s()
+struct zbf_double_s f_ret_zbf_double_s(void) {
+  return (struct zbf_double_s){1.0};
+}
+
+// CHECK: define{{.*}} void @f_zbf_double_zbf_s_arg(double %a.coerce)
+void f_zbf_double_zbf_s_arg(struct zbf_double_zbf_s a) {}
+
+// CHECK: define{{.*}} double @f_ret_zbf_double_zbf_s()
+struct zbf_double_zbf_s f_ret_zbf_double_zbf_s(void) {
+  return (struct zbf_double_zbf_s){1.0};
+}
+
+// For an argument, the first 4*XLen bits of an aggregate are passed in
+// registers, and the rest is passed on the stack.
+// So we can coerce to integers directly and let the backend handle it
+// correctly. For a return value, an aggregate of size <= 2*XLen is returned
+// in registers. Otherwise, the aggregate is returned indirectly.
+
+struct double_double_s {
+  double f;
+  double g;
+};
+struct double_float_s {
+  double f;
+  float g;
+};
+
+// CHECK: define{{.*}} void @f_double_double_s_arg([4 x i32] %a.coerce)
+void f_double_double_s_arg(struct double_double_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_double_s(%struct.double_double_s* noalias sret(%struct.double_double_s) align 4 %agg.result)
+struct double_double_s f_ret_double_double_s(void) {
+  return (struct double_double_s){1.0, 2.0};
+}
+
+// CHECK: define{{.*}} void @f_double_float_s_arg([3 x i32] %a.coerce)
+void f_double_float_s_arg(struct double_float_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_float_s(%struct.double_float_s* noalias sret(%struct.double_float_s) align 4 %agg.result)
+struct double_float_s f_ret_double_float_s(void) {
+  return (struct double_float_s){1.0, 2.0};
+}
+
+// CHECK: define{{.*}} void @f_double_double_s_arg_insufficient_fprs(float noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, double noundef %f, double noundef %g, double noundef %i, [4 x i32] %h.coerce)
+void f_double_double_s_arg_insufficient_fprs(float a, double b, double c, double d,
+                                             double e, double f, double g, double i, struct double_double_s h) {}
+
+struct double_int8_s {
+  double f;
+  int8_t i;
+};
+struct double_uint8_s {
+  double f;
+  uint8_t i;
+};
+struct double_int32_s {
+  double f;
+  int32_t i;
+};
+struct double_int64_s {
+  double f;
+  int64_t i;
+};
+struct double_int64bf_s {
+  double f;
+  int64_t i : 32;
+};
+struct double_int8_zbf_s {
+  double f;
+  int8_t i;
+  int : 0;
+};
+
+// CHECK: define{{.*}}  @f_double_int8_s_arg([3 x i32] %a.coerce)
+void f_double_int8_s_arg(struct double_int8_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int8_s(%struct.double_int8_s* noalias sret(%struct.double_int8_s) align 4 %agg.result)
+struct double_int8_s f_ret_double_int8_s(void) {
+  return (struct double_int8_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_uint8_s_arg([3 x i32] %a.coerce)
+void f_double_uint8_s_arg(struct double_uint8_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_uint8_s(%struct.double_uint8_s* noalias sret(%struct.double_uint8_s) align 4 %agg.result)
+struct double_uint8_s f_ret_double_uint8_s(void) {
+  return (struct double_uint8_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int32_s_arg([3 x i32] %a.coerce)
+void f_double_int32_s_arg(struct double_int32_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int32_s(%struct.double_int32_s* noalias sret(%struct.double_int32_s) align 4 %agg.result)
+struct double_int32_s f_ret_double_int32_s(void) {
+  return (struct double_int32_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int64_s_arg([4 x i32] %a.coerce)
+void f_double_int64_s_arg(struct double_int64_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int64_s(%struct.double_int64_s* noalias sret(%struct.double_int64_s) align 4 %agg.result)
+struct double_int64_s f_ret_double_int64_s(void) {
+  return (struct double_int64_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int64bf_s_arg([3 x i32] %a.coerce)
+void f_double_int64bf_s_arg(struct double_int64bf_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int64bf_s(%struct.double_int64bf_s* noalias sret(%struct.double_int64bf_s) align 4 %agg.result)
+struct double_int64bf_s f_ret_double_int64bf_s(void) {
+  return (struct double_int64bf_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int8_zbf_s([3 x i32] %a.coerce)
+void f_double_int8_zbf_s(struct double_int8_zbf_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int8_zbf_s(%struct.double_int8_zbf_s* noalias sret(%struct.double_int8_zbf_s) align 4 %agg.result)
+struct double_int8_zbf_s f_ret_double_int8_zbf_s(void) {
+  return (struct double_int8_zbf_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int8_s_arg_insufficient_gprs(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e, i32 noundef %f, i32 noundef %g, i32 noundef %h, [3 x i32] %i.coerce)
+void f_double_int8_s_arg_insufficient_gprs(int a, int b, int c, int d, int e,
+                                           int f, int g, int h, struct double_int8_s i) {}
+
+// CHECK: define{{.*}} void @f_struct_double_int8_insufficient_fprs(float noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, double noundef %f, double noundef %g, double noundef %h, [3 x i32] %i.coerce)
+void f_struct_double_int8_insufficient_fprs(float a, double b, double c, double d,
+                                            double e, double f, double g, double h, struct double_int8_s i) {}
+
+// Complex floating-point values are special-cased when passed as arguments,
+// and are not handled the same way as structs containing a single complex.
+// A complex floating-point value should be passed in two consecutive FPRs,
+// but it is returned in the same way as a struct.
+
+// CHECK: define{{.*}} void @f_doublecomplex(double noundef %a.coerce0, double noundef %a.coerce1)
+void f_doublecomplex(double __complex__ a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublecomplex({ double, double }* noalias sret({ double, double }) align 4 %agg.result)
+double __complex__ f_ret_doublecomplex(void) {
+  return 1.0;
+}
+
+struct doublecomplex_s {
+  double __complex__ c;
+};
+
+// CHECK: define{{.*}} void @f_doublecomplex_s_arg([4 x i32] %a.coerce)
+void f_doublecomplex_s_arg(struct doublecomplex_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublecomplex_s(%struct.doublecomplex_s* noalias sret(%struct.doublecomplex_s) align 4 %agg.result)
+struct doublecomplex_s f_ret_doublecomplex_s(void) {
+  return (struct doublecomplex_s){1.0};
+}
+
+// Test single or two-element structs that need flattening, e.g. those
+// containing nested structs, doubles in small arrays, zero-length structs, etc.
+
+struct doublearr1_s {
+  double a[1];
+};
+
+// CHECK: define{{.*}} void @f_doublearr1_s_arg(double %a.coerce)
+void f_doublearr1_s_arg(struct doublearr1_s a) {}
+
+// CHECK: define{{.*}} double @f_ret_doublearr1_s()
+struct doublearr1_s f_ret_doublearr1_s(void) {
+  return (struct doublearr1_s){{1.0}};
+}
+
+struct doublearr2_s {
+  double a[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_s_arg(struct doublearr2_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_s(%struct.doublearr2_s* noalias sret(%struct.doublearr2_s) align 4 %agg.result)
+struct doublearr2_s f_ret_doublearr2_s(void) {
+  return (struct doublearr2_s){{1.0, 2.0}};
+}
+
+struct doublearr2_tricky1_s {
+  struct {
+    double f[1];
+  } g[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_tricky1_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_tricky1_s_arg(struct doublearr2_tricky1_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_tricky1_s(%struct.doublearr2_tricky1_s* noalias sret(%struct.doublearr2_tricky1_s) align 4 %agg.result)
+struct doublearr2_tricky1_s f_ret_doublearr2_tricky1_s(void) {
+  return (struct doublearr2_tricky1_s){{{{1.0}}, {{2.0}}}};
+}
+
+struct doublearr2_tricky2_s {
+  struct {};
+  struct {
+    double f[1];
+  } g[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_tricky2_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_tricky2_s_arg(struct doublearr2_tricky2_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_tricky2_s(%struct.doublearr2_tricky2_s* noalias sret(%struct.doublearr2_tricky2_s) align 4 %agg.result)
+struct doublearr2_tricky2_s f_ret_doublearr2_tricky2_s(void) {
+  return (struct doublearr2_tricky2_s){{}, {{{1.0}}, {{2.0}}}};
+}
+
+struct doublearr2_tricky3_s {
+  union {};
+  struct {
+    double f[1];
+  } g[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_tricky3_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_tricky3_s_arg(struct doublearr2_tricky3_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_tricky3_s(%struct.doublearr2_tricky3_s* noalias sret(%struct.doublearr2_tricky3_s) align 4 %agg.result)
+struct doublearr2_tricky3_s f_ret_doublearr2_tricky3_s(void) {
+  return (struct doublearr2_tricky3_s){{}, {{{1.0}}, {{2.0}}}};
+}
+
+struct doublearr2_tricky4_s {
+  union {};
+  struct {
+    struct {};
+    double f[1];
+  } g[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_tricky4_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_tricky4_s_arg(struct doublearr2_tricky4_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_tricky4_s(%struct.doublearr2_tricky4_s* noalias sret(%struct.doublearr2_tricky4_s) align 4 %agg.result)
+struct doublearr2_tricky4_s f_ret_doublearr2_tricky4_s(void) {
+  return (struct doublearr2_tricky4_s){{}, {{{}, {1.0}}, {{}, {2.0}}}};
+}
+
+struct int_double_int_s {
+  int a;
+  double b;
+  int c;
+};
+
+// CHECK: define{{.*}} void @f_int_double_int_s_arg([4 x i32] %a.coerce)
+void f_int_double_int_s_arg(struct int_double_int_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_int_double_int_s(%struct.int_double_int_s* noalias sret(%struct.int_double_int_s) align 4 %agg.result)
+struct int_double_int_s f_ret_int_double_int_s(void) {
+  return (struct int_double_int_s){1, 2.0, 3};
+}
+
+struct int64_double_s {
+  int64_t a;
+  double b;
+};
+
+// CHECK: define{{.*}} void @f_int64_double_s_arg([4 x i32] %a.coerce)
+void f_int64_double_s_arg(struct int64_double_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_int64_double_s(%struct.int64_double_s* noalias sret(%struct.int64_double_s) align 4 %agg.result)
+struct int64_double_s f_ret_int64_double_s(void) {
+  return (struct int64_double_s){1, 2.0};
+}
+
+struct char_char_double_s {
+  char a;
+  char b;
+  double c;
+};
+
+// CHECK-LABEL: define{{.*}} void @f_char_char_double_s_arg([3 x i32] %a.coerce)
+void f_char_char_double_s_arg(struct char_char_double_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_char_char_double_s(%struct.char_char_double_s* noalias sret(%struct.char_char_double_s) align 4 %agg.result)
+struct char_char_double_s f_ret_char_char_double_s(void) {
+  return (struct char_char_double_s){1, 2, 3.0};
+}
+
+// A union containing just one floating-point real cannot be passed as though
+// it were a standalone floating-point real.
+union double_u {
+  double a;
+};
+
+// CHECK: define{{.*}} void @f_double_u_arg([2 x i32] %a.coerce)
+void f_double_u_arg(union double_u a) {}
+
+// CHECK: define{{.*}} [2 x i32] @f_ret_double_u()
+union double_u f_ret_double_u(void) {
+  return (union double_u){1.0};
+}
+
+// CHECK: define{{.*}} void @f_ret_double_int32_s_double_int32_s_just_sufficient_gprs(%struct.double_int32_s* noalias sret(%struct.double_int32_s) align 4 %agg.result, i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e, i32 noundef %f, i32 noundef %g, [3 x i32] %h.coerce)
+struct double_int32_s f_ret_double_int32_s_double_int32_s_just_sufficient_gprs(
+    int a, int b, int c, int d, int e, int f, int g, struct double_int32_s h) {
+  return (struct double_int32_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_ret_double_double_s_double_int32_s_just_sufficient_gprs(%struct.double_double_s* noalias sret(%struct.double_double_s) align 4 %agg.result, i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e, i32 noundef %f, i32 noundef %g, [3 x i32] %h.coerce)
+struct double_double_s f_ret_double_double_s_double_int32_s_just_sufficient_gprs(
+    int a, int b, int c, int d, int e, int f, int g, struct double_int32_s h) {
+  return (struct double_double_s){1.0, 2.0};
+}
+
+// CHECK: define{{.*}} void @f_ret_doublecomplex_double_int32_s_just_sufficient_gprs({ double, double }* noalias sret({ double, double }) align 4 %agg.result, i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e, i32 noundef %f, i32 noundef %g, [3 x i32] %h.coerce)
+double __complex__ f_ret_doublecomplex_double_int32_s_just_sufficient_gprs(
+    int a, int b, int c, int d, int e, int f, int g, struct double_int32_s h) {
+  return 1.0;
+}
+
+struct tiny {
+  uint8_t a, b, c, d;
+};
+
+struct small {
+  int32_t a, *b;
+};
+
+struct small_aligned {
+  int64_t a;
+};
+
+struct large {
+  int32_t a, b, c, d;
+};
+
+// Ensure that scalars passed on the stack are still determined correctly in
+// the presence of large return values that consume a register due to the need
+// to pass a pointer.
+
+// CHECK-LABEL: define{{.*}} void @f_scalar_stack_2(%struct.large* noalias sret(%struct.large) align 4 %agg.result, float noundef %a, i64 noundef %b, double noundef %c, double noundef %d, i8 noundef zeroext %e, i8 noundef signext %f, i8 noundef zeroext %g)
+struct large f_scalar_stack_2(float a, int64_t b, double c, long double d,
+                              uint8_t e, int8_t f, uint8_t g) {
+  return (struct large){a, e, f, g};
+}
+
+// Aggregates and >=XLen scalars passed on the stack should be lowered just as
+// they would be if passed via registers.
+
+// CHECK-LABEL: define{{.*}} void @f_scalar_stack_3(double noundef %a, i64 noundef %b, double noundef %c, i64 noundef %d, i32 noundef %e, i64 noundef %f, float noundef %g, double noundef %h, double noundef %i)
+void f_scalar_stack_3(double a, int64_t b, double c, int64_t d, int e,
+                      int64_t f, float g, double h, long double i) {}
+
+// CHECK-LABEL: define{{.*}} void @f_agg_stack(double noundef %a, i64 noundef %b, double noundef %c, i64 noundef %d, i32 %e.coerce, [2 x i32] %f.coerce, i64 %g.coerce, [4 x i32] %h.coerce)
+void f_agg_stack(double a, int64_t b, double c, int64_t d, struct tiny e,
+                 struct small f, struct small_aligned g, struct large h) {}

diff  --git a/clang/test/CodeGen/CSKY/csky-soft-abi.c b/clang/test/CodeGen/CSKY/csky-soft-abi.c
new file mode 100644
index 0000000000000..b50feee32aa5a
--- /dev/null
+++ b/clang/test/CodeGen/CSKY/csky-soft-abi.c
@@ -0,0 +1,395 @@
+// RUN: %clang_cc1 -no-opaque-pointers -triple csky -target-feature +fpuv2_sf -target-feature +fpuv2_df -target-feature +hard-float -emit-llvm %s -o - | FileCheck %s
+
+#include <stdint.h>
+
+// Verify that the tracking of used GPRs and FPRs works correctly by checking
+// that small integers are sign/zero extended when passed in registers.
+
+// Doubles are passed in FPRs, so argument 'i' will be passed zero-extended
+// because it will be passed in a GPR.
+
+// CHECK: define{{.*}} void @f_fpr_tracking(double noundef %a, double noundef %b, double noundef %c, double noundef %d, i8 noundef zeroext %i)
+void f_fpr_tracking(double a, double b, double c, double d, uint8_t i) {}
+
+// A struct containing just one floating-point real is passed as though it
+// were a standalone floating-point real.
+struct double_s {
+  double f;
+};
+
+// CHECK: define{{.*}} void @f_double_s_arg(double %a.coerce)
+void f_double_s_arg(struct double_s a) {}
+
+// CHECK: define{{.*}} double @f_ret_double_s()
+struct double_s f_ret_double_s(void) {
+  return (struct double_s){1.0};
+}
+
+// A struct containing a double and any number of zero-width bitfields is
+// passed as though it were a standalone floating-point real.
+
+struct zbf_double_s {
+  int : 0;
+  double f;
+};
+struct zbf_double_zbf_s {
+  int : 0;
+  double f;
+  int : 0;
+};
+
+// CHECK: define{{.*}} void @f_zbf_double_s_arg(double %a.coerce)
+void f_zbf_double_s_arg(struct zbf_double_s a) {}
+
+// CHECK: define{{.*}} double @f_ret_zbf_double_s()
+struct zbf_double_s f_ret_zbf_double_s(void) {
+  return (struct zbf_double_s){1.0};
+}
+
+// CHECK: define{{.*}} void @f_zbf_double_zbf_s_arg(double %a.coerce)
+void f_zbf_double_zbf_s_arg(struct zbf_double_zbf_s a) {}
+
+// CHECK: define{{.*}} double @f_ret_zbf_double_zbf_s()
+struct zbf_double_zbf_s f_ret_zbf_double_zbf_s(void) {
+  return (struct zbf_double_zbf_s){1.0};
+}
+
+// For arguments, the first 4*XLen bits of an aggregate are passed in
+// registers, and the rest is passed on the stack.
+// So we can coerce to integers directly and let the backend handle it.
+// For return values, an aggregate of at most 2*XLen bits is returned in
+// registers; otherwise, it is returned indirectly.
+
+struct double_double_s {
+  double f;
+  double g;
+};
+struct double_float_s {
+  double f;
+  float g;
+};
+
+// CHECK: define{{.*}} void @f_double_double_s_arg([4 x i32] %a.coerce)
+void f_double_double_s_arg(struct double_double_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_double_s(%struct.double_double_s* noalias sret(%struct.double_double_s) align 4 %agg.result)
+struct double_double_s f_ret_double_double_s(void) {
+  return (struct double_double_s){1.0, 2.0};
+}
+
+// CHECK: define{{.*}} void @f_double_float_s_arg([3 x i32] %a.coerce)
+void f_double_float_s_arg(struct double_float_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_float_s(%struct.double_float_s* noalias sret(%struct.double_float_s) align 4 %agg.result)
+struct double_float_s f_ret_double_float_s(void) {
+  return (struct double_float_s){1.0, 2.0};
+}
+
+// CHECK: define{{.*}} void @f_double_double_s_arg_insufficient_fprs(float noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, double noundef %f, double noundef %g, double noundef %i, [4 x i32] %h.coerce)
+void f_double_double_s_arg_insufficient_fprs(float a, double b, double c, double d,
+                                             double e, double f, double g, double i, struct double_double_s h) {}
+
+struct double_int8_s {
+  double f;
+  int8_t i;
+};
+struct double_uint8_s {
+  double f;
+  uint8_t i;
+};
+struct double_int32_s {
+  double f;
+  int32_t i;
+};
+struct double_int64_s {
+  double f;
+  int64_t i;
+};
+struct double_int64bf_s {
+  double f;
+  int64_t i : 32;
+};
+struct double_int8_zbf_s {
+  double f;
+  int8_t i;
+  int : 0;
+};
+
+// CHECK: define{{.*}} void @f_double_int8_s_arg([3 x i32] %a.coerce)
+void f_double_int8_s_arg(struct double_int8_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int8_s(%struct.double_int8_s* noalias sret(%struct.double_int8_s) align 4 %agg.result)
+struct double_int8_s f_ret_double_int8_s(void) {
+  return (struct double_int8_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_uint8_s_arg([3 x i32] %a.coerce)
+void f_double_uint8_s_arg(struct double_uint8_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_uint8_s(%struct.double_uint8_s* noalias sret(%struct.double_uint8_s) align 4 %agg.result)
+struct double_uint8_s f_ret_double_uint8_s(void) {
+  return (struct double_uint8_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int32_s_arg([3 x i32] %a.coerce)
+void f_double_int32_s_arg(struct double_int32_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int32_s(%struct.double_int32_s* noalias sret(%struct.double_int32_s) align 4 %agg.result)
+struct double_int32_s f_ret_double_int32_s(void) {
+  return (struct double_int32_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int64_s_arg([4 x i32] %a.coerce)
+void f_double_int64_s_arg(struct double_int64_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int64_s(%struct.double_int64_s* noalias sret(%struct.double_int64_s) align 4 %agg.result)
+struct double_int64_s f_ret_double_int64_s(void) {
+  return (struct double_int64_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int64bf_s_arg([3 x i32] %a.coerce)
+void f_double_int64bf_s_arg(struct double_int64bf_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int64bf_s(%struct.double_int64bf_s* noalias sret(%struct.double_int64bf_s) align 4 %agg.result)
+struct double_int64bf_s f_ret_double_int64bf_s(void) {
+  return (struct double_int64bf_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int8_zbf_s([3 x i32] %a.coerce)
+void f_double_int8_zbf_s(struct double_int8_zbf_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_double_int8_zbf_s(%struct.double_int8_zbf_s* noalias sret(%struct.double_int8_zbf_s) align 4 %agg.result)
+struct double_int8_zbf_s f_ret_double_int8_zbf_s(void) {
+  return (struct double_int8_zbf_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_double_int8_s_arg_insufficient_gprs(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e, i32 noundef %f, i32 noundef %g, i32 noundef %h, [3 x i32] %i.coerce)
+void f_double_int8_s_arg_insufficient_gprs(int a, int b, int c, int d, int e,
+                                           int f, int g, int h, struct double_int8_s i) {}
+
+// CHECK: define{{.*}} void @f_struct_double_int8_insufficient_fprs(float noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, double noundef %f, double noundef %g, double noundef %h, [3 x i32] %i.coerce)
+void f_struct_double_int8_insufficient_fprs(float a, double b, double c, double d,
+                                            double e, double f, double g, double h, struct double_int8_s i) {}
+
+// Complex floating-point values are a special case for argument passing:
+// they are not treated like structs containing a single complex member.
+// Under the hard-float ABI a complex floating-point argument is passed in
+// two consecutive FPRs, but it is returned the same way as a struct.
+
+// This test uses the soft-float ABI, so it is coerced and passed in GPRs.
+// CHECK: define{{.*}} void @f_doublecomplex([4 x i32] noundef %a.coerce)
+void f_doublecomplex(double __complex__ a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublecomplex({ double, double }* noalias sret({ double, double }) align 4 %agg.result)
+double __complex__ f_ret_doublecomplex(void) {
+  return 1.0;
+}
+
+struct doublecomplex_s {
+  double __complex__ c;
+};
+
+// CHECK: define{{.*}} void @f_doublecomplex_s_arg([4 x i32] %a.coerce)
+void f_doublecomplex_s_arg(struct doublecomplex_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublecomplex_s(%struct.doublecomplex_s* noalias sret(%struct.doublecomplex_s) align 4 %agg.result)
+struct doublecomplex_s f_ret_doublecomplex_s(void) {
+  return (struct doublecomplex_s){1.0};
+}
+
+// Test single or two-element structs that need flattening, e.g. those
+// containing nested structs, doubles in small arrays, zero-length structs, etc.
+
+struct doublearr1_s {
+  double a[1];
+};
+
+// CHECK: define{{.*}} void @f_doublearr1_s_arg(double %a.coerce)
+void f_doublearr1_s_arg(struct doublearr1_s a) {}
+
+// CHECK: define{{.*}} double @f_ret_doublearr1_s()
+struct doublearr1_s f_ret_doublearr1_s(void) {
+  return (struct doublearr1_s){{1.0}};
+}
+
+struct doublearr2_s {
+  double a[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_s_arg(struct doublearr2_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_s(%struct.doublearr2_s* noalias sret(%struct.doublearr2_s) align 4 %agg.result)
+struct doublearr2_s f_ret_doublearr2_s(void) {
+  return (struct doublearr2_s){{1.0, 2.0}};
+}
+
+struct doublearr2_tricky1_s {
+  struct {
+    double f[1];
+  } g[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_tricky1_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_tricky1_s_arg(struct doublearr2_tricky1_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_tricky1_s(%struct.doublearr2_tricky1_s* noalias sret(%struct.doublearr2_tricky1_s) align 4 %agg.result)
+struct doublearr2_tricky1_s f_ret_doublearr2_tricky1_s(void) {
+  return (struct doublearr2_tricky1_s){{{{1.0}}, {{2.0}}}};
+}
+
+struct doublearr2_tricky2_s {
+  struct {};
+  struct {
+    double f[1];
+  } g[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_tricky2_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_tricky2_s_arg(struct doublearr2_tricky2_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_tricky2_s(%struct.doublearr2_tricky2_s* noalias sret(%struct.doublearr2_tricky2_s) align 4 %agg.result)
+struct doublearr2_tricky2_s f_ret_doublearr2_tricky2_s(void) {
+  return (struct doublearr2_tricky2_s){{}, {{{1.0}}, {{2.0}}}};
+}
+
+struct doublearr2_tricky3_s {
+  union {};
+  struct {
+    double f[1];
+  } g[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_tricky3_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_tricky3_s_arg(struct doublearr2_tricky3_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_tricky3_s(%struct.doublearr2_tricky3_s* noalias sret(%struct.doublearr2_tricky3_s) align 4 %agg.result)
+struct doublearr2_tricky3_s f_ret_doublearr2_tricky3_s(void) {
+  return (struct doublearr2_tricky3_s){{}, {{{1.0}}, {{2.0}}}};
+}
+
+struct doublearr2_tricky4_s {
+  union {};
+  struct {
+    struct {};
+    double f[1];
+  } g[2];
+};
+
+// CHECK: define{{.*}} void @f_doublearr2_tricky4_s_arg([4 x i32] %a.coerce)
+void f_doublearr2_tricky4_s_arg(struct doublearr2_tricky4_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_doublearr2_tricky4_s(%struct.doublearr2_tricky4_s* noalias sret(%struct.doublearr2_tricky4_s) align 4 %agg.result)
+struct doublearr2_tricky4_s f_ret_doublearr2_tricky4_s(void) {
+  return (struct doublearr2_tricky4_s){{}, {{{}, {1.0}}, {{}, {2.0}}}};
+}
+
+struct int_double_int_s {
+  int a;
+  double b;
+  int c;
+};
+
+// CHECK: define{{.*}} void @f_int_double_int_s_arg([4 x i32] %a.coerce)
+void f_int_double_int_s_arg(struct int_double_int_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_int_double_int_s(%struct.int_double_int_s* noalias sret(%struct.int_double_int_s) align 4 %agg.result)
+struct int_double_int_s f_ret_int_double_int_s(void) {
+  return (struct int_double_int_s){1, 2.0, 3};
+}
+
+struct int64_double_s {
+  int64_t a;
+  double b;
+};
+
+// CHECK: define{{.*}} void @f_int64_double_s_arg([4 x i32] %a.coerce)
+void f_int64_double_s_arg(struct int64_double_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_int64_double_s(%struct.int64_double_s* noalias sret(%struct.int64_double_s) align 4 %agg.result)
+struct int64_double_s f_ret_int64_double_s(void) {
+  return (struct int64_double_s){1, 2.0};
+}
+
+struct char_char_double_s {
+  char a;
+  char b;
+  double c;
+};
+
+// CHECK-LABEL: define{{.*}} void @f_char_char_double_s_arg([3 x i32] %a.coerce)
+void f_char_char_double_s_arg(struct char_char_double_s a) {}
+
+// CHECK: define{{.*}} void @f_ret_char_char_double_s(%struct.char_char_double_s* noalias sret(%struct.char_char_double_s) align 4 %agg.result)
+struct char_char_double_s f_ret_char_char_double_s(void) {
+  return (struct char_char_double_s){1, 2, 3.0};
+}
+
+// A union containing just one floating-point real cannot be passed as though
+// it were a standalone floating-point real.
+union double_u {
+  double a;
+};
+
+// CHECK: define{{.*}} void @f_double_u_arg([2 x i32] %a.coerce)
+void f_double_u_arg(union double_u a) {}
+
+// CHECK: define{{.*}} [2 x i32] @f_ret_double_u()
+union double_u f_ret_double_u(void) {
+  return (union double_u){1.0};
+}
+
+// CHECK: define{{.*}} void @f_ret_double_int32_s_double_int32_s_just_sufficient_gprs(%struct.double_int32_s* noalias sret(%struct.double_int32_s) align 4 %agg.result, i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e, i32 noundef %f, i32 noundef %g, [3 x i32] %h.coerce)
+struct double_int32_s f_ret_double_int32_s_double_int32_s_just_sufficient_gprs(
+    int a, int b, int c, int d, int e, int f, int g, struct double_int32_s h) {
+  return (struct double_int32_s){1.0, 2};
+}
+
+// CHECK: define{{.*}} void @f_ret_double_double_s_double_int32_s_just_sufficient_gprs(%struct.double_double_s* noalias sret(%struct.double_double_s) align 4 %agg.result, i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e, i32 noundef %f, i32 noundef %g, [3 x i32] %h.coerce)
+struct double_double_s f_ret_double_double_s_double_int32_s_just_sufficient_gprs(
+    int a, int b, int c, int d, int e, int f, int g, struct double_int32_s h) {
+  return (struct double_double_s){1.0, 2.0};
+}
+
+// CHECK: define{{.*}} void @f_ret_doublecomplex_double_int32_s_just_sufficient_gprs({ double, double }* noalias sret({ double, double }) align 4 %agg.result, i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e, i32 noundef %f, i32 noundef %g, [3 x i32] %h.coerce)
+double __complex__ f_ret_doublecomplex_double_int32_s_just_sufficient_gprs(
+    int a, int b, int c, int d, int e, int f, int g, struct double_int32_s h) {
+  return 1.0;
+}
+
+struct tiny {
+  uint8_t a, b, c, d;
+};
+
+struct small {
+  int32_t a, *b;
+};
+
+struct small_aligned {
+  int64_t a;
+};
+
+struct large {
+  int32_t a, b, c, d;
+};
+
+// Ensure that scalars passed on the stack are still determined correctly in
+// the presence of large return values that consume a register due to the need
+// to pass a pointer.
+
+// CHECK-LABEL: define{{.*}} void @f_scalar_stack_2(%struct.large* noalias sret(%struct.large) align 4 %agg.result, float noundef %a, i64 noundef %b, double noundef %c, double noundef %d, i8 noundef zeroext %e, i8 noundef signext %f, i8 noundef zeroext %g)
+struct large f_scalar_stack_2(float a, int64_t b, double c, long double d,
+                              uint8_t e, int8_t f, uint8_t g) {
+  return (struct large){a, e, f, g};
+}
+
+// Aggregates and >=XLen scalars passed on the stack should be lowered just as
+// they would be if passed via registers.
+
+// CHECK-LABEL: define{{.*}} void @f_scalar_stack_3(double noundef %a, i64 noundef %b, double noundef %c, i64 noundef %d, i32 noundef %e, i64 noundef %f, float noundef %g, double noundef %h, double noundef %i)
+void f_scalar_stack_3(double a, int64_t b, double c, int64_t d, int e,
+                      int64_t f, float g, double h, long double i) {}
+
+// CHECK-LABEL: define{{.*}} void @f_agg_stack(double noundef %a, i64 noundef %b, double noundef %c, i64 noundef %d, i32 %e.coerce, [2 x i32] %f.coerce, i64 %g.coerce, [4 x i32] %h.coerce)
+void f_agg_stack(double a, int64_t b, double c, int64_t d, struct tiny e,
+                 struct small f, struct small_aligned g, struct large h) {}