[llvm-branch-commits] [clang] 1085fe3 - [Clang][AArch64] Inline assembly support for the ACLE type 'data512_t'
Alexandros Lamprineas via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Aug 2 07:47:18 PDT 2021
Author: Alexandros Lamprineas
Date: 2021-08-02T15:45:58+01:00
New Revision: 1085fe3e0509aabbf8cea2c39b14ec092f4ca72f
URL: https://github.com/llvm/llvm-project/commit/1085fe3e0509aabbf8cea2c39b14ec092f4ca72f
DIFF: https://github.com/llvm/llvm-project/commit/1085fe3e0509aabbf8cea2c39b14ec092f4ca72f.diff
LOG: [Clang][AArch64] Inline assembly support for the ACLE type 'data512_t'
In LLVM IR terms the ACLE type 'data512_t' is essentially the aggregate
type { [8 x i64] }. When emitting code for inline assembly operands,
clang tries to scalarize aggregate types to an integer of equivalent
width; otherwise it passes them by reference. This patch adds a target
hook to tell whether a given inline assembly operand is scalarizable,
so that clang can emit code to pass/return it by value.
Differential Revision: https://reviews.llvm.org/D94098
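
For context, a hedged sketch of how the feature is exercised from C. The
typedef spelling and the build flag are assumptions for illustration (the
ACLE defines 'data512_t', but whether <arm_acle.h> already provides it
depends on the toolchain); the test added below uses an equivalent plain
struct instead.

typedef struct { unsigned long long val[8]; } data512_t; /* assumed spelling of the ACLE type */

/* Copy one 64-byte block from 'src' to 'dst' using the LS64 LD64B/ST64B
 * instructions. With this patch the 512-bit aggregate is passed to and
 * returned from the asm by value (as an i512 register operand) instead of
 * by reference through memory. Build roughly as the test below does, e.g.
 * with -target-feature +ls64 at the cc1 level (driver flag assumed). */
static void copy_block(void *dst, void *src)
{
    data512_t tmp;
    __asm__ volatile ("ld64b %0,[%1]" : "=r" (tmp) : "r" (src) : "memory");
    __asm__ volatile ("st64b %0,[%1]" : : "r" (tmp), "r" (dst) : "memory");
}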
Added:
clang/test/CodeGen/aarch64-ls64-inline-asm.c
Modified:
clang/lib/Basic/Targets/AArch64.cpp
clang/lib/CodeGen/CGStmt.cpp
clang/lib/CodeGen/TargetInfo.cpp
clang/lib/CodeGen/TargetInfo.h
Removed:
################################################################################
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 4070ac727d165..e163ebfa2348b 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -431,7 +431,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
Feature == "sve2-aes" || Feature == "sve2-sha3" ||
Feature == "sve2-sm4" || Feature == "f64mm" || Feature == "f32mm" ||
Feature == "i8mm" || Feature == "bf16") &&
- (FPU & SveMode));
+ (FPU & SveMode)) ||
+ (Feature == "ls64" && HasLS64);
}
bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
@@ -752,6 +753,9 @@ bool AArch64TargetInfo::validateConstraintModifier(
if (Size == 64)
return true;
+ if (Size == 512)
+ return HasLS64;
+
SuggestedModifier = "w";
return false;
}
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index aeb319ca15819..0a3a722fa6533 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -2097,7 +2097,8 @@ CodeGenFunction::EmitAsmInputLValue(const TargetInfo::ConstraintInfo &Info,
} else {
llvm::Type *Ty = ConvertType(InputType);
uint64_t Size = CGM.getDataLayout().getTypeSizeInBits(Ty);
- if (Size <= 64 && llvm::isPowerOf2_64(Size)) {
+ if ((Size <= 64 && llvm::isPowerOf2_64(Size)) ||
+ getTargetHooks().isScalarizableAsmOperand(*this, Ty)) {
Ty = llvm::IntegerType::get(getLLVMContext(), Size);
Ty = llvm::PointerType::getUnqual(Ty);
@@ -2320,23 +2321,28 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
// If this is a register output, then make the inline asm return it
// by-value. If this is a memory result, return the value by-reference.
- bool isScalarizableAggregate =
- hasAggregateEvaluationKind(OutExpr->getType());
- if (!Info.allowsMemory() && (hasScalarEvaluationKind(OutExpr->getType()) ||
- isScalarizableAggregate)) {
+ QualType QTy = OutExpr->getType();
+ const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) ||
+ hasAggregateEvaluationKind(QTy);
+ if (!Info.allowsMemory() && IsScalarOrAggregate) {
+
Constraints += "=" + OutputConstraint;
- ResultRegQualTys.push_back(OutExpr->getType());
+ ResultRegQualTys.push_back(QTy);
ResultRegDests.push_back(Dest);
- ResultTruncRegTypes.push_back(ConvertTypeForMem(OutExpr->getType()));
- if (Info.allowsRegister() && isScalarizableAggregate) {
- ResultTypeRequiresCast.push_back(true);
- unsigned Size = getContext().getTypeSize(OutExpr->getType());
- llvm::Type *ConvTy = llvm::IntegerType::get(getLLVMContext(), Size);
- ResultRegTypes.push_back(ConvTy);
- } else {
- ResultTypeRequiresCast.push_back(false);
- ResultRegTypes.push_back(ResultTruncRegTypes.back());
+
+ llvm::Type *Ty = ConvertTypeForMem(QTy);
+ const bool RequiresCast = Info.allowsRegister() &&
+ (getTargetHooks().isScalarizableAsmOperand(*this, Ty) ||
+ Ty->isAggregateType());
+
+ ResultTruncRegTypes.push_back(Ty);
+ ResultTypeRequiresCast.push_back(RequiresCast);
+
+ if (RequiresCast) {
+ unsigned Size = getContext().getTypeSize(QTy);
+ Ty = llvm::IntegerType::get(getLLVMContext(), Size);
}
+ ResultRegTypes.push_back(Ty);
// If this output is tied to an input, and if the input is larger, then
// we need to set the actual result type of the inline asm node to be the
// same as the input type.
@@ -2638,11 +2644,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
assert(ResultTypeRequiresCast.size() <= ResultRegDests.size());
for (unsigned i = 0, e = RegResults.size(); i != e; ++i) {
llvm::Value *Tmp = RegResults[i];
+ llvm::Type *TruncTy = ResultTruncRegTypes[i];
// If the result type of the LLVM IR asm doesn't match the result type of
// the expression, do the conversion.
if (ResultRegTypes[i] != ResultTruncRegTypes[i]) {
- llvm::Type *TruncTy = ResultTruncRegTypes[i];
// Truncate the integer result to the right size, note that TruncTy can be
// a pointer.
@@ -2672,6 +2678,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
unsigned Size = getContext().getTypeSize(ResultRegQualTys[i]);
Address A = Builder.CreateBitCast(Dest.getAddress(*this),
ResultRegTypes[i]->getPointerTo());
+ if (getTargetHooks().isScalarizableAsmOperand(*this, TruncTy)) {
+ Builder.CreateStore(Tmp, A);
+ continue;
+ }
+
QualType Ty = getContext().getIntTypeForBitwidth(Size, /*Signed*/ false);
if (Ty.isNull()) {
const Expr *OutExpr = S.getOutputExpr(i);
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index a2b68a04d3516..d2cc0a699f435 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -5526,6 +5526,20 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo {
Fn->addFnAttr("branch-target-enforcement",
BPI.BranchTargetEnforcement ? "true" : "false");
}
+
+ bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+ llvm::Type *Ty) const override {
+ if (CGF.getTarget().hasFeature("ls64")) {
+ auto *ST = dyn_cast<llvm::StructType>(Ty);
+ if (ST && ST->getNumElements() == 1) {
+ auto *AT = dyn_cast<llvm::ArrayType>(ST->getElementType(0));
+ if (AT && AT->getNumElements() == 8 &&
+ AT->getElementType()->isIntegerTy(64))
+ return true;
+ }
+ }
+ return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty);
+ }
};
class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index e6e474544fc44..aa8bbb60a75f1 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -148,6 +148,13 @@ class TargetCodeGenInfo {
return Ty;
}
+ /// Target hook to decide whether an inline asm operand can be passed
+ /// by value.
+ virtual bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+ llvm::Type *Ty) const {
+ return false;
+ }
+
/// Adds constraints and types for result registers.
virtual void addReturnRegisterOutputs(
CodeGen::CodeGenFunction &CGF, CodeGen::LValue ReturnValue,
diff --git a/clang/test/CodeGen/aarch64-ls64-inline-asm.c b/clang/test/CodeGen/aarch64-ls64-inline-asm.c
new file mode 100644
index 0000000000000..9a2ff5889e23f
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-ls64-inline-asm.c
@@ -0,0 +1,84 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -O1 -S -emit-llvm -x c %s -o - | FileCheck %s
+
+struct foo { unsigned long long x[8]; };
+
+// CHECK-LABEL: @load(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(i8* [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc !6
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.foo* [[OUTPUT:%.*]] to i512*
+// CHECK-NEXT: store i512 [[TMP0]], i512* [[TMP1]], align 8
+// CHECK-NEXT: ret void
+//
+void load(struct foo *output, void *addr)
+{
+ __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
+}
+
+// CHECK-LABEL: @store(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.foo* [[INPUT:%.*]] to i512*
+// CHECK-NEXT: [[TMP1:%.*]] = load i512, i512* [[TMP0]], align 8
+// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !7
+// CHECK-NEXT: ret void
+//
+void store(const struct foo *input, void *addr)
+{
+ __asm__ volatile ("st64b %0,[%1]" : : "r" (*input), "r" (addr) : "memory" );
+}
+
+// CHECK-LABEL: @store2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[IN:%.*]], align 4, !tbaa [[TBAA8:![0-9]+]]
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64
+// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 1
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[CONV5:%.*]] = sext i32 [[TMP2]] to i64
+// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 16
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[CONV8:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 25
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[CONV11:%.*]] = sext i32 [[TMP4]] to i64
+// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 36
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[CONV14:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 49
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[CONV17:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 64
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX19]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[CONV20:%.*]] = sext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[S_SROA_10_0_INSERT_EXT:%.*]] = zext i64 [[CONV20]] to i512
+// CHECK-NEXT: [[S_SROA_10_0_INSERT_SHIFT:%.*]] = shl nuw i512 [[S_SROA_10_0_INSERT_EXT]], 448
+// CHECK-NEXT: [[S_SROA_9_0_INSERT_EXT:%.*]] = zext i64 [[CONV17]] to i512
+// CHECK-NEXT: [[S_SROA_9_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_9_0_INSERT_EXT]], 384
+// CHECK-NEXT: [[S_SROA_9_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_10_0_INSERT_SHIFT]], [[S_SROA_9_0_INSERT_SHIFT]]
+// CHECK-NEXT: [[S_SROA_8_0_INSERT_EXT:%.*]] = zext i64 [[CONV14]] to i512
+// CHECK-NEXT: [[S_SROA_8_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_8_0_INSERT_EXT]], 320
+// CHECK-NEXT: [[S_SROA_8_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_9_0_INSERT_INSERT]], [[S_SROA_8_0_INSERT_SHIFT]]
+// CHECK-NEXT: [[S_SROA_7_0_INSERT_EXT:%.*]] = zext i64 [[CONV11]] to i512
+// CHECK-NEXT: [[S_SROA_7_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_7_0_INSERT_EXT]], 256
+// CHECK-NEXT: [[S_SROA_7_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_8_0_INSERT_INSERT]], [[S_SROA_7_0_INSERT_SHIFT]]
+// CHECK-NEXT: [[S_SROA_6_0_INSERT_EXT:%.*]] = zext i64 [[CONV8]] to i512
+// CHECK-NEXT: [[S_SROA_6_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_6_0_INSERT_EXT]], 192
+// CHECK-NEXT: [[S_SROA_6_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_7_0_INSERT_INSERT]], [[S_SROA_6_0_INSERT_SHIFT]]
+// CHECK-NEXT: [[S_SROA_5_0_INSERT_EXT:%.*]] = zext i64 [[CONV5]] to i512
+// CHECK-NEXT: [[S_SROA_5_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_5_0_INSERT_EXT]], 128
+// CHECK-NEXT: [[S_SROA_4_0_INSERT_EXT:%.*]] = zext i64 [[CONV2]] to i512
+// CHECK-NEXT: [[S_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_4_0_INSERT_EXT]], 64
+// CHECK-NEXT: [[S_SROA_4_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_6_0_INSERT_INSERT]], [[S_SROA_5_0_INSERT_SHIFT]]
+// CHECK-NEXT: [[S_SROA_0_0_INSERT_EXT:%.*]] = zext i64 [[CONV]] to i512
+// CHECK-NEXT: [[S_SROA_0_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_4_0_INSERT_MASK]], [[S_SROA_4_0_INSERT_SHIFT]]
+// CHECK-NEXT: [[S_SROA_0_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_0_0_INSERT_MASK]], [[S_SROA_0_0_INSERT_EXT]]
+// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !12
+// CHECK-NEXT: ret void
+//
+void store2(int *in, void *addr)
+{
+ struct foo s = { in[0], in[1], in[4], in[16], in[25], in[36], in[49], in[64] };
+ __asm__ volatile ("st64b %0,[%1]" : : "r" (s), "r" (addr) : "memory" );
+}