[clang] [llvm] [llvm][ARM] Emit MVE .arch_extension after .fpu directive if it does not include MVE features (PR #71545)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 16 03:35:22 PST 2023
https://github.com/simpal01 updated https://github.com/llvm/llvm-project/pull/71545
>From e2b6fa1cb088f2c3cf05a38959a3f3d34eda92c5 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Tue, 7 Nov 2023 13:05:08 +0000
Subject: [PATCH 1/5] [ARM] .fpu equals fpv5-d16 disables floating point MVE
which leads to unsupported MVE instructions for cortex M85/M55.
The floating-point and MVE features together specify the MVE functionality that is supported on the Cortex-M85 processor. But the FPU extension for the underlying architecture (armv8.1-m.main) is FPV5, which does not include MVE-F.
So, either when we explicitly specify -mfpu=fpv5-d16 or when we use the compiler's -S output and `-save-temps=obj`, the MVE feature is lost, which leads to an assembler error. What is happening here is that the .fpu directive overrides any features previously set by the .cpu directive. Since the corresponding .fpu directive generated (.fpu fpv5-d16) does not include MVE-F, it overrides those features even though they are supported and set by the .cpu directive. It looks like .fpu is supposed to do this.
In this case, there should be an .arch_extension directive re-enabling the relevant extensions after .fpu if the goal is to keep these extensions enabled. GCC also does the same.
So this patch enables the MVE features by:
.fpu fpv5-d16
.arch_extension mve.fp
---
clang/test/CodeGen/arm-v8.1m-check-mve.ll | 56 +++++++++++++++++++
.../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 3 +
.../ARM/MCTargetDesc/ARMTargetStreamer.cpp | 16 ++++--
3 files changed, 69 insertions(+), 6 deletions(-)
create mode 100644 clang/test/CodeGen/arm-v8.1m-check-mve.ll
diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.ll b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
new file mode 100644
index 000000000000000..cfcb0223961e31e
--- /dev/null
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
@@ -0,0 +1,56 @@
+; REQUIRES: arm-registered-target
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+; CHECK: .fpu fpv5-d16
+; CHECK .arch_extension mve.fp
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-none-unknown-eabihf"
+
+%struct.dummy_t = type { float, float, float, float }
+
+define dso_local signext i8 @foo(ptr noundef %handle) #0 {
+entry:
+ %handle.addr = alloca ptr, align 4
+ store ptr %handle, ptr %handle.addr, align 4
+ %0 = load ptr, ptr %handle.addr, align 4
+ %a = getelementptr inbounds %struct.dummy_t, ptr %0, i32 0, i32 0
+ %1 = load float, ptr %a, align 4
+ %sub = fsub float 0x3F5439DE40000000, %1
+ %2 = load ptr, ptr %handle.addr, align 4
+ %a1 = getelementptr inbounds %struct.dummy_t, ptr %2, i32 0, i32 0
+ %3 = load float, ptr %a1, align 4
+ %4 = call float @llvm.fmuladd.f32(float 0x3F847AE140000000, float %sub, float %3)
+ store float %4, ptr %a1, align 4
+ %5 = load ptr, ptr %handle.addr, align 4
+ %b = getelementptr inbounds %struct.dummy_t, ptr %5, i32 0, i32 1
+ %6 = load float, ptr %b, align 4
+ %sub2 = fsub float 0x3F5439DE40000000, %6
+ %7 = load ptr, ptr %handle.addr, align 4
+ %b3 = getelementptr inbounds %struct.dummy_t, ptr %7, i32 0, i32 1
+ %8 = load float, ptr %b3, align 4
+ %9 = call float @llvm.fmuladd.f32(float 0x3F947AE140000000, float %sub2, float %8)
+ store float %9, ptr %b3, align 4
+ %10 = load ptr, ptr %handle.addr, align 4
+ %c = getelementptr inbounds %struct.dummy_t, ptr %10, i32 0, i32 2
+ %11 = load float, ptr %c, align 4
+ %sub4 = fsub float 0x3F5439DE40000000, %11
+ %12 = load ptr, ptr %handle.addr, align 4
+ %c5 = getelementptr inbounds %struct.dummy_t, ptr %12, i32 0, i32 2
+ %13 = load float, ptr %c5, align 4
+ %14 = call float @llvm.fmuladd.f32(float 0x3F9EB851E0000000, float %sub4, float %13)
+ store float %14, ptr %c5, align 4
+ %15 = load ptr, ptr %handle.addr, align 4
+ %d = getelementptr inbounds %struct.dummy_t, ptr %15, i32 0, i32 3
+ %16 = load float, ptr %d, align 4
+ %sub6 = fsub float 0x3F5439DE40000000, %16
+ %17 = load ptr, ptr %handle.addr, align 4
+ %d7 = getelementptr inbounds %struct.dummy_t, ptr %17, i32 0, i32 3
+ %18 = load float, ptr %d7, align 4
+ %19 = call float @llvm.fmuladd.f32(float 0x3FA47AE140000000, float %sub6, float %18)
+ store float %19, ptr %d7, align 4
+ ret i8 0
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) #1
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 373d5b59bca6640..20b52ebc544a1ed 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -12648,6 +12648,9 @@ bool ARMAsmParser::enableArchExtFeature(StringRef Name, SMLoc &ExtLoc) {
{ARM::AEK_CRYPTO,
{Feature_HasV8Bit},
{ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8}},
+ {(ARM::AEK_DSP | ARM::AEK_SIMD | ARM::AEK_FP),
+ {Feature_HasV8_1MMainlineBit},
+ {ARM::HasMVEFloatOps}},
{ARM::AEK_FP,
{Feature_HasV8Bit},
{ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8}},
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index b65d1b24e63d39b..e84b597e4382edc 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -238,14 +238,18 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
? ARMBuildAttrs::AllowNeonARMv8_1a
: ARMBuildAttrs::AllowNeonARMv8);
} else {
- if (STI.hasFeature(ARM::FeatureFPARMv8_D16_SP))
+ if (STI.hasFeature(ARM::FeatureFPARMv8_D16_SP)) {
// FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
// FPU, but there are two different names for it depending on the CPU.
- emitFPU(STI.hasFeature(ARM::FeatureD32)
- ? ARM::FK_FP_ARMV8
- : (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_FPV5_D16
- : ARM::FK_FPV5_SP_D16));
- else if (STI.hasFeature(ARM::FeatureVFP4_D16_SP))
+ if (STI.hasFeature(ARM::FeatureD32))
+ emitFPU(ARM::FK_FP_ARMV8);
+ else {
+ emitFPU(STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_FPV5_D16
+ : ARM::FK_FPV5_SP_D16);
+ if (STI.hasFeature(ARM::HasMVEFloatOps))
+ emitArchExtension(ARM::AEK_SIMD | ARM::AEK_DSP | ARM::AEK_FP);
+ }
+ } else if (STI.hasFeature(ARM::FeatureVFP4_D16_SP))
emitFPU(STI.hasFeature(ARM::FeatureD32)
? ARM::FK_VFPV4
: (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_VFPV4_D16
>From 951537dbb7ce66451bf3c7d5f00d819893f5b264 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Thu, 9 Nov 2023 16:25:25 +0000
Subject: [PATCH 2/5] fixup! [llvm][ARM] Emit MVE .arch_extension after .fpu
directive if it does not include MVE features.
1. .arch_extension will always be on the very next line.
CHECK-NEXT would be a bit more robust.
---
clang/test/CodeGen/arm-v8.1m-check-mve.ll | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.ll b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
index cfcb0223961e31e..6949f5529aeb653 100644
--- a/clang/test/CodeGen/arm-v8.1m-check-mve.ll
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
@@ -4,9 +4,7 @@
; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
; CHECK: .fpu fpv5-d16
-; CHECK .arch_extension mve.fp
-target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "thumbv8.1m.main-none-unknown-eabihf"
+; CHECK-NEXT .arch_extension mve.fp
%struct.dummy_t = type { float, float, float, float }
>From 38c3ee09cc393f5efcc3d44882a85100d8daeac1 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Wed, 15 Nov 2023 16:45:15 +0000
Subject: [PATCH 3/5] fixup! [ARM] .fpu equals fpv5-d16 disables floating point
MVE which leads to unsupported MVE instructions for cortex M85/M55.
---
clang/test/CodeGen/arm-v8.1m-check-mve.c | 35 ++++++++++++++++++++++++
1 file changed, 35 insertions(+)
create mode 100644 clang/test/CodeGen/arm-v8.1m-check-mve.c
diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.c b/clang/test/CodeGen/arm-v8.1m-check-mve.c
new file mode 100644
index 000000000000000..221f1cbc27ece5b
--- /dev/null
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.c
@@ -0,0 +1,35 @@
+// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -save-temps=obj -S -o - %s | FileCheck %s
+// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+
+// REQUIRES: arm-registered-target
+
+// CHECK: .fpu fpv5-d16
+// CHECK-NEXT .arch_extension mve.fp
+
+#define DUMMY_CONST_1 (0.0012345F)
+#define DUMMY_CONST_2 (0.01F)
+#define DUMMY_CONST_3 (0.02F)
+#define DUMMY_CONST_4 (0.03F)
+#define DUMMY_CONST_5 (0.04F)
+
+typedef struct
+{
+ float a;
+ float b;
+ float c;
+ float d;
+} dummy_t;
+
+// CHECK-LABEL: foo
+// CHECK: vsub.f32 q0, q0, q1
+// CHECK-NEXT: vfma.f32 q1, q0, q2
+
+signed char foo(dummy_t *handle)
+{
+ handle->a += DUMMY_CONST_2 * (DUMMY_CONST_1 - handle->a);
+ handle->b += DUMMY_CONST_3 * (DUMMY_CONST_1 - handle->b);
+ handle->c += DUMMY_CONST_4 * (DUMMY_CONST_1 - handle->c);
+ handle->d += DUMMY_CONST_5 * (DUMMY_CONST_1 - handle->d);
+ return 0;
+}
>From e8bf45c8d08a7fd095dd4e9eb051834d11ef48bf Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Wed, 15 Nov 2023 16:55:51 +0000
Subject: [PATCH 4/5] fixup! [ARM] .fpu equals fpv5-d16 disables floating point
MVE which leads to unsupported MVE instructions for cortex M85/M55.
---
clang/test/CodeGen/arm-v8.1m-check-mve.ll | 54 -----------------------
1 file changed, 54 deletions(-)
delete mode 100644 clang/test/CodeGen/arm-v8.1m-check-mve.ll
diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.ll b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
deleted file mode 100644
index 6949f5529aeb653..000000000000000
--- a/clang/test/CodeGen/arm-v8.1m-check-mve.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; REQUIRES: arm-registered-target
-; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
-; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
-; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
-; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
-; CHECK: .fpu fpv5-d16
-; CHECK-NEXT .arch_extension mve.fp
-
-%struct.dummy_t = type { float, float, float, float }
-
-define dso_local signext i8 @foo(ptr noundef %handle) #0 {
-entry:
- %handle.addr = alloca ptr, align 4
- store ptr %handle, ptr %handle.addr, align 4
- %0 = load ptr, ptr %handle.addr, align 4
- %a = getelementptr inbounds %struct.dummy_t, ptr %0, i32 0, i32 0
- %1 = load float, ptr %a, align 4
- %sub = fsub float 0x3F5439DE40000000, %1
- %2 = load ptr, ptr %handle.addr, align 4
- %a1 = getelementptr inbounds %struct.dummy_t, ptr %2, i32 0, i32 0
- %3 = load float, ptr %a1, align 4
- %4 = call float @llvm.fmuladd.f32(float 0x3F847AE140000000, float %sub, float %3)
- store float %4, ptr %a1, align 4
- %5 = load ptr, ptr %handle.addr, align 4
- %b = getelementptr inbounds %struct.dummy_t, ptr %5, i32 0, i32 1
- %6 = load float, ptr %b, align 4
- %sub2 = fsub float 0x3F5439DE40000000, %6
- %7 = load ptr, ptr %handle.addr, align 4
- %b3 = getelementptr inbounds %struct.dummy_t, ptr %7, i32 0, i32 1
- %8 = load float, ptr %b3, align 4
- %9 = call float @llvm.fmuladd.f32(float 0x3F947AE140000000, float %sub2, float %8)
- store float %9, ptr %b3, align 4
- %10 = load ptr, ptr %handle.addr, align 4
- %c = getelementptr inbounds %struct.dummy_t, ptr %10, i32 0, i32 2
- %11 = load float, ptr %c, align 4
- %sub4 = fsub float 0x3F5439DE40000000, %11
- %12 = load ptr, ptr %handle.addr, align 4
- %c5 = getelementptr inbounds %struct.dummy_t, ptr %12, i32 0, i32 2
- %13 = load float, ptr %c5, align 4
- %14 = call float @llvm.fmuladd.f32(float 0x3F9EB851E0000000, float %sub4, float %13)
- store float %14, ptr %c5, align 4
- %15 = load ptr, ptr %handle.addr, align 4
- %d = getelementptr inbounds %struct.dummy_t, ptr %15, i32 0, i32 3
- %16 = load float, ptr %d, align 4
- %sub6 = fsub float 0x3F5439DE40000000, %16
- %17 = load ptr, ptr %handle.addr, align 4
- %d7 = getelementptr inbounds %struct.dummy_t, ptr %17, i32 0, i32 3
- %18 = load float, ptr %d7, align 4
- %19 = call float @llvm.fmuladd.f32(float 0x3FA47AE140000000, float %sub6, float %18)
- store float %19, ptr %d7, align 4
- ret i8 0
-}
-
-declare float @llvm.fmuladd.f32(float, float, float) #1
>From 891c77b45605fb25ec4100ca2b194675f9b31dc0 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Thu, 16 Nov 2023 11:28:58 +0000
Subject: [PATCH 5/5] fixup! [ARM] .fpu equals fpv5-d16 disables floating point
MVE which leads to unsupported MVE instructions for cortex M85/M55.
---
clang/test/CodeGen/arm-v8.1m-check-mve.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.c b/clang/test/CodeGen/arm-v8.1m-check-mve.c
index 221f1cbc27ece5b..4f5941c782f0cc9 100644
--- a/clang/test/CodeGen/arm-v8.1m-check-mve.c
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.c
@@ -1,4 +1,8 @@
// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -save-temps=obj -S -o - %s | FileCheck %s
+// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -save-temps=obj -S -o - %s | FileCheck %s
+
+// The below tests are to make sure that assembly directives do not lose mve feature so that reassembly works with
+// mve floating point instructions.
// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
@@ -8,10 +12,6 @@
// CHECK-NEXT .arch_extension mve.fp
#define DUMMY_CONST_1 (0.0012345F)
-#define DUMMY_CONST_2 (0.01F)
-#define DUMMY_CONST_3 (0.02F)
-#define DUMMY_CONST_4 (0.03F)
-#define DUMMY_CONST_5 (0.04F)
typedef struct
{
@@ -22,14 +22,14 @@ typedef struct
} dummy_t;
// CHECK-LABEL: foo
-// CHECK: vsub.f32 q0, q0, q1
-// CHECK-NEXT: vfma.f32 q1, q0, q2
+// CHECK: vsub.f32
+// CHECK: vfma.f32
signed char foo(dummy_t *handle)
{
- handle->a += DUMMY_CONST_2 * (DUMMY_CONST_1 - handle->a);
- handle->b += DUMMY_CONST_3 * (DUMMY_CONST_1 - handle->b);
- handle->c += DUMMY_CONST_4 * (DUMMY_CONST_1 - handle->c);
- handle->d += DUMMY_CONST_5 * (DUMMY_CONST_1 - handle->d);
+ handle->a += 0.01F * (DUMMY_CONST_1 - handle->a);
+ handle->b += 0.02F * (DUMMY_CONST_1 - handle->b);
+ handle->c += 0.03F * (DUMMY_CONST_1 - handle->c);
+ handle->d += 0.04F * (DUMMY_CONST_1 - handle->d);
return 0;
}
More information about the llvm-commits
mailing list