[llvm] [clang] [llvm][ARM] Emit MVE .arch_extension after .fpu directive if it does not include MVE features (PR #71545)

Thu Nov 9 10:05:54 PST 2023

https://github.com/simpal01 updated https://github.com/llvm/llvm-project/pull/71545

>From e2b6fa1cb088f2c3cf05a38959a3f3d34eda92c5 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Tue, 7 Nov 2023 13:05:08 +0000
Subject: [PATCH 1/2] [ARM] .fpu equals fpv5-d16 disables floating point MVE
 which leads to unsupported MVE instructions for cortex M85/M55.

The floating-point and MVE features together specify the MVE functionality that is supported on the Cortex-M85 processor. But the FPU extension for the underlying architecture (armv8.1-m.main) is FPv5, which does not include MVE-F.

So either when we explicitly specify -mfpu=fpv5-d16, or in the compiler's -S output and with `-save-temps=obj`, the MVE feature is lost, which leads to an assembler error. What is happening here is that the .fpu directive overrides any features previously set by the .cpu directive. Since the corresponding .fpu generated (.fpu fpv5-d16) does not include MVE-F, it overrides those features even though they are supported and set by the .cpu directive. It looks like .fpu is supposed to do this.

In this case, there should be an .arch_extension directive re-enabling the relevant extensions after .fpu if the goal is to keep these extensions enabled. GCC does the same.

So this patch enables the MVE features by:
  .fpu fpv5-d16
  .arch_extension mve.fp
---
 clang/test/CodeGen/arm-v8.1m-check-mve.ll     | 56 +++++++++++++++++++
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp |  3 +
 .../ARM/MCTargetDesc/ARMTargetStreamer.cpp    | 16 ++++--
 3 files changed, 69 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CodeGen/arm-v8.1m-check-mve.ll

diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.ll b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
new file mode 100644
index 000000000000000..cfcb0223961e31e
--- /dev/null
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
@@ -0,0 +1,56 @@
+; REQUIRES: arm-registered-target
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+; CHECK: .fpu   fpv5-d16
+; CHECK  .arch_extension mve.fp
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-none-unknown-eabihf"
+
+%struct.dummy_t = type { float, float, float, float }
+
+define dso_local signext i8 @foo(ptr noundef %handle) #0 {
+entry:
+  %handle.addr = alloca ptr, align 4
+  store ptr %handle, ptr %handle.addr, align 4
+  %0 = load ptr, ptr %handle.addr, align 4
+  %a = getelementptr inbounds %struct.dummy_t, ptr %0, i32 0, i32 0
+  %1 = load float, ptr %a, align 4
+  %sub = fsub float 0x3F5439DE40000000, %1
+  %2 = load ptr, ptr %handle.addr, align 4
+  %a1 = getelementptr inbounds %struct.dummy_t, ptr %2, i32 0, i32 0
+  %3 = load float, ptr %a1, align 4
+  %4 = call float @llvm.fmuladd.f32(float 0x3F847AE140000000, float %sub, float %3)
+  store float %4, ptr %a1, align 4
+  %5 = load ptr, ptr %handle.addr, align 4
+  %b = getelementptr inbounds %struct.dummy_t, ptr %5, i32 0, i32 1
+  %6 = load float, ptr %b, align 4
+  %sub2 = fsub float 0x3F5439DE40000000, %6
+  %7 = load ptr, ptr %handle.addr, align 4
+  %b3 = getelementptr inbounds %struct.dummy_t, ptr %7, i32 0, i32 1
+  %8 = load float, ptr %b3, align 4
+  %9 = call float @llvm.fmuladd.f32(float 0x3F947AE140000000, float %sub2, float %8)
+  store float %9, ptr %b3, align 4
+  %10 = load ptr, ptr %handle.addr, align 4
+  %c = getelementptr inbounds %struct.dummy_t, ptr %10, i32 0, i32 2
+  %11 = load float, ptr %c, align 4
+  %sub4 = fsub float 0x3F5439DE40000000, %11
+  %12 = load ptr, ptr %handle.addr, align 4
+  %c5 = getelementptr inbounds %struct.dummy_t, ptr %12, i32 0, i32 2
+  %13 = load float, ptr %c5, align 4
+  %14 = call float @llvm.fmuladd.f32(float 0x3F9EB851E0000000, float %sub4, float %13)
+  store float %14, ptr %c5, align 4
+  %15 = load ptr, ptr %handle.addr, align 4
+  %d = getelementptr inbounds %struct.dummy_t, ptr %15, i32 0, i32 3
+  %16 = load float, ptr %d, align 4
+  %sub6 = fsub float 0x3F5439DE40000000, %16
+  %17 = load ptr, ptr %handle.addr, align 4
+  %d7 = getelementptr inbounds %struct.dummy_t, ptr %17, i32 0, i32 3
+  %18 = load float, ptr %d7, align 4
+  %19 = call float @llvm.fmuladd.f32(float 0x3FA47AE140000000, float %sub6, float %18)
+  store float %19, ptr %d7, align 4
+  ret i8 0
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) #1
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 373d5b59bca6640..20b52ebc544a1ed 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -12648,6 +12648,9 @@ bool ARMAsmParser::enableArchExtFeature(StringRef Name, SMLoc &ExtLoc) {
       {ARM::AEK_CRYPTO,
        {Feature_HasV8Bit},
        {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8}},
+      {(ARM::AEK_DSP | ARM::AEK_SIMD | ARM::AEK_FP),
+       {Feature_HasV8_1MMainlineBit},
+       {ARM::HasMVEFloatOps}},
       {ARM::AEK_FP,
        {Feature_HasV8Bit},
        {ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8}},
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index b65d1b24e63d39b..e84b597e4382edc 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -238,14 +238,18 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
                         ? ARMBuildAttrs::AllowNeonARMv8_1a
                         : ARMBuildAttrs::AllowNeonARMv8);
   } else {
-    if (STI.hasFeature(ARM::FeatureFPARMv8_D16_SP))
+    if (STI.hasFeature(ARM::FeatureFPARMv8_D16_SP)) {
       // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
       // FPU, but there are two different names for it depending on the CPU.
-      emitFPU(STI.hasFeature(ARM::FeatureD32)
-                  ? ARM::FK_FP_ARMV8
-                  : (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_FPV5_D16
-                                                      : ARM::FK_FPV5_SP_D16));
-    else if (STI.hasFeature(ARM::FeatureVFP4_D16_SP))
+      if (STI.hasFeature(ARM::FeatureD32))
+        emitFPU(ARM::FK_FP_ARMV8);
+      else {
+        emitFPU(STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_FPV5_D16
+                                                 : ARM::FK_FPV5_SP_D16);
+        if (STI.hasFeature(ARM::HasMVEFloatOps))
+          emitArchExtension(ARM::AEK_SIMD | ARM::AEK_DSP | ARM::AEK_FP);
+      }
+    } else if (STI.hasFeature(ARM::FeatureVFP4_D16_SP))
       emitFPU(STI.hasFeature(ARM::FeatureD32)
                   ? ARM::FK_VFPV4
                   : (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_VFPV4_D16

>From 951537dbb7ce66451bf3c7d5f00d819893f5b264 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Thu, 9 Nov 2023 16:25:25 +0000
Subject: [PATCH 2/2] fixup! [llvm][ARM] Emit MVE .arch_extension after .fpu
 directive if it does not include MVE features.

  1. .arch_extension will always be on the very next line.
     CHECK-NEXT would be a bit more robust.
---
 clang/test/CodeGen/arm-v8.1m-check-mve.ll | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.ll b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
index cfcb0223961e31e..6949f5529aeb653 100644
--- a/clang/test/CodeGen/arm-v8.1m-check-mve.ll
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
@@ -4,9 +4,7 @@
 ; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
 ; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
 ; CHECK: .fpu   fpv5-d16
-; CHECK  .arch_extension mve.fp
-target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "thumbv8.1m.main-none-unknown-eabihf"
+; CHECK-NEXT: .arch_extension mve.fp
 
 %struct.dummy_t = type { float, float, float, float }