[clang] [llvm] [llvm][ARM] Emit MVE .arch_extension after .fpu directive if it does not include MVE features (PR #71545)

Thu Nov 16 03:35:22 PST 2023

https://github.com/simpal01 updated https://github.com/llvm/llvm-project/pull/71545

>From e2b6fa1cb088f2c3cf05a38959a3f3d34eda92c5 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Tue, 7 Nov 2023 13:05:08 +0000
Subject: [PATCH 1/5] [ARM] .fpu equals fpv5-d16 disables floating point MVE
 which leads to unsupported MVE instructions for cortex M85/M55.

The floating-point and MVE features together specify the MVE functionality that is supported on the Cortex-M85 processor. But the FPU extension for the underlying architecture (armv8.1-m.main) is FPV5, which does not include MVE-F.

So either when we explicitly specify -mfpu=fpv5-d16, or in the compiler's -S output and with `-save-temps=obj`, the MVE feature is lost, which leads to an assembler error. What is happening here is that the .fpu directive overrides any features previously set by the .cpu directive. Since the corresponding generated .fpu directive (.fpu fpv5-d16) does not include MVE-F, it overrides those features even though they are supported and were set by the .cpu directive. It looks like .fpu is supposed to do this.

In this case, there should be an .arch_extension directive re-enabling the relevant extensions after .fpu if the goal is to keep these extensions enabled. GCC also does the same.

So this patch enables the MVE features by:
  .fpu fpv5-d16
  .arch_extension mve.fp
---
 clang/test/CodeGen/arm-v8.1m-check-mve.ll     | 56 +++++++++++++++++++
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp |  3 +
 .../ARM/MCTargetDesc/ARMTargetStreamer.cpp    | 16 ++++--
 3 files changed, 69 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CodeGen/arm-v8.1m-check-mve.ll

diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.ll b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
new file mode 100644
index 000000000000000..cfcb0223961e31e
--- /dev/null
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
@@ -0,0 +1,56 @@
+; REQUIRES: arm-registered-target
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+; CHECK: .fpu   fpv5-d16
+; CHECK  .arch_extension mve.fp
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-none-unknown-eabihf"
+
+%struct.dummy_t = type { float, float, float, float }
+
+define dso_local signext i8 @foo(ptr noundef %handle) #0 {
+entry:
+  %handle.addr = alloca ptr, align 4
+  store ptr %handle, ptr %handle.addr, align 4
+  %0 = load ptr, ptr %handle.addr, align 4
+  %a = getelementptr inbounds %struct.dummy_t, ptr %0, i32 0, i32 0
+  %1 = load float, ptr %a, align 4
+  %sub = fsub float 0x3F5439DE40000000, %1
+  %2 = load ptr, ptr %handle.addr, align 4
+  %a1 = getelementptr inbounds %struct.dummy_t, ptr %2, i32 0, i32 0
+  %3 = load float, ptr %a1, align 4
+  %4 = call float @llvm.fmuladd.f32(float 0x3F847AE140000000, float %sub, float %3)
+  store float %4, ptr %a1, align 4
+  %5 = load ptr, ptr %handle.addr, align 4
+  %b = getelementptr inbounds %struct.dummy_t, ptr %5, i32 0, i32 1
+  %6 = load float, ptr %b, align 4
+  %sub2 = fsub float 0x3F5439DE40000000, %6
+  %7 = load ptr, ptr %handle.addr, align 4
+  %b3 = getelementptr inbounds %struct.dummy_t, ptr %7, i32 0, i32 1
+  %8 = load float, ptr %b3, align 4
+  %9 = call float @llvm.fmuladd.f32(float 0x3F947AE140000000, float %sub2, float %8)
+  store float %9, ptr %b3, align 4
+  %10 = load ptr, ptr %handle.addr, align 4
+  %c = getelementptr inbounds %struct.dummy_t, ptr %10, i32 0, i32 2
+  %11 = load float, ptr %c, align 4
+  %sub4 = fsub float 0x3F5439DE40000000, %11
+  %12 = load ptr, ptr %handle.addr, align 4
+  %c5 = getelementptr inbounds %struct.dummy_t, ptr %12, i32 0, i32 2
+  %13 = load float, ptr %c5, align 4
+  %14 = call float @llvm.fmuladd.f32(float 0x3F9EB851E0000000, float %sub4, float %13)
+  store float %14, ptr %c5, align 4
+  %15 = load ptr, ptr %handle.addr, align 4
+  %d = getelementptr inbounds %struct.dummy_t, ptr %15, i32 0, i32 3
+  %16 = load float, ptr %d, align 4
+  %sub6 = fsub float 0x3F5439DE40000000, %16
+  %17 = load ptr, ptr %handle.addr, align 4
+  %d7 = getelementptr inbounds %struct.dummy_t, ptr %17, i32 0, i32 3
+  %18 = load float, ptr %d7, align 4
+  %19 = call float @llvm.fmuladd.f32(float 0x3FA47AE140000000, float %sub6, float %18)
+  store float %19, ptr %d7, align 4
+  ret i8 0
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) #1
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 373d5b59bca6640..20b52ebc544a1ed 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -12648,6 +12648,9 @@ bool ARMAsmParser::enableArchExtFeature(StringRef Name, SMLoc &ExtLoc) {
       {ARM::AEK_CRYPTO,
        {Feature_HasV8Bit},
        {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8}},
+      {(ARM::AEK_DSP | ARM::AEK_SIMD | ARM::AEK_FP),
+       {Feature_HasV8_1MMainlineBit},
+       {ARM::HasMVEFloatOps}},
       {ARM::AEK_FP,
        {Feature_HasV8Bit},
        {ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8}},
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index b65d1b24e63d39b..e84b597e4382edc 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -238,14 +238,18 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
                         ? ARMBuildAttrs::AllowNeonARMv8_1a
                         : ARMBuildAttrs::AllowNeonARMv8);
   } else {
-    if (STI.hasFeature(ARM::FeatureFPARMv8_D16_SP))
+    if (STI.hasFeature(ARM::FeatureFPARMv8_D16_SP)) {
       // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
       // FPU, but there are two different names for it depending on the CPU.
-      emitFPU(STI.hasFeature(ARM::FeatureD32)
-                  ? ARM::FK_FP_ARMV8
-                  : (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_FPV5_D16
-                                                      : ARM::FK_FPV5_SP_D16));
-    else if (STI.hasFeature(ARM::FeatureVFP4_D16_SP))
+      if (STI.hasFeature(ARM::FeatureD32))
+        emitFPU(ARM::FK_FP_ARMV8);
+      else {
+        emitFPU(STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_FPV5_D16
+                                                 : ARM::FK_FPV5_SP_D16);
+        if (STI.hasFeature(ARM::HasMVEFloatOps))
+          emitArchExtension(ARM::AEK_SIMD | ARM::AEK_DSP | ARM::AEK_FP);
+      }
+    } else if (STI.hasFeature(ARM::FeatureVFP4_D16_SP))
       emitFPU(STI.hasFeature(ARM::FeatureD32)
                   ? ARM::FK_VFPV4
                   : (STI.hasFeature(ARM::FeatureFP64) ? ARM::FK_VFPV4_D16

>From 951537dbb7ce66451bf3c7d5f00d819893f5b264 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Thu, 9 Nov 2023 16:25:25 +0000
Subject: [PATCH 2/5] fixup! [llvm][ARM] Emit MVE .arch_extension after .fpu
 directive if it does not include MVE features.

  1. .arch_extension will always be on the very next line.
     CHECK-NEXT would be a bit more robust.
---
 clang/test/CodeGen/arm-v8.1m-check-mve.ll | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.ll b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
index cfcb0223961e31e..6949f5529aeb653 100644
--- a/clang/test/CodeGen/arm-v8.1m-check-mve.ll
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
@@ -4,9 +4,7 @@
 ; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
 ; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
 ; CHECK: .fpu   fpv5-d16
-; CHECK  .arch_extension mve.fp
-target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "thumbv8.1m.main-none-unknown-eabihf"
+; CHECK-NEXT  .arch_extension mve.fp
 
 %struct.dummy_t = type { float, float, float, float }
 

>From 38c3ee09cc393f5efcc3d44882a85100d8daeac1 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Wed, 15 Nov 2023 16:45:15 +0000
Subject: [PATCH 3/5] fixup! [ARM] .fpu equals fpv5-d16 disables floating point
 MVE which leads to unsupported MVE instructions for cortex M85/M55.

---
 clang/test/CodeGen/arm-v8.1m-check-mve.c | 35 ++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 clang/test/CodeGen/arm-v8.1m-check-mve.c

diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.c b/clang/test/CodeGen/arm-v8.1m-check-mve.c
new file mode 100644
index 000000000000000..221f1cbc27ece5b
--- /dev/null
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.c
@@ -0,0 +1,35 @@
+// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -save-temps=obj -S -o - %s | FileCheck %s
+// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
+
+// REQUIRES: arm-registered-target
+
+// CHECK: .fpu   fpv5-d16
+// CHECK-NEXT  .arch_extension mve.fp
+
+#define DUMMY_CONST_1 (0.0012345F)
+#define DUMMY_CONST_2 (0.01F)
+#define DUMMY_CONST_3 (0.02F)
+#define DUMMY_CONST_4 (0.03F)
+#define DUMMY_CONST_5 (0.04F)
+
+typedef struct
+{
+    float a;
+    float b;
+    float c;
+    float d;
+} dummy_t;
+
+// CHECK-LABEL: foo
+// CHECK: vsub.f32        q0, q0, q1
+// CHECK-NEXT: vfma.f32        q1, q0, q2
+
+signed char foo(dummy_t *handle)
+{
+    handle->a += DUMMY_CONST_2 * (DUMMY_CONST_1 - handle->a);
+    handle->b += DUMMY_CONST_3 * (DUMMY_CONST_1 - handle->b);
+    handle->c += DUMMY_CONST_4 * (DUMMY_CONST_1 - handle->c);
+    handle->d += DUMMY_CONST_5 * (DUMMY_CONST_1 - handle->d);
+    return 0;
+}

>From e8bf45c8d08a7fd095dd4e9eb051834d11ef48bf Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Wed, 15 Nov 2023 16:55:51 +0000
Subject: [PATCH 4/5] fixup! [ARM] .fpu equals fpv5-d16 disables floating point
 MVE which leads to unsupported MVE instructions for cortex M85/M55.

---
 clang/test/CodeGen/arm-v8.1m-check-mve.ll | 54 -----------------------
 1 file changed, 54 deletions(-)
 delete mode 100644 clang/test/CodeGen/arm-v8.1m-check-mve.ll

diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.ll b/clang/test/CodeGen/arm-v8.1m-check-mve.ll
deleted file mode 100644
index 6949f5529aeb653..000000000000000
--- a/clang/test/CodeGen/arm-v8.1m-check-mve.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; REQUIRES: arm-registered-target
-; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
-; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -save-temps=obj -S -o - %s | FileCheck %s
-; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
-; RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
-; CHECK: .fpu   fpv5-d16
-; CHECK-NEXT  .arch_extension mve.fp
-
-%struct.dummy_t = type { float, float, float, float }
-
-define dso_local signext i8 @foo(ptr noundef %handle) #0 {
-entry:
-  %handle.addr = alloca ptr, align 4
-  store ptr %handle, ptr %handle.addr, align 4
-  %0 = load ptr, ptr %handle.addr, align 4
-  %a = getelementptr inbounds %struct.dummy_t, ptr %0, i32 0, i32 0
-  %1 = load float, ptr %a, align 4
-  %sub = fsub float 0x3F5439DE40000000, %1
-  %2 = load ptr, ptr %handle.addr, align 4
-  %a1 = getelementptr inbounds %struct.dummy_t, ptr %2, i32 0, i32 0
-  %3 = load float, ptr %a1, align 4
-  %4 = call float @llvm.fmuladd.f32(float 0x3F847AE140000000, float %sub, float %3)
-  store float %4, ptr %a1, align 4
-  %5 = load ptr, ptr %handle.addr, align 4
-  %b = getelementptr inbounds %struct.dummy_t, ptr %5, i32 0, i32 1
-  %6 = load float, ptr %b, align 4
-  %sub2 = fsub float 0x3F5439DE40000000, %6
-  %7 = load ptr, ptr %handle.addr, align 4
-  %b3 = getelementptr inbounds %struct.dummy_t, ptr %7, i32 0, i32 1
-  %8 = load float, ptr %b3, align 4
-  %9 = call float @llvm.fmuladd.f32(float 0x3F947AE140000000, float %sub2, float %8)
-  store float %9, ptr %b3, align 4
-  %10 = load ptr, ptr %handle.addr, align 4
-  %c = getelementptr inbounds %struct.dummy_t, ptr %10, i32 0, i32 2
-  %11 = load float, ptr %c, align 4
-  %sub4 = fsub float 0x3F5439DE40000000, %11
-  %12 = load ptr, ptr %handle.addr, align 4
-  %c5 = getelementptr inbounds %struct.dummy_t, ptr %12, i32 0, i32 2
-  %13 = load float, ptr %c5, align 4
-  %14 = call float @llvm.fmuladd.f32(float 0x3F9EB851E0000000, float %sub4, float %13)
-  store float %14, ptr %c5, align 4
-  %15 = load ptr, ptr %handle.addr, align 4
-  %d = getelementptr inbounds %struct.dummy_t, ptr %15, i32 0, i32 3
-  %16 = load float, ptr %d, align 4
-  %sub6 = fsub float 0x3F5439DE40000000, %16
-  %17 = load ptr, ptr %handle.addr, align 4
-  %d7 = getelementptr inbounds %struct.dummy_t, ptr %17, i32 0, i32 3
-  %18 = load float, ptr %d7, align 4
-  %19 = call float @llvm.fmuladd.f32(float 0x3FA47AE140000000, float %sub6, float %18)
-  store float %19, ptr %d7, align 4
-  ret i8 0
-}
-
-declare float @llvm.fmuladd.f32(float, float, float) #1

>From 891c77b45605fb25ec4100ca2b194675f9b31dc0 Mon Sep 17 00:00:00 2001
From: Simi Pallipurath <simi.pallipurath.com>
Date: Thu, 16 Nov 2023 11:28:58 +0000
Subject: [PATCH 5/5] fixup! [ARM] .fpu equals fpv5-d16 disables floating point
 MVE which leads to unsupported MVE instructions for cortex M85/M55.

---
 clang/test/CodeGen/arm-v8.1m-check-mve.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/clang/test/CodeGen/arm-v8.1m-check-mve.c b/clang/test/CodeGen/arm-v8.1m-check-mve.c
index 221f1cbc27ece5b..4f5941c782f0cc9 100644
--- a/clang/test/CodeGen/arm-v8.1m-check-mve.c
+++ b/clang/test/CodeGen/arm-v8.1m-check-mve.c
@@ -1,4 +1,8 @@
 // RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -save-temps=obj -S -o - %s | FileCheck %s
+// RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -save-temps=obj -S -o - %s | FileCheck %s
+
+// The below tests are to make sure that assembly directives do not lose mve feature so that reassembly works with
+// mve floating point instructions.
 // RUN: %clang --target=arm-none-eabi -mcpu=cortex-m85 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
 // RUN: %clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O2 -c -mthumb -save-temps=obj %s
 
@@ -8,10 +12,6 @@
 // CHECK-NEXT  .arch_extension mve.fp
 
 #define DUMMY_CONST_1 (0.0012345F)
-#define DUMMY_CONST_2 (0.01F)
-#define DUMMY_CONST_3 (0.02F)
-#define DUMMY_CONST_4 (0.03F)
-#define DUMMY_CONST_5 (0.04F)
 
 typedef struct
 {
@@ -22,14 +22,14 @@ typedef struct
 } dummy_t;
 
 // CHECK-LABEL: foo
-// CHECK: vsub.f32        q0, q0, q1
-// CHECK-NEXT: vfma.f32        q1, q0, q2
+// CHECK: vsub.f32
+// CHECK: vfma.f32
 
 signed char foo(dummy_t *handle)
 {
-    handle->a += DUMMY_CONST_2 * (DUMMY_CONST_1 - handle->a);
-    handle->b += DUMMY_CONST_3 * (DUMMY_CONST_1 - handle->b);
-    handle->c += DUMMY_CONST_4 * (DUMMY_CONST_1 - handle->c);
-    handle->d += DUMMY_CONST_5 * (DUMMY_CONST_1 - handle->d);
+    handle->a += 0.01F * (DUMMY_CONST_1 - handle->a);
+    handle->b += 0.02F * (DUMMY_CONST_1 - handle->b);
+    handle->c += 0.03F * (DUMMY_CONST_1 - handle->c);
+    handle->d += 0.04F * (DUMMY_CONST_1 - handle->d);
     return 0;
 }