[llvm] [AArch64] Fix failure with inline asm and svcount (PR #112537)

Sander de Smalen via llvm-commits <llvm-commits@lists.llvm.org>
Thu Oct 24 09:23:36 PDT 2024


https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/112537

From d5250c6dcf017f31810f5a10eca8884a106d68c1 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Wed, 16 Oct 2024 09:56:14 +0100
Subject: [PATCH 1/2] [AArch64] Fix failure with inline asm and svcount

This fixes an issue where the compiler runs into an assertion
failure for the following example:

  register svcount_t pred asm("pn8") = svptrue_c8();
  asm("ld1w { z0.s, z4.s, z8.s, z12.s }, %[pred]/z, [x0]\n"
    :
    : [pred] "Uph" (pred)
    : "memory", "cc");

Here the register constraint that ends up in the LLVM IR is "{pn8}",
but the code in `TargetLowering::getRegForInlineAsmConstraint`
that parses that string follows a path where it queries a
suitable register class for this register (the PPRorPNR register
class), for which it then chooses `nxv16i1` as a suitable type.
Individually these choices are correct, but their combination
isn't, because the type should be `aarch64svcount`.
This then leads to a failure later on in SelectionDAGBuilder.cpp
in CopyToReg, because the type of the actual value and the type
computed from the constraint don't match.

This patch pre-empts the issue by parsing the predicate register
explicitly and returning the correct register class.
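
For reference, a reduced LLVM IR reproducer (the same pattern as
the `explicit_pn8` test added below; the function and value names
here are illustrative):

  define void @explicit_pn8(ptr %p) {
    %pred = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
    %res = tail call i64 asm sideeffect "ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", "=r,{pn8},0"(target("aarch64.svcount") %pred, ptr %p)
    ret void
  }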
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 32 ++++++++++
 llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll  | 64 +++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9adb11292376ce..5a848ada9dd8ee 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11867,6 +11867,36 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
 
 enum class PredicateConstraint { Uph, Upl, Upa };
 
+// Returns a {Reg, RegisterClass} pair if the constraint is
+// a specific predicate register.
+//
+// For some constraint like "{pn3}" the default path in
+// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
+// suitable register class for this register is "PPRorPNR", after which it
+// determines that nxv16i1 is an appropriate type for the constraint, which is
+// not what we want. The code here pre-empts this by matching the register
+// explicitly.
+static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
+parsePredicateRegAsConstraint(StringRef Constraint) {
+  if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
+      Constraint[1] != 'p')
+    return std::nullopt;
+
+  Constraint = Constraint.substr(2, Constraint.size() - 3);
+  bool IsPredicateAsCount = Constraint.starts_with("n");
+  if (IsPredicateAsCount)
+    Constraint = Constraint.drop_front(1);
+
+  unsigned V;
+  if (Constraint.getAsInteger(10, V) || V > 31)
+    return std::nullopt;
+
+  if (IsPredicateAsCount)
+    return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
+  else
+    return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
+}
+
 static std::optional<PredicateConstraint>
 parsePredicateConstraint(StringRef Constraint) {
   return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
@@ -12114,6 +12144,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
       break;
     }
   } else {
+    if (const auto P = parsePredicateRegAsConstraint(Constraint))
+      return *P;
     if (const auto PC = parsePredicateConstraint(Constraint))
       if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
         return std::make_pair(0U, RegClass);
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
index 9f8897575b3d58..f587f315b6658d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
@@ -119,3 +119,67 @@ define <vscale x 8 x half> @test_svfadd_f16_Uph_constraint(<vscale x 16 x i1> %P
   %1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w, at 3Uph,w,w"(<vscale x 16 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
   ret <vscale x 8 x half> %1
 }
+
+define void @explicit_p0(ptr %p) {
+  ; CHECK-LABEL: name: explicit_p0
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[PTRUE_B:%[0-9]+]]:ppr = PTRUE_B 31, implicit $vg
+  ; CHECK-NEXT:   $p0 = COPY [[PTRUE_B]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 2818058 /* regdef:GPR64common */, def %1, 9 /* reguse */, $p0, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+  ; CHECK-NEXT:   RET_ReallyLR
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.b8(i32 31)
+  %2 = tail call i64 asm sideeffect "ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", "=r,{p0},0"(<vscale x 16 x i1> %1, ptr %p)
+  ret void
+}
+
+define void @explicit_p8_invalid(ptr %p) {
+  ; CHECK-LABEL: name: explicit_p8_invalid
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[PTRUE_B:%[0-9]+]]:ppr = PTRUE_B 31, implicit $vg
+  ; CHECK-NEXT:   $p8 = COPY [[PTRUE_B]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 2818058 /* regdef:GPR64common */, def %1, 9 /* reguse */, $p8, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+  ; CHECK-NEXT:   RET_ReallyLR
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.b8(i32 31)
+  %2 = tail call i64 asm sideeffect "ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", "=r,{p8},0"(<vscale x 16 x i1> %1, ptr %p)
+  ret void
+}
+
+define void @explicit_pn8(ptr %p) {
+  ; CHECK-LABEL: name: explicit_pn8
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[PTRUE_C_B:%[0-9]+]]:pnr_p8to15 = PTRUE_C_B implicit $vg
+  ; CHECK-NEXT:   $pn8 = COPY [[PTRUE_C_B]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 2818058 /* regdef:GPR64common */, def %1, 9 /* reguse */, $pn8, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+  ; CHECK-NEXT:   RET_ReallyLR
+  %1 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %2 = tail call i64 asm sideeffect "ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", "=r,{pn8},0"(target("aarch64.svcount") %1, ptr %p)
+  ret void
+}
+
+define void @explicit_pn0_invalid(ptr %p) {
+  ; CHECK-LABEL: name: explicit_pn0_invalid
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   [[PTRUE_C_B:%[0-9]+]]:pnr_p8to15 = PTRUE_C_B implicit $vg
+  ; CHECK-NEXT:   $pn0 = COPY [[PTRUE_C_B]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]]
+  ; CHECK-NEXT:   INLINEASM &"ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 2818058 /* regdef:GPR64common */, def %1, 9 /* reguse */, $pn0, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+  ; CHECK-NEXT:   RET_ReallyLR
+  %1 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %2 = tail call i64 asm sideeffect "ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", "=r,{pn0},0"(target("aarch64.svcount") %1, ptr %p)
+  ret void
+}

From 9da6284fda909371ec67debc9eb517a166eac130 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Thu, 24 Oct 2024 16:21:17 +0000
Subject: [PATCH 2/2] Fix up magic numbers in MIR test

---
 llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
index f587f315b6658d..ff66206228a4aa 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
@@ -129,7 +129,7 @@ define void @explicit_p0(ptr %p) {
   ; CHECK-NEXT:   [[PTRUE_B:%[0-9]+]]:ppr = PTRUE_B 31, implicit $vg
   ; CHECK-NEXT:   $p0 = COPY [[PTRUE_B]]
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]]
-  ; CHECK-NEXT:   INLINEASM &"ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 2818058 /* regdef:GPR64common */, def %1, 9 /* reguse */, $p0, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+  ; CHECK-NEXT:   INLINEASM &"ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 3538954 /* regdef:GPR64common */, def %1, 9 /* reguse */, $p0, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
   ; CHECK-NEXT:   RET_ReallyLR
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.b8(i32 31)
   %2 = tail call i64 asm sideeffect "ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", "=r,{p0},0"(<vscale x 16 x i1> %1, ptr %p)
@@ -145,7 +145,7 @@ define void @explicit_p8_invalid(ptr %p) {
   ; CHECK-NEXT:   [[PTRUE_B:%[0-9]+]]:ppr = PTRUE_B 31, implicit $vg
   ; CHECK-NEXT:   $p8 = COPY [[PTRUE_B]]
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]]
-  ; CHECK-NEXT:   INLINEASM &"ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 2818058 /* regdef:GPR64common */, def %1, 9 /* reguse */, $p8, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+  ; CHECK-NEXT:   INLINEASM &"ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 3538954 /* regdef:GPR64common */, def %1, 9 /* reguse */, $p8, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
   ; CHECK-NEXT:   RET_ReallyLR
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.b8(i32 31)
   %2 = tail call i64 asm sideeffect "ld4w { z0.s, z1.s, z2.s, z3.s }, $1/z, [$0]", "=r,{p8},0"(<vscale x 16 x i1> %1, ptr %p)
@@ -161,7 +161,7 @@ define void @explicit_pn8(ptr %p) {
   ; CHECK-NEXT:   [[PTRUE_C_B:%[0-9]+]]:pnr_p8to15 = PTRUE_C_B implicit $vg
   ; CHECK-NEXT:   $pn8 = COPY [[PTRUE_C_B]]
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]]
-  ; CHECK-NEXT:   INLINEASM &"ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 2818058 /* regdef:GPR64common */, def %1, 9 /* reguse */, $pn8, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+  ; CHECK-NEXT:   INLINEASM &"ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 3538954 /* regdef:GPR64common */, def %1, 9 /* reguse */, $pn8, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
   ; CHECK-NEXT:   RET_ReallyLR
   %1 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
   %2 = tail call i64 asm sideeffect "ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", "=r,{pn8},0"(target("aarch64.svcount") %1, ptr %p)
@@ -177,7 +177,7 @@ define void @explicit_pn0_invalid(ptr %p) {
   ; CHECK-NEXT:   [[PTRUE_C_B:%[0-9]+]]:pnr_p8to15 = PTRUE_C_B implicit $vg
   ; CHECK-NEXT:   $pn0 = COPY [[PTRUE_C_B]]
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]]
-  ; CHECK-NEXT:   INLINEASM &"ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 2818058 /* regdef:GPR64common */, def %1, 9 /* reguse */, $pn0, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+  ; CHECK-NEXT:   INLINEASM &"ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", 1 /* sideeffect attdialect */, 3538954 /* regdef:GPR64common */, def %1, 9 /* reguse */, $pn0, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
   ; CHECK-NEXT:   RET_ReallyLR
   %1 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
   %2 = tail call i64 asm sideeffect "ld1w { z0.s, z4.s, z8.s, z12.s }, $1/z, [$0]", "=r,{pn0},0"(target("aarch64.svcount") %1, ptr %p)


