[llvm] [AArch64] Inline asm v0-v31 are scalar when narrower than 64 bits (PR #169930)

Alexey Merzlyakov via llvm-commits llvm-commits at lists.llvm.org
Sun Nov 30 20:49:35 PST 2025


https://github.com/AlexeyMerzlyakov updated https://github.com/llvm/llvm-project/pull/169930

From e54e6cb0daedfcd540a76c7683d97abf59a79c27 Mon Sep 17 00:00:00 2001
From: Alexey Merzlyakov <alexey.merzlyakov at samsung.com>
Date: Fri, 28 Nov 2025 11:25:01 +0300
Subject: [PATCH] [AArch64] Inline asm v0-v31 are scalar when narrower than
 64 bits

If a "v0" register coming from inline asm holds a 32-bit (or smaller) value
but is treated as a vector register, codegen may produce incorrect
vector<->scalar conversions, which later trigger type-mismatch assertion
failures at compile time. The fix treats v0-v31 AArch64 registers as scalar
for values of 64 bits or less, extending the existing handling of exactly
64-bit values.

Fixes #153442
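
For context, a reduced reproducer in the spirit of the new test (a sketch
hand-expanded from the FOO(32) macro used to generate the test IR; not taken
verbatim from the original bug report):

  /* A 32-bit float pinned to the NEON register v0 and passed through an
   * inline-asm "nop" via the "w" (FP/SIMD register) constraint. Before this
   * fix, a sub-64-bit value fell through to the vector register-class path,
   * producing bad vector<->scalar conversions and a type-mismatch assert. */
  int foo32(void) {
    register float a0 asm("v0");
    for (int i = 0; i < 2; ++i)
      __asm__ volatile("nop" : [a0] "+w"(a0));
    return 0;
  }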
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   2 +-
 .../CodeGen/AArch64/inline-asm-nop-reg.ll     | 136 ++++++++++++++++++
 2 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/inline-asm-nop-reg.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e91f5a877b35b..8f3f6d27f5207 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13365,7 +13365,7 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
         // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
         // By default we'll emit v0-v31 for this unless there's a modifier where
         // we'll emit the correct register as well.
-        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
+        if (VT != MVT::Other && VT.getSizeInBits() <= 64) {
           Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
           Res.second = &AArch64::FPR64RegClass;
         } else {
diff --git a/llvm/test/CodeGen/AArch64/inline-asm-nop-reg.ll b/llvm/test/CodeGen/AArch64/inline-asm-nop-reg.ll
new file mode 100644
index 0000000000000..c73c3e6b90a5c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/inline-asm-nop-reg.ll
@@ -0,0 +1,136 @@
+; RUN: llc -O1 -mtriple=aarch64-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+; This test checks that code containing a "nop" inline assembly instruction
+; operating on a 16/32/64-bit FP value in the "v0" register compiles
+; successfully and that the generated code contains a single nop instruction
+; per function.
+;
+; IR for this test was generated from the following source code:
+;
+; #define _FP16 _Float16
+; #define _FP32 float
+; #define _FP64 double
+;
+; #define FOO(BITS) \
+; int foo##BITS(void) { \
+;   register _FP##BITS a0 asm("v0"); \
+;   for (int i = 0; i < 2; ++i) { \
+;     __asm__ volatile("nop" : [a0] "+w"(a0)::); \
+;   } \
+;   return 0; \
+; }
+;
+; FOO(16)
+; FOO(32)
+; FOO(64)
+
+
+; test nop_fp16_reg
+; CHECK-LABEL: foo16:
+; CHECK: nop
+; CHECK-NOT: nop
+define dso_local i32 @foo16() #0 {
+  %1 = alloca half, align 2
+  %2 = alloca i32, align 4
+  store i32 0, ptr %2, align 4
+  br label %3
+
+3:                                                ; preds = %9, %0
+  %4 = load i32, ptr %2, align 4
+  %5 = icmp slt i32 %4, 2
+  br i1 %5, label %6, label %12
+
+6:                                                ; preds = %3
+  %7 = load half, ptr %1, align 2
+  %8 = call half asm sideeffect "nop", "={v0},{v0}"(half %7) #1, !srcloc !6
+  store half %8, ptr %1, align 2
+  br label %9
+
+9:                                                ; preds = %6
+  %10 = load i32, ptr %2, align 4
+  %11 = add nsw i32 %10, 1
+  store i32 %11, ptr %2, align 4
+  br label %3, !llvm.loop !7
+
+12:                                               ; preds = %3
+  ret i32 0
+}
+
+; test nop_fp32_reg
+; CHECK-LABEL: foo32:
+; CHECK: nop
+; CHECK-NOT: nop
+define dso_local i32 @foo32() #0 {
+  %1 = alloca float, align 4
+  %2 = alloca i32, align 4
+  store i32 0, ptr %2, align 4
+  br label %3
+
+3:                                                ; preds = %9, %0
+  %4 = load i32, ptr %2, align 4
+  %5 = icmp slt i32 %4, 2
+  br i1 %5, label %6, label %12
+
+6:                                                ; preds = %3
+  %7 = load float, ptr %1, align 4
+  %8 = call float asm sideeffect "nop", "={v0},{v0}"(float %7) #1, !srcloc !9
+  store float %8, ptr %1, align 4
+  br label %9
+
+9:                                                ; preds = %6
+  %10 = load i32, ptr %2, align 4
+  %11 = add nsw i32 %10, 1
+  store i32 %11, ptr %2, align 4
+  br label %3, !llvm.loop !10
+
+12:                                               ; preds = %3
+  ret i32 0
+}
+
+; test nop_fp64_reg
+; CHECK-LABEL: foo64:
+; CHECK: nop
+; CHECK-NOT: nop
+define dso_local i32 @foo64() #0 {
+  %1 = alloca double, align 8
+  %2 = alloca i32, align 4
+  store i32 0, ptr %2, align 4
+  br label %3
+
+3:                                                ; preds = %9, %0
+  %4 = load i32, ptr %2, align 4
+  %5 = icmp slt i32 %4, 2
+  br i1 %5, label %6, label %12
+
+6:                                                ; preds = %3
+  %7 = load double, ptr %1, align 8
+  %8 = call double asm sideeffect "nop", "={v0},{v0}"(double %7) #1, !srcloc !11
+  store double %8, ptr %1, align 8
+  br label %9
+
+9:                                                ; preds = %6
+  %10 = load i32, ptr %2, align 4
+  %11 = add nsw i32 %10, 1
+  store i32 %11, ptr %2, align 4
+  br label %3, !llvm.loop !12
+
+12:                                               ; preds = %3
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{!"clang version 22.0.0git"}
+!6 = !{i64 2147502427}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.mustprogress"}
+!9 = !{i64 2147502622}
+!10 = distinct !{!10, !8}
+!11 = !{i64 2147502814}
+!12 = distinct !{!12, !8}


