[llvm] [AArch64][SME] Lower memchr to __arm_sc_memchr in streaming[-compatible] functions (PR #168896)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 20 08:51:32 PST 2025


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/168896
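
For readers unfamiliar with the SME support routines: __arm_sc_memchr is the
streaming-compatible counterpart of memchr, defined alongside __arm_sc_memcpy,
__arm_sc_memmove and __arm_sc_memset. A rough sketch of its expected C++-level
declaration follows (the exact header and attribute spelling are assumptions on
my part, not something this patch adds):

  #include <cstddef>

  // Assumed declaration shape of the routine the backend now calls; the
  // definition is expected to be provided by compiler-rt's SME support
  // routines, and it may be called with PSTATE.SM either on or off.
  extern "C" void *__arm_sc_memchr(const void *s, int c, std::size_t n)
      __arm_streaming_compatible;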

From b90315b5093091097faaefd513278f5b6134e489 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 20 Nov 2025 15:44:31 +0000
Subject: [PATCH 1/3] Precommit test

---
 .../streaming-compatible-memory-ops.ll        | 319 +++++++++++++++++-
 1 file changed, 315 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index 9c66b38c46973..b87a81856056d 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -153,6 +153,316 @@ entry:
   ret void
 }
 
+define ptr @se_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: se_memchr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Spill
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    .cfi_offset b8, -40
+; CHECK-NEXT:    .cfi_offset b9, -48
+; CHECK-NEXT:    .cfi_offset b10, -56
+; CHECK-NEXT:    .cfi_offset b11, -64
+; CHECK-NEXT:    .cfi_offset b12, -72
+; CHECK-NEXT:    .cfi_offset b13, -80
+; CHECK-NEXT:    .cfi_offset b14, -88
+; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    mov x2, x1
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    mov w1, #5 // =0x5
+; CHECK-NEXT:    bl memchr
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    .cfi_restore b8
+; CHECK-NEXT:    .cfi_restore b9
+; CHECK-NEXT:    .cfi_restore b10
+; CHECK-NEXT:    .cfi_restore b11
+; CHECK-NEXT:    .cfi_restore b12
+; CHECK-NEXT:    .cfi_restore b13
+; CHECK-NEXT:    .cfi_restore b14
+; CHECK-NEXT:    .cfi_restore b15
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-SME-ROUTINES-LABEL: se_memchr:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x9, [sp, #80] // 8-byte Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset vg, -16
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset w30, -24
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset w29, -32
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b8, -40
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b9, -48
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b10, -56
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b11, -64
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b12, -72
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b13, -80
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b14, -88
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b15, -96
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x1
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:    mov w1, #5 // =0x5
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memchr
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore vg
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore w30
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore w29
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b8
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b9
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b10
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b11
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b12
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b13
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b14
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b15
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
+;
+; CHECK-MOPS-LABEL: se_memchr:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-MOPS-NEXT:    cntd x9
+; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    str x9, [sp, #80] // 8-byte Spill
+; CHECK-MOPS-NEXT:    .cfi_offset vg, -16
+; CHECK-MOPS-NEXT:    .cfi_offset w30, -24
+; CHECK-MOPS-NEXT:    .cfi_offset w29, -32
+; CHECK-MOPS-NEXT:    .cfi_offset b8, -40
+; CHECK-MOPS-NEXT:    .cfi_offset b9, -48
+; CHECK-MOPS-NEXT:    .cfi_offset b10, -56
+; CHECK-MOPS-NEXT:    .cfi_offset b11, -64
+; CHECK-MOPS-NEXT:    .cfi_offset b12, -72
+; CHECK-MOPS-NEXT:    .cfi_offset b13, -80
+; CHECK-MOPS-NEXT:    .cfi_offset b14, -88
+; CHECK-MOPS-NEXT:    .cfi_offset b15, -96
+; CHECK-MOPS-NEXT:    mov x2, x1
+; CHECK-MOPS-NEXT:    smstop sm
+; CHECK-MOPS-NEXT:    mov w1, #5 // =0x5
+; CHECK-MOPS-NEXT:    bl memchr
+; CHECK-MOPS-NEXT:    smstart sm
+; CHECK-MOPS-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-MOPS-NEXT:    .cfi_restore vg
+; CHECK-MOPS-NEXT:    .cfi_restore w30
+; CHECK-MOPS-NEXT:    .cfi_restore w29
+; CHECK-MOPS-NEXT:    .cfi_restore b8
+; CHECK-MOPS-NEXT:    .cfi_restore b9
+; CHECK-MOPS-NEXT:    .cfi_restore b10
+; CHECK-MOPS-NEXT:    .cfi_restore b11
+; CHECK-MOPS-NEXT:    .cfi_restore b12
+; CHECK-MOPS-NEXT:    .cfi_restore b13
+; CHECK-MOPS-NEXT:    .cfi_restore b14
+; CHECK-MOPS-NEXT:    .cfi_restore b15
+; CHECK-MOPS-NEXT:    ret
+entry:
+  %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n)
+  ret ptr %res
+}
+
+define ptr @sc_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: sc_memchr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    .cfi_offset b8, -40
+; CHECK-NEXT:    .cfi_offset b9, -48
+; CHECK-NEXT:    .cfi_offset b10, -56
+; CHECK-NEXT:    .cfi_offset b11, -64
+; CHECK-NEXT:    .cfi_offset b12, -72
+; CHECK-NEXT:    .cfi_offset b13, -80
+; CHECK-NEXT:    .cfi_offset b14, -88
+; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    mov x2, x1
+; CHECK-NEXT:    mrs x19, SVCR
+; CHECK-NEXT:    tbz w19, #0, .LBB4_2
+; CHECK-NEXT:  // %bb.1: // %entry
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:  .LBB4_2: // %entry
+; CHECK-NEXT:    mov w1, #5 // =0x5
+; CHECK-NEXT:    bl memchr
+; CHECK-NEXT:    tbz w19, #0, .LBB4_4
+; CHECK-NEXT:  // %bb.3: // %entry
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:  .LBB4_4: // %entry
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    .cfi_restore b8
+; CHECK-NEXT:    .cfi_restore b9
+; CHECK-NEXT:    .cfi_restore b10
+; CHECK-NEXT:    .cfi_restore b11
+; CHECK-NEXT:    .cfi_restore b12
+; CHECK-NEXT:    .cfi_restore b13
+; CHECK-NEXT:    .cfi_restore b14
+; CHECK-NEXT:    .cfi_restore b15
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-SME-ROUTINES-LABEL: sc_memchr:
+; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset w19, -8
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset vg, -16
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset w30, -24
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset w29, -32
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b8, -40
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b9, -48
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b10, -56
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b11, -64
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b12, -72
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b13, -80
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b14, -88
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_offset b15, -96
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x1
+; CHECK-NO-SME-ROUTINES-NEXT:    mrs x19, SVCR
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB4_2
+; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.1: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB4_2: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    mov w1, #5 // =0x5
+; CHECK-NO-SME-ROUTINES-NEXT:    bl memchr
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB4_4
+; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.3: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB4_4: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x19, [sp, #88] // 8-byte Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore w19
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore vg
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore w30
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore w29
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b8
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b9
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b10
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b11
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b12
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b13
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b14
+; CHECK-NO-SME-ROUTINES-NEXT:    .cfi_restore b15
+; CHECK-NO-SME-ROUTINES-NEXT:    ret
+;
+; CHECK-MOPS-LABEL: sc_memchr:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-MOPS-NEXT:    cntd x9
+; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    .cfi_offset w19, -8
+; CHECK-MOPS-NEXT:    .cfi_offset vg, -16
+; CHECK-MOPS-NEXT:    .cfi_offset w30, -24
+; CHECK-MOPS-NEXT:    .cfi_offset w29, -32
+; CHECK-MOPS-NEXT:    .cfi_offset b8, -40
+; CHECK-MOPS-NEXT:    .cfi_offset b9, -48
+; CHECK-MOPS-NEXT:    .cfi_offset b10, -56
+; CHECK-MOPS-NEXT:    .cfi_offset b11, -64
+; CHECK-MOPS-NEXT:    .cfi_offset b12, -72
+; CHECK-MOPS-NEXT:    .cfi_offset b13, -80
+; CHECK-MOPS-NEXT:    .cfi_offset b14, -88
+; CHECK-MOPS-NEXT:    .cfi_offset b15, -96
+; CHECK-MOPS-NEXT:    mov x2, x1
+; CHECK-MOPS-NEXT:    mrs x19, SVCR
+; CHECK-MOPS-NEXT:    tbz w19, #0, .LBB4_2
+; CHECK-MOPS-NEXT:  // %bb.1: // %entry
+; CHECK-MOPS-NEXT:    smstop sm
+; CHECK-MOPS-NEXT:  .LBB4_2: // %entry
+; CHECK-MOPS-NEXT:    mov w1, #5 // =0x5
+; CHECK-MOPS-NEXT:    bl memchr
+; CHECK-MOPS-NEXT:    tbz w19, #0, .LBB4_4
+; CHECK-MOPS-NEXT:  // %bb.3: // %entry
+; CHECK-MOPS-NEXT:    smstart sm
+; CHECK-MOPS-NEXT:  .LBB4_4: // %entry
+; CHECK-MOPS-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldr x19, [sp, #88] // 8-byte Reload
+; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-MOPS-NEXT:    .cfi_restore w19
+; CHECK-MOPS-NEXT:    .cfi_restore vg
+; CHECK-MOPS-NEXT:    .cfi_restore w30
+; CHECK-MOPS-NEXT:    .cfi_restore w29
+; CHECK-MOPS-NEXT:    .cfi_restore b8
+; CHECK-MOPS-NEXT:    .cfi_restore b9
+; CHECK-MOPS-NEXT:    .cfi_restore b10
+; CHECK-MOPS-NEXT:    .cfi_restore b11
+; CHECK-MOPS-NEXT:    .cfi_restore b12
+; CHECK-MOPS-NEXT:    .cfi_restore b13
+; CHECK-MOPS-NEXT:    .cfi_restore b14
+; CHECK-MOPS-NEXT:    .cfi_restore b15
+; CHECK-MOPS-NEXT:    ret
+entry:
+  %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n)
+  ret ptr %res
+}
+
 define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: sc_memcpy:
 ; CHECK:       // %bb.0: // %entry
@@ -179,15 +489,15 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NO-SME-ROUTINES-NEXT:    mrs x19, SVCR
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
-; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_2
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB5_2
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.1: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
-; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_2: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB5_2: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
-; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_4
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB5_4
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.3: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
-; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_4: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB5_4: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
@@ -283,3 +593,4 @@ entry:
 declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
 declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
+declare ptr @memchr(ptr, i32, i64)

From 4b834bcc9daa7c4bac44553cb775227124b94379 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 20 Nov 2025 15:45:48 +0000
Subject: [PATCH 2/3] [AArch64][SME] Lower memchr to __arm_sc_memchr in
 streaming[-compatible] functions

This allows us to avoid the streaming-mode switches (smstop sm / smstart sm,
plus the associated FP/SIMD register spills) that are otherwise emitted around
memchr calls in streaming and streaming-compatible functions.
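
For context, a rough user-level sketch of the case this improves (not part of
the patch: it assumes an SME-enabled Clang target and the ACLE __arm_streaming
keyword, and find_tag is an illustrative name):

  #include <cstddef>
  #include <cstring>

  // Calling memchr from a streaming function normally forces the compiler to
  // bracket the call with smstop sm / smstart sm and to save and restore the
  // FP/SIMD state. With this change the backend is expected to emit
  // "bl __arm_sc_memchr" instead, which is streaming-compatible, so no mode
  // switch is needed.
  const char *find_tag(const char *buf, std::size_t n) __arm_streaming {
    return static_cast<const char *>(std::memchr(buf, 5, n));
  }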
---
 llvm/include/llvm/IR/RuntimeLibcalls.td       |   2 +
 .../AArch64/AArch64SelectionDAGInfo.cpp       |  35 +++-
 .../Target/AArch64/AArch64SelectionDAGInfo.h  |   9 +-
 .../streaming-compatible-memory-ops.ll        | 196 ++----------------
 4 files changed, 58 insertions(+), 184 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index ce7e836f66446..71e0edf03a16d 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -315,6 +315,7 @@ def MEMCMP : RuntimeLibcall;
 def MEMCPY : RuntimeLibcall;
 def MEMMOVE : RuntimeLibcall;
 def MEMSET : RuntimeLibcall;
+def MEMCHR : RuntimeLibcall;
 def CALLOC : RuntimeLibcall;
 def BZERO : RuntimeLibcall;
 def STRLEN : RuntimeLibcall;
@@ -997,6 +998,7 @@ def fesetmode : RuntimeLibcallImpl<FESETMODE>;
 def memcpy : RuntimeLibcallImpl<MEMCPY>;
 def memmove : RuntimeLibcallImpl<MEMMOVE>;
 def memset : RuntimeLibcallImpl<MEMSET>;
+def memchr : RuntimeLibcallImpl<MEMCHR>;
 
 // DSEPass can emit calloc if it finds a pair of malloc/memset
 def calloc : RuntimeLibcallImpl<CALLOC>;
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 48e03ad853d26..38c7a3d55f856 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -156,29 +156,35 @@ SDValue AArch64SelectionDAGInfo::EmitMOPS(unsigned Opcode, SelectionDAG &DAG,
 }
 
 SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall(
-    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Op0, SDValue Op1,
     SDValue Size, RTLIB::Libcall LC) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
   const AArch64TargetLowering *TLI = STI.getTargetLowering();
   TargetLowering::ArgListTy Args;
-  Args.emplace_back(Dst, PointerType::getUnqual(*DAG.getContext()));
+  Args.emplace_back(Op0, PointerType::getUnqual(*DAG.getContext()));
 
   RTLIB::Libcall NewLC;
   switch (LC) {
   case RTLIB::MEMCPY: {
     NewLC = RTLIB::SC_MEMCPY;
-    Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext()));
+    Args.emplace_back(Op1, PointerType::getUnqual(*DAG.getContext()));
     break;
   }
   case RTLIB::MEMMOVE: {
     NewLC = RTLIB::SC_MEMMOVE;
-    Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext()));
+    Args.emplace_back(Op1, PointerType::getUnqual(*DAG.getContext()));
     break;
   }
   case RTLIB::MEMSET: {
     NewLC = RTLIB::SC_MEMSET;
-    Args.emplace_back(DAG.getZExtOrTrunc(Src, DL, MVT::i32),
+    Args.emplace_back(DAG.getZExtOrTrunc(Op1, DL, MVT::i32),
+                      Type::getInt32Ty(*DAG.getContext()));
+    break;
+  }
+  case RTLIB::MEMCHR: {
+    NewLC = RTLIB::SC_MEMCHR;
+    Args.emplace_back(DAG.getZExtOrTrunc(Op1, DL, MVT::i32),
                       Type::getInt32Ty(*DAG.getContext()));
     break;
   }
@@ -194,7 +200,11 @@ SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall(
   PointerType *RetTy = PointerType::getUnqual(*DAG.getContext());
   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
       TLI->getLibcallCallingConv(NewLC), RetTy, Symbol, std::move(Args));
-  return TLI->LowerCallTo(CLI).second;
+
+  auto [Result, ChainOut] = TLI->LowerCallTo(CLI);
+  if (LC == RTLIB::MEMCHR)
+    return DAG.getMergeValues({Result, ChainOut}, DL);
+  return ChainOut;
 }
 
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
@@ -255,6 +265,19 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
   return SDValue();
 }
 
+std::pair<SDValue, SDValue> AArch64SelectionDAGInfo::EmitTargetCodeForMemchr(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Src,
+    SDValue Char, SDValue Length, MachinePointerInfo SrcPtrInfo) const {
+  auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+  SMEAttrs Attrs = AFI->getSMEFnAttrs();
+  if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody()) {
+    SDValue Result = EmitStreamingCompatibleMemLibCall(
+        DAG, dl, Chain, Src, Char, Length, RTLIB::MEMCHR);
+    return std::make_pair(Result.getValue(0), Result.getValue(1));
+  }
+  return std::make_pair(SDValue(), SDValue());
+}
+
 static const int kSetTagLoopThreshold = 176;
 
 static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 42c2797ebdd17..656a58c1dc1bf 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -53,14 +53,19 @@ class AArch64SelectionDAGInfo : public SelectionDAGGenTargetInfo {
                            MachinePointerInfo DstPtrInfo,
                            MachinePointerInfo SrcPtrInfo) const override;
 
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemchr(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+                          SDValue Src, SDValue Char, SDValue Length,
+                          MachinePointerInfo SrcPtrInfo) const override;
+
   SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   MachinePointerInfo DstPtrInfo,
                                   bool ZeroData) const override;
 
   SDValue EmitStreamingCompatibleMemLibCall(SelectionDAG &DAG, const SDLoc &DL,
-                                            SDValue Chain, SDValue Dst,
-                                            SDValue Src, SDValue Size,
+                                            SDValue Chain, SDValue Op0,
+                                            SDValue Op1, SDValue Size,
                                             RTLIB::Libcall LC) const;
 };
 } // namespace llvm
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index b87a81856056d..fc4ae272046a0 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -156,47 +156,13 @@ entry:
 define ptr @se_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: se_memchr:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Spill
-; CHECK-NEXT:    .cfi_offset vg, -16
-; CHECK-NEXT:    .cfi_offset w30, -24
-; CHECK-NEXT:    .cfi_offset w29, -32
-; CHECK-NEXT:    .cfi_offset b8, -40
-; CHECK-NEXT:    .cfi_offset b9, -48
-; CHECK-NEXT:    .cfi_offset b10, -56
-; CHECK-NEXT:    .cfi_offset b11, -64
-; CHECK-NEXT:    .cfi_offset b12, -72
-; CHECK-NEXT:    .cfi_offset b13, -80
-; CHECK-NEXT:    .cfi_offset b14, -88
-; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    mov x2, x1
-; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    mov w1, #5 // =0x5
-; CHECK-NEXT:    bl memchr
-; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    .cfi_restore vg
-; CHECK-NEXT:    .cfi_restore w30
-; CHECK-NEXT:    .cfi_restore w29
-; CHECK-NEXT:    .cfi_restore b8
-; CHECK-NEXT:    .cfi_restore b9
-; CHECK-NEXT:    .cfi_restore b10
-; CHECK-NEXT:    .cfi_restore b11
-; CHECK-NEXT:    .cfi_restore b12
-; CHECK-NEXT:    .cfi_restore b13
-; CHECK-NEXT:    .cfi_restore b14
-; CHECK-NEXT:    .cfi_restore b15
+; CHECK-NEXT:    bl __arm_sc_memchr
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NO-SME-ROUTINES-LABEL: se_memchr:
@@ -246,47 +212,13 @@ define ptr @se_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_enabled" {
 ;
 ; CHECK-MOPS-LABEL: se_memchr:
 ; CHECK-MOPS:       // %bb.0: // %entry
-; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-MOPS-NEXT:    cntd x9
-; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    str x9, [sp, #80] // 8-byte Spill
-; CHECK-MOPS-NEXT:    .cfi_offset vg, -16
-; CHECK-MOPS-NEXT:    .cfi_offset w30, -24
-; CHECK-MOPS-NEXT:    .cfi_offset w29, -32
-; CHECK-MOPS-NEXT:    .cfi_offset b8, -40
-; CHECK-MOPS-NEXT:    .cfi_offset b9, -48
-; CHECK-MOPS-NEXT:    .cfi_offset b10, -56
-; CHECK-MOPS-NEXT:    .cfi_offset b11, -64
-; CHECK-MOPS-NEXT:    .cfi_offset b12, -72
-; CHECK-MOPS-NEXT:    .cfi_offset b13, -80
-; CHECK-MOPS-NEXT:    .cfi_offset b14, -88
-; CHECK-MOPS-NEXT:    .cfi_offset b15, -96
+; CHECK-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-MOPS-NEXT:    .cfi_offset w30, -16
 ; CHECK-MOPS-NEXT:    mov x2, x1
-; CHECK-MOPS-NEXT:    smstop sm
 ; CHECK-MOPS-NEXT:    mov w1, #5 // =0x5
-; CHECK-MOPS-NEXT:    bl memchr
-; CHECK-MOPS-NEXT:    smstart sm
-; CHECK-MOPS-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-MOPS-NEXT:    .cfi_restore vg
-; CHECK-MOPS-NEXT:    .cfi_restore w30
-; CHECK-MOPS-NEXT:    .cfi_restore w29
-; CHECK-MOPS-NEXT:    .cfi_restore b8
-; CHECK-MOPS-NEXT:    .cfi_restore b9
-; CHECK-MOPS-NEXT:    .cfi_restore b10
-; CHECK-MOPS-NEXT:    .cfi_restore b11
-; CHECK-MOPS-NEXT:    .cfi_restore b12
-; CHECK-MOPS-NEXT:    .cfi_restore b13
-; CHECK-MOPS-NEXT:    .cfi_restore b14
-; CHECK-MOPS-NEXT:    .cfi_restore b15
+; CHECK-MOPS-NEXT:    bl __arm_sc_memchr
+; CHECK-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-MOPS-NEXT:    ret
 entry:
   %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n)
@@ -296,57 +228,13 @@ entry:
 define ptr @sc_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: sc_memchr:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset vg, -16
-; CHECK-NEXT:    .cfi_offset w30, -24
-; CHECK-NEXT:    .cfi_offset w29, -32
-; CHECK-NEXT:    .cfi_offset b8, -40
-; CHECK-NEXT:    .cfi_offset b9, -48
-; CHECK-NEXT:    .cfi_offset b10, -56
-; CHECK-NEXT:    .cfi_offset b11, -64
-; CHECK-NEXT:    .cfi_offset b12, -72
-; CHECK-NEXT:    .cfi_offset b13, -80
-; CHECK-NEXT:    .cfi_offset b14, -88
-; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    mov x2, x1
-; CHECK-NEXT:    mrs x19, SVCR
-; CHECK-NEXT:    tbz w19, #0, .LBB4_2
-; CHECK-NEXT:  // %bb.1: // %entry
-; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:  .LBB4_2: // %entry
 ; CHECK-NEXT:    mov w1, #5 // =0x5
-; CHECK-NEXT:    bl memchr
-; CHECK-NEXT:    tbz w19, #0, .LBB4_4
-; CHECK-NEXT:  // %bb.3: // %entry
-; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:  .LBB4_4: // %entry
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    .cfi_restore w19
-; CHECK-NEXT:    .cfi_restore vg
-; CHECK-NEXT:    .cfi_restore w30
-; CHECK-NEXT:    .cfi_restore w29
-; CHECK-NEXT:    .cfi_restore b8
-; CHECK-NEXT:    .cfi_restore b9
-; CHECK-NEXT:    .cfi_restore b10
-; CHECK-NEXT:    .cfi_restore b11
-; CHECK-NEXT:    .cfi_restore b12
-; CHECK-NEXT:    .cfi_restore b13
-; CHECK-NEXT:    .cfi_restore b14
-; CHECK-NEXT:    .cfi_restore b15
+; CHECK-NEXT:    bl __arm_sc_memchr
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NO-SME-ROUTINES-LABEL: sc_memchr:
@@ -406,57 +294,13 @@ define ptr @sc_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_compatible" {
 ;
 ; CHECK-MOPS-LABEL: sc_memchr:
 ; CHECK-MOPS:       // %bb.0: // %entry
-; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-MOPS-NEXT:    cntd x9
-; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    .cfi_offset w19, -8
-; CHECK-MOPS-NEXT:    .cfi_offset vg, -16
-; CHECK-MOPS-NEXT:    .cfi_offset w30, -24
-; CHECK-MOPS-NEXT:    .cfi_offset w29, -32
-; CHECK-MOPS-NEXT:    .cfi_offset b8, -40
-; CHECK-MOPS-NEXT:    .cfi_offset b9, -48
-; CHECK-MOPS-NEXT:    .cfi_offset b10, -56
-; CHECK-MOPS-NEXT:    .cfi_offset b11, -64
-; CHECK-MOPS-NEXT:    .cfi_offset b12, -72
-; CHECK-MOPS-NEXT:    .cfi_offset b13, -80
-; CHECK-MOPS-NEXT:    .cfi_offset b14, -88
-; CHECK-MOPS-NEXT:    .cfi_offset b15, -96
+; CHECK-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-MOPS-NEXT:    .cfi_offset w30, -16
 ; CHECK-MOPS-NEXT:    mov x2, x1
-; CHECK-MOPS-NEXT:    mrs x19, SVCR
-; CHECK-MOPS-NEXT:    tbz w19, #0, .LBB4_2
-; CHECK-MOPS-NEXT:  // %bb.1: // %entry
-; CHECK-MOPS-NEXT:    smstop sm
-; CHECK-MOPS-NEXT:  .LBB4_2: // %entry
 ; CHECK-MOPS-NEXT:    mov w1, #5 // =0x5
-; CHECK-MOPS-NEXT:    bl memchr
-; CHECK-MOPS-NEXT:    tbz w19, #0, .LBB4_4
-; CHECK-MOPS-NEXT:  // %bb.3: // %entry
-; CHECK-MOPS-NEXT:    smstart sm
-; CHECK-MOPS-NEXT:  .LBB4_4: // %entry
-; CHECK-MOPS-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldr x19, [sp, #88] // 8-byte Reload
-; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-MOPS-NEXT:    .cfi_restore w19
-; CHECK-MOPS-NEXT:    .cfi_restore vg
-; CHECK-MOPS-NEXT:    .cfi_restore w30
-; CHECK-MOPS-NEXT:    .cfi_restore w29
-; CHECK-MOPS-NEXT:    .cfi_restore b8
-; CHECK-MOPS-NEXT:    .cfi_restore b9
-; CHECK-MOPS-NEXT:    .cfi_restore b10
-; CHECK-MOPS-NEXT:    .cfi_restore b11
-; CHECK-MOPS-NEXT:    .cfi_restore b12
-; CHECK-MOPS-NEXT:    .cfi_restore b13
-; CHECK-MOPS-NEXT:    .cfi_restore b14
-; CHECK-MOPS-NEXT:    .cfi_restore b15
+; CHECK-MOPS-NEXT:    bl __arm_sc_memchr
+; CHECK-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-MOPS-NEXT:    ret
 entry:
   %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n)

From f1f20f8e01bcb4055713934f2d7a3e740961aa2a Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 20 Nov 2025 16:50:36 +0000
Subject: [PATCH 3/3] Add test of non-streaming[-compatible] memchr

---
 .../streaming-compatible-memory-ops.ll        | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index fc4ae272046a0..b000854f5948e 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-COMMON,CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-COMMON,CHECK-NO-SME-ROUTINES
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-COMMON,CHECK-MOPS
 
 @dst = global [512 x i8] zeroinitializer, align 1
 @src = global [512 x i8] zeroinitializer, align 1
@@ -307,6 +307,18 @@ entry:
   ret ptr %res
 }
 
+; Non-streaming[-compatible] call to memchr.
+define ptr @ns_memchr(ptr %src, i64 %n) {
+; CHECK-COMMON-LABEL: ns_memchr:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    mov x2, x1
+; CHECK-COMMON-NEXT:    mov w1, #5 // =0x5
+; CHECK-COMMON-NEXT:    b memchr
+entry:
+  %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n)
+  ret ptr %res
+}
+
 define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: sc_memcpy:
 ; CHECK:       // %bb.0: // %entry
@@ -333,15 +345,15 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NO-SME-ROUTINES-NEXT:    mrs x19, SVCR
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
-; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB5_2
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB6_2
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.1: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
-; CHECK-NO-SME-ROUTINES-NEXT:  .LBB5_2: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB6_2: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
-; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB5_4
+; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB6_4
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.3: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
-; CHECK-NO-SME-ROUTINES-NEXT:  .LBB5_4: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:  .LBB6_4: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
@@ -438,3 +450,5 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
 declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
 declare ptr @memchr(ptr, i32, i64)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; : {{.*}}


