[llvm] [AArch64] Override canCombineStoreAndExtract (PR #145825)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 30 13:09:08 PDT 2025
================
@@ -0,0 +1,434 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s
+
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: orr.2s v0, #1
+; ASM-NEXT: st1.s { v0 }[1], [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+;
+define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) {
+; ASM-LABEL: unsupportedInstructionForPromotion:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov w8, s0
+; ASM-NEXT: cmp w8, w1
+; ASM-NEXT: cset w8, eq
+; ASM-NEXT: strb w8, [x2]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 0
+ %out = icmp eq i32 %extract, %in2
+ store i1 %out, i1* %dest, align 4
+ ret void
+}
+
+
+; BB2
+;
+define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) {
+; ASM-LABEL: unsupportedChainInDifferentBBs:
+; ASM: ; %bb.0: ; %bb1
+; ASM-NEXT: tbz w2, #0, LBB2_2
+; ASM-NEXT: ; %bb.1: ; %bb2
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov w8, s0
+; ASM-NEXT: orr w8, w8, #0x1
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: LBB2_2: ; %end
+; ASM-NEXT: ret
+bb1:
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 0
+ br i1 %bool, label %bb2, label %end
+bb2:
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ br label %end
+end:
+ ret void
+}
+
+;
+define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: chainOfInstructionsToPromote:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: orr.2s v0, #1
+; ASM-NEXT: str s0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 0
+ %out1 = or i32 %extract, 1
+ %out2 = or i32 %out1, 1
+ %out3 = or i32 %out2, 1
+ %out4 = or i32 %out3, 1
+ %out5 = or i32 %out4, 1
+ %out6 = or i32 %out5, 1
+ %out7 = or i32 %out6, 1
+ store i32 %out7, i32* %dest, align 4
+ ret void
+}
+
+;
+define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: unsupportedMultiUses:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: orr w0, w8, #0x1
+; ASM-NEXT: str w0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ ret i32 %out
+}
+
+; Check that we promote using a splat constant when this is a division.
+; The NORMAL mode does not promote anything as divisions are not legal.
+; Scalar version:
+; Vector version:
+;
+define void @udivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: udivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #18725 ; =0x4925
+; ASM-NEXT: movk w9, #9362, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: umull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: sub w8, w8, w9
+; ASM-NEXT: add w8, w9, w8, lsr #1
+; ASM-NEXT: lsr w8, w8, #2
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = udiv i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @uremCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: uremCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #18725 ; =0x4925
+; ASM-NEXT: movk w9, #9362, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: umull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: sub w10, w8, w9
+; ASM-NEXT: add w9, w9, w10, lsr #1
+; ASM-NEXT: lsr w9, w9, #2
+; ASM-NEXT: sub w9, w9, w9, lsl #3
+; ASM-NEXT: add w8, w8, w9
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = urem i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @sdivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: sdivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #9363 ; =0x2493
+; ASM-NEXT: movk w9, #37449, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: smull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: add w8, w9, w8
+; ASM-NEXT: asr w9, w8, #2
+; ASM-NEXT: add w8, w9, w8, lsr #31
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = sdiv i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @sremCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: sremCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #9363 ; =0x2493
+; ASM-NEXT: movk w9, #37449, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: smull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: add w9, w9, w8
+; ASM-NEXT: asr w10, w9, #2
+; ASM-NEXT: add w9, w10, w9, lsr #31
+; ASM-NEXT: sub w9, w9, w9, lsl #3
+; ASM-NEXT: add w8, w8, w9
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = srem i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @fdivCase(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: fdivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #7.00000000
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: fdiv s0, s0, s1
+; ASM-NEXT: str s0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = fdiv float %extract, 7.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @fremCase(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: fremCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT: .cfi_def_cfa_offset 32
+; ASM-NEXT: .cfi_offset w30, -8
+; ASM-NEXT: .cfi_offset w29, -16
+; ASM-NEXT: .cfi_offset w19, -24
+; ASM-NEXT: .cfi_offset w20, -32
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #7.00000000
+; ASM-NEXT: mov x19, x1
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: bl _fmodf
+; ASM-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT: str s0, [x19]
+; ASM-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = frem float %extract, 7.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: undefDivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #7 ; =0x7
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: udiv w8, w9, w8
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = udiv i32 7, %extract
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: undefRemCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #7 ; =0x7
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: sdiv w10, w9, w8
+; ASM-NEXT: msub w8, w10, w8, w9
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = srem i32 7, %extract
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; Scalar version:
+; Vector version:
+;
+define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: undefConstantFRemCaseWithFastMath:
+; ASM: ; %bb.0:
+; ASM-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT: .cfi_def_cfa_offset 32
+; ASM-NEXT: .cfi_offset w30, -8
+; ASM-NEXT: .cfi_offset w29, -16
+; ASM-NEXT: .cfi_offset w19, -24
+; ASM-NEXT: .cfi_offset w20, -32
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #7.00000000
+; ASM-NEXT: mov x19, x1
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: bl _fmodf
+; ASM-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT: str s0, [x19]
+; ASM-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = frem nnan float %extract, 7.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; Scalar version:
+; Vector version:
+;
+define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: undefVectorFRemCaseWithFastMath:
+; ASM: ; %bb.0:
+; ASM-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT: .cfi_def_cfa_offset 32
+; ASM-NEXT: .cfi_offset w30, -8
+; ASM-NEXT: .cfi_offset w29, -16
+; ASM-NEXT: .cfi_offset w19, -24
+; ASM-NEXT: .cfi_offset w20, -32
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov x19, x1
+; ASM-NEXT: mov s1, v0[1]
+; ASM-NEXT: fmov s0, #7.00000000
+; ASM-NEXT: bl _fmodf
+; ASM-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT: str s0, [x19]
+; ASM-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = frem nnan float 7.0, %extract
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we are able to promote a floating point value.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotionFloat:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #1.00000000
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: fadd s0, s0, s1
+; ASM-NEXT: str s0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = fadd float %extract, 1.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we correctly use a splat constant when we cannot
+; determine at compile time the index of the extract.
+; This requires the STRESS mode, as variable indices are expensive
+; to lower.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) {
+; ASM-LABEL: simpleOneInstructionPromotionVariableIdx:
+; ASM: ; %bb.0:
+; ASM-NEXT: ; kill: def $w2 killed $w2 def $x2
+; ASM-NEXT: and x8, x2, #0x1
+; ASM-NEXT: ldr w8, [x0, x8, lsl #2]
+; ASM-NEXT: orr w8, w8, #0x1
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 %idx
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Check a vector with more than 2 elements.
+; This requires the STRESS mode because currently 'or v8i8' is not marked
+; as legal or custom, although the actual assembly would be better if we
+; promoted it.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion8x8:
+; ASM: ; %bb.0:
+; ASM-NEXT: movi.8b v0, #1
+; ASM-NEXT: ldr d1, [x0]
+; ASM-NEXT: orr.8b v0, v1, v0
+; ASM-NEXT: st1.b { v0 }[1], [x1]
+; ASM-NEXT: ret
+ %in1 = load <8 x i8>, <8 x i8>* %addr1, align 8
+ %extract = extractelement <8 x i8> %in1, i32 1
+ %out = or i8 %extract, 1
+ store i8 %out, i8* %dest, align 4
+ ret void
+}
+
+; Check that we optimized the sequence correctly when it can be
+; lowered on a Q register.
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
----------------
MacDue wrote:
These comments are referring to ARM instructions, not AArch64 ones.
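As a rough sketch of what the stale comments mean (an illustration assuming the usual ARM vs. AArch64 mnemonics, not part of the patch): `vmov.32` is the 32-bit ARM/NEON lane-to-GPR move the original CodeGen/ARM test was written against, while the AArch64 output checked above uses `fmov`/`mov.s` (an alias of `umov`) for the same thing.

```
; ARM/NEON lane-to-GPR move the comments refer to:
;   vmov.32 r0, d16[1]
; AArch64 equivalents that appear in the CHECK lines above:
;   fmov  w8, s0        ; lane 0 of v0 to a GPR
;   mov.s w8, v0[1]     ; lane 1 of v0 to a GPR (umov alias)
```

So the comments here could instead talk about avoiding expensive lane-to-GPR moves rather than `vmov.32` specifically.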
https://github.com/llvm/llvm-project/pull/145825