[llvm] [AArch64] Override canCombineStoreAndExtract (PR #145825)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 30 13:09:08 PDT 2025
================
@@ -0,0 +1,434 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s
+
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: orr.2s v0, #1
+; ASM-NEXT: st1.s { v0 }[1], [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+;
+define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) {
+; ASM-LABEL: unsupportedInstructionForPromotion:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov w8, s0
+; ASM-NEXT: cmp w8, w1
+; ASM-NEXT: cset w8, eq
+; ASM-NEXT: strb w8, [x2]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 0
+ %out = icmp eq i32 %extract, %in2
+ store i1 %out, i1* %dest, align 4
+ ret void
+}
+
+
+; BB2
+;
+define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) {
+; ASM-LABEL: unsupportedChainInDifferentBBs:
+; ASM: ; %bb.0: ; %bb1
+; ASM-NEXT: tbz w2, #0, LBB2_2
+; ASM-NEXT: ; %bb.1: ; %bb2
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov w8, s0
+; ASM-NEXT: orr w8, w8, #0x1
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: LBB2_2: ; %end
+; ASM-NEXT: ret
+bb1:
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 0
+ br i1 %bool, label %bb2, label %end
+bb2:
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ br label %end
+end:
+ ret void
+}
+
+;
+define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: chainOfInstructionsToPromote:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: orr.2s v0, #1
+; ASM-NEXT: str s0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 0
+ %out1 = or i32 %extract, 1
+ %out2 = or i32 %out1, 1
+ %out3 = or i32 %out2, 1
+ %out4 = or i32 %out3, 1
+ %out5 = or i32 %out4, 1
+ %out6 = or i32 %out5, 1
+ %out7 = or i32 %out6, 1
+ store i32 %out7, i32* %dest, align 4
+ ret void
+}
+
+;
+define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: unsupportedMultiUses:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: orr w0, w8, #0x1
+; ASM-NEXT: str w0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ ret i32 %out
+}
+
+; Check that we promote using a splat constant when this is a division.
+; The NORMAL mode does not promote anything as divisions are not legal.
+; Scalar version:
+; Vector version:
+;
+define void @udivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: udivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #18725 ; =0x4925
+; ASM-NEXT: movk w9, #9362, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: umull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: sub w8, w8, w9
+; ASM-NEXT: add w8, w9, w8, lsr #1
+; ASM-NEXT: lsr w8, w8, #2
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = udiv i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @uremCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: uremCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #18725 ; =0x4925
+; ASM-NEXT: movk w9, #9362, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: umull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: sub w10, w8, w9
+; ASM-NEXT: add w9, w9, w10, lsr #1
+; ASM-NEXT: lsr w9, w9, #2
+; ASM-NEXT: sub w9, w9, w9, lsl #3
+; ASM-NEXT: add w8, w8, w9
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = urem i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @sdivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: sdivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #9363 ; =0x2493
+; ASM-NEXT: movk w9, #37449, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: smull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: add w8, w9, w8
+; ASM-NEXT: asr w9, w8, #2
+; ASM-NEXT: add w8, w9, w8, lsr #31
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = sdiv i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @sremCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: sremCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #9363 ; =0x2493
+; ASM-NEXT: movk w9, #37449, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: smull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: add w9, w9, w8
+; ASM-NEXT: asr w10, w9, #2
+; ASM-NEXT: add w9, w10, w9, lsr #31
+; ASM-NEXT: sub w9, w9, w9, lsl #3
+; ASM-NEXT: add w8, w8, w9
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = srem i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @fdivCase(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: fdivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #7.00000000
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: fdiv s0, s0, s1
+; ASM-NEXT: str s0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = fdiv float %extract, 7.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @fremCase(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: fremCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT: .cfi_def_cfa_offset 32
+; ASM-NEXT: .cfi_offset w30, -8
+; ASM-NEXT: .cfi_offset w29, -16
+; ASM-NEXT: .cfi_offset w19, -24
+; ASM-NEXT: .cfi_offset w20, -32
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #7.00000000
+; ASM-NEXT: mov x19, x1
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: bl _fmodf
+; ASM-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT: str s0, [x19]
+; ASM-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = frem float %extract, 7.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: undefDivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #7 ; =0x7
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: udiv w8, w9, w8
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = udiv i32 7, %extract
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: undefRemCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #7 ; =0x7
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: sdiv w10, w9, w8
+; ASM-NEXT: msub w8, w10, w8, w9
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = srem i32 7, %extract
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; Scalar version:
+; Vector version:
+;
+define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: undefConstantFRemCaseWithFastMath:
+; ASM: ; %bb.0:
+; ASM-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT: .cfi_def_cfa_offset 32
+; ASM-NEXT: .cfi_offset w30, -8
+; ASM-NEXT: .cfi_offset w29, -16
+; ASM-NEXT: .cfi_offset w19, -24
+; ASM-NEXT: .cfi_offset w20, -32
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #7.00000000
+; ASM-NEXT: mov x19, x1
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: bl _fmodf
+; ASM-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT: str s0, [x19]
+; ASM-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = frem nnan float %extract, 7.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; Scalar version:
+; Vector version:
+;
+define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: undefVectorFRemCaseWithFastMath:
+; ASM: ; %bb.0:
+; ASM-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT: .cfi_def_cfa_offset 32
+; ASM-NEXT: .cfi_offset w30, -8
+; ASM-NEXT: .cfi_offset w29, -16
+; ASM-NEXT: .cfi_offset w19, -24
+; ASM-NEXT: .cfi_offset w20, -32
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov x19, x1
+; ASM-NEXT: mov s1, v0[1]
+; ASM-NEXT: fmov s0, #7.00000000
+; ASM-NEXT: bl _fmodf
+; ASM-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT: str s0, [x19]
+; ASM-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = frem nnan float 7.0, %extract
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we are able to promote a floating point value.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotionFloat:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #1.00000000
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: fadd s0, s0, s1
+; ASM-NEXT: str s0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = fadd float %extract, 1.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we correctly use a splat constant when we cannot
+; determine at compile time the index of the extract.
+; This requires the STRESS mode, as variable indices are expensive
+; to lower.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) {
+; ASM-LABEL: simpleOneInstructionPromotionVariableIdx:
+; ASM: ; %bb.0:
+; ASM-NEXT: ; kill: def $w2 killed $w2 def $x2
+; ASM-NEXT: and x8, x2, #0x1
+; ASM-NEXT: ldr w8, [x0, x8, lsl #2]
+; ASM-NEXT: orr w8, w8, #0x1
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 %idx
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Check a vector with more than 2 elements.
+; This requires the STRESS mode because currently 'or v8i8' is not marked
+; as legal or custom, although the actual assembly would be better if we
+; promoted it.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion8x8:
+; ASM: ; %bb.0:
+; ASM-NEXT: movi.8b v0, #1
+; ASM-NEXT: ldr d1, [x0]
+; ASM-NEXT: orr.8b v0, v1, v0
+; ASM-NEXT: st1.b { v0 }[1], [x1]
+; ASM-NEXT: ret
+ %in1 = load <8 x i8>, <8 x i8>* %addr1, align 8
+ %extract = extractelement <8 x i8> %in1, i32 1
+ %out = or i8 %extract, 1
+ store i8 %out, i8* %dest, align 4
+ ret void
+}
+
+; Check that we optimized the sequence correctly when it can be
+; lowered on a Q register.
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
----------------
MacDue wrote:
These comments are referring to ARM instructions, not AArch64 ones.
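As a rough sketch of what the stale comments mean (an illustration assuming the usual ARM vs. AArch64 mnemonics, not part of the patch): `vmov.32` is the 32-bit ARM/NEON lane-to-GPR move the original CodeGen/ARM test was written against, while the AArch64 output checked above uses `fmov`/`mov.s` (an alias of `umov`) for the same thing.

```
; ARM/NEON lane-to-GPR move the comments refer to:
;   vmov.32 r0, d16[1]
; AArch64 equivalents that appear in the CHECK lines above:
;   fmov  w8, s0        ; lane 0 of v0 to a GPR
;   mov.s w8, v0[1]     ; lane 1 of v0 to a GPR (umov alias)
```

So the comments here could instead talk about avoiding expensive lane-to-GPR moves rather than `vmov.32` specifically.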
https://github.com/llvm/llvm-project/pull/145825