[llvm] [AArch64][SVE] Remove pseudo from LD1_IMM (PR #73631)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 28 02:14:04 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: David Green (davemgreen)
Changes:
The LD1 immediate-offset instructions have both a pseudo and a real instruction, mostly because they share a tablegen class with the first-faulting (FFR) versions of the instructions. As far as I can tell the pseudo for the non-FFR versions serves no useful purpose, though, and we can rejig the classes to only define the pseudo for FFR instructions, similar to the existing sve_mem_cld_ss instructions.
The end result is that the LD1_IMM instructions no longer carry a SideEffects flag whilst being scheduled, and we have a few fewer pseudo instructions, which is usually a good thing.
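In outline, the restructuring looks like this (a simplified sketch of the multiclasses from the patch, with the InstAlias definitions omitted for brevity):

```tablegen
// Non-faulting contiguous loads now define the real instruction directly,
// with no pseudo wrapper.
multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
                          ZPRRegOp zprty> {
  def "" : sve_mem_cld_si_base<dtype, 0, asm, listty>;
}

// The first-faulting (FFR) variants keep the pseudo, because early machine
// code passes balk at physical register (i.e. FFR) uses that have no
// previous definition.
multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
                            ZPRRegOp zprty> {
  def _REAL : sve_mem_cld_si_base<dtype, 1, asm, listty>;

  let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
    def "" : Pseudo<(outs listty:$Zt),
                    (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
             PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL)
                                  listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn,
                                  simm4s1:$imm4)>;
  }
}
```

The scheduling-model updates below are the mechanical consequence: the `_REAL` suffix disappears from the non-FFR instruction names, so the `instregex` patterns drop it.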
---
Patch is 227.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73631.diff
48 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64SchedA510.td (+4-4)
- (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td (+4-4)
- (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td (+4-4)
- (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td (+4-4)
- (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+23-19)
- (modified) llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll (+18-18)
- (modified) llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll (+9-9)
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll (+16-16)
- (modified) llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll (+30-30)
- (modified) llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll (+61-61)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll (+28-28)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll (+50-50)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll (+81-69)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll (+12-12)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll (+24-21)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll (+60-48)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll (+9-9)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll (+18-18)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll (+16-16)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll (+61-59)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll (+16-16)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll (+12-12)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll (+48-48)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll (+80-64)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll (+40-32)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll (+32-28)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll (+60-48)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll (+24-24)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll (+34-34)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll (+36-36)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll (+5-5)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll (+32-32)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll (+3-3)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll (+83-83)
- (modified) llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll (+18-18)
- (modified) llvm/test/CodeGen/AArch64/sve-insert-vector.ll (+12-12)
- (modified) llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll (+14-14)
- (modified) llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll (+14-14)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll (+13-13)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll (+26-26)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll (+20-20)
- (modified) llvm/test/CodeGen/AArch64/sve-vscale-attr.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll (+47-47)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td
index 1afbc5d9102ca96..1b66d6bb8fbd443 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA510.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td
@@ -1168,10 +1168,10 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instrs LDR_ZXI)>;
def : InstRW<[CortexA510Write<3, CortexA510UnitLdSt>], (instrs LDR_PXI)>;
// Contiguous load, scalar + imm
-def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]_IMM_REAL$",
- "^LD1S?B_[HSD]_IMM_REAL$",
- "^LD1S?H_[SD]_IMM_REAL$",
- "^LD1S?W_D_IMM_REAL$" )>;
+def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]_IMM$",
+ "^LD1S?B_[HSD]_IMM$",
+ "^LD1S?H_[SD]_IMM$",
+ "^LD1S?W_D_IMM$" )>;
// Contiguous load, scalar + scalar
def : InstRW<[CortexA510Write<3, CortexA510UnitLd>], (instregex "^LD1[BHWD]$",
"^LD1S?B_[HSD]$",
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index 517d0da7f47f428..503de3bee2b8678 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -2082,10 +2082,10 @@ def : InstRW<[N2Write_6cyc_1L], (instrs LDR_ZXI)>;
def : InstRW<[N2Write_6cyc_1L_1M], (instrs LDR_PXI)>;
// Contiguous load, scalar + imm
-def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1[BHWD]_IMM_REAL$",
- "^LD1S?B_[HSD]_IMM_REAL$",
- "^LD1S?H_[SD]_IMM_REAL$",
- "^LD1S?W_D_IMM_REAL$" )>;
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1[BHWD]_IMM$",
+ "^LD1S?B_[HSD]_IMM$",
+ "^LD1S?H_[SD]_IMM$",
+ "^LD1S?W_D_IMM$" )>;
// Contiguous load, scalar + scalar
def : InstRW<[N2Write_6cyc_1L01], (instregex "^LD1[BHWD]$",
"^LD1S?B_[HSD]$",
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
index 5c155c936da9fea..726be1a547b9519 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
@@ -1687,10 +1687,10 @@ def : InstRW<[V1Write_6c_1L_1M], (instrs LDR_PXI)>;
// Contiguous load, scalar + scalar
// Contiguous load broadcast, scalar + imm
// Contiguous load broadcast, scalar + scalar
-def : InstRW<[V1Write_6c_1L01], (instregex "^LD1[BHWD]_IMM_REAL$",
- "^LD1S?B_[HSD]_IMM_REAL$",
- "^LD1S?H_[SD]_IMM_REAL$",
- "^LD1S?W_D_IMM_REAL$",
+def : InstRW<[V1Write_6c_1L01], (instregex "^LD1[BHWD]_IMM$",
+ "^LD1S?B_[HSD]_IMM$",
+ "^LD1S?H_[SD]_IMM$",
+ "^LD1S?W_D_IMM$",
"^LD1[BWD]$",
"^LD1S?B_[HSD]$",
"^LD1S?W_D$",
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index eca7700d5ff6ae9..3367d5d0cd315ff 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -2608,10 +2608,10 @@ def : InstRW<[V2Write_6cyc_1L], (instrs LDR_ZXI)>;
def : InstRW<[V2Write_6cyc_1L_1M], (instrs LDR_PXI)>;
// Contiguous load, scalar + imm
-def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1[BHWD]_IMM_REAL$",
- "^LD1S?B_[HSD]_IMM_REAL$",
- "^LD1S?H_[SD]_IMM_REAL$",
- "^LD1S?W_D_IMM_REAL$" )>;
+def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1[BHWD]_IMM$",
+ "^LD1S?B_[HSD]_IMM$",
+ "^LD1S?H_[SD]_IMM$",
+ "^LD1S?W_D_IMM$" )>;
// Contiguous load, scalar + scalar
def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1[BHWD]$",
"^LD1S?B_[HSD]$",
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index e765926d8a6355e..c0894e9c70680a4 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7318,29 +7318,18 @@ class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
let mayLoad = 1;
}
-multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
- RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
+multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
+ ZPRRegOp zprty> {
+ def "" : sve_mem_cld_si_base<dtype, 0, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
-
- // We need a layer of indirection because early machine code passes balk at
- // physical register (i.e. FFR) uses that have no previous definition.
- let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
- def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
- PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>;
- }
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
-multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
- ZPRRegOp zprty>
-: sve_mem_cld_si_base<dtype, 0, asm, listty, zprty>;
-
class sve_mem_cldnt_si_base<bits<2> msz, string asm, RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
@@ -7559,8 +7548,23 @@ multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
}
multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
- ZPRRegOp zprty>
-: sve_mem_cld_si_base<dtype, 1, asm, listty, zprty>;
+ ZPRRegOp zprty> {
+ def _REAL : sve_mem_cld_si_base<dtype, 1, asm, listty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
+ def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>;
+ }
+}
class sve_mem_eld_si<bits<2> sz, bits<3> nregs, RegisterOperand VecList,
string asm, Operand immtype>
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
index 49bdaf0fcde9418..110f0ef7f4a5500 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
@@ -14,12 +14,12 @@ define void @array_1D(ptr %addr) #0 {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -57,10 +57,10 @@ define void @array_1D_insert(ptr %addr, %my_subtype %elt) #0 {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [sp]
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -81,18 +81,18 @@ define void @array_2D(ptr %addr) #0 {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #5, mul vl]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl]
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #5, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0]
-; CHECK-NEXT: st1d { z5.d }, p0, [sp]
-; CHECK-NEXT: st1d { z4.d }, p0, [sp, #5, mul vl]
+; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp, #5, mul vl]
; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z5.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: st1d { z4.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #6
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
index cc0f441d0aaae4e..f03a6f018d34d0c 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
@@ -13,12 +13,12 @@ define void @test(ptr %addr) #0 {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 01fd2b1113b000b..467c3c254fc2d38 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -36,13 +36,13 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: add x12, x12, x10
; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl]
+; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
-; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
-; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
+; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
+; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z0.d, p3/m, z7.d
; CHECK-NEXT: mov z1.d, p2/m, z6.d
; CHECK-NEXT: b.mi .LBB0_1
@@ -138,13 +138,13 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13]
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl]
+; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13]
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14]
-; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
-; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
+; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
+; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z0.d, p2/m, z7.d
; CHECK-NEXT: mov z1.d, p1/m, z6.d
; CHECK-NEXT: b.ne .LBB1_1
@@ -241,13 +241,13 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
; CHECK-NEXT: whilelo p1.d, x9, x10
; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl]
+; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
-; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
-; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
+; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
+; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z0.d, p3/m, z7.d
; CHECK-NEXT: mov z1.d, p2/m, z6.d
; CHECK-NEXT: b.mi .LBB2_1
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 956d2d941ac714d..1696ac8709d4060 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -214,21 +214,21 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x14, #1, mul vl]
; CHECK-NEXT: ld1b { z6.b }, p1/z, [x12, x8]
-; CHECK-NEXT: ld1d { z7.d }, p0/z, [x15, #1, mul vl]
-; CHECK-NEXT: ld1b { z16.b }, p1/z, [x1, x8]
-; CHECK-NEXT: ld1d { z17.d }, p0/z, [x16, #1, mul vl]
+; CHECK-NEXT: ld1b { z7.b }, p1/z, [x1, x8]
+; CHECK-NEXT: ld1d { z16.d }, p0/z, [x16, #1, mul vl]
+; CHECK-NEXT: ld1d { z17.d }, p0/z, [x15, #1, mul vl]
; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
; CHECK-NEXT: adds x10, x10, x9
; CHECK-NEXT: add x8, x8, x13
-; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z4.d, #0
-; CHECK-NEXT: fcmla z0.d, p0/m, z17.d, z5.d, #0
+; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0
+; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0
; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #0
-; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z7.d, #0
-; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z4.d, #90
-; CHECK-NEXT: fcmla z0.d, p0/m, z17.d, z5.d, #90
+; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #0
+; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90
+; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90
; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #90
-; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z7.d, #90
+; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #90
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
@@ -335,15 +335,15 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
; CHECK-NEXT: zip1 z1.d, z2.d, z2.d
; CHECK-NEXT: .LBB3_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z3.d }, p0/z, [x3, x8, lsl #2]
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: add x8, x8, x9
+; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: add x0, x0, x11
+; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2]
+; CHECK-NEXT: add x8, x8, x9
; CHECK-NEXT: cmp x10, x8
-; CHECK-NEXT: fadd z0.d, z5.d, z0.d
-; CHECK-NEXT: fadd z1.d, z4.d, z1.d
-; CHECK-NEXT: add z2.d, z3.d, z2.d
+; CHECK-NEXT: fadd z0.d, z4.d, z0.d
+; CHECK-NEXT: fadd z1.d, z3.d, z1.d
+; CHECK-NEXT: add z2.d, z5.d, z2.d
; CHECK-NEXT: b.ne .LBB3_1
; CHECK-NEXT: // %bb.2: // %middle.block
; CHECK-NEXT: uaddv d2, p0, z2.d
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
index f5d14779f6586e3..b80ea04823e9f69 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -75,11 +75,11 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_zero_i8(<vscale x 8 x i8
; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr d0, [x1]
; CHECK-NEXT: ptrue p1.h, vl8
-; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: mov z0.h, p1/m, z1.h
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
+; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: ret
%vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
%subvec = load <8 x i8>, <8 x i8>* %b
@@ -94,17 +94,17 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(<vscale x 8 x
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: cnth x8
-; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: ldr d0, [x1]
; CHECK-NEXT: sub x8, x8, #8
+; CHECK-NEXT: mov w9, #8 // =0x8
; CHECK-NEXT: cmp x8, #8...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/73631