[llvm] [AArch64][SVE] Fold integer lane 0 extract and store to FPR store (PR #129756)
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 5 02:58:20 PST 2025
================
@@ -1988,6 +1988,38 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 6))),
(UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
+ // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+ multiclass SVEVecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ ValueType SubRegTy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR> {
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+ }
----------------
sdesmalen-arm wrote:
If you add another pattern for non-zero lane indices, e.g.:
```
def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 EltTy:$idx))),
(UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
(STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, EltTy:$idx), SubRegIdx)),
GPR64sp:$Rn, IndexType:$offset)>;
...
let AddedComplexity = 19 in {
// Lane 0 truncating stores
// i32 -> i16
defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, sve_elm_idx_extdup_h, DUP_ZZI_H, STRHui>;
```
Then we can optimise extracts from non-zero lanes as well.
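A minimal sketch of how the multiclass signature might be extended so that the extra arguments in that defm (sve_elm_idx_extdup_h and DUP_ZZI_H) have matching parameters; the EltTy/DUP parameter names and the Operand class used for the index are illustrative, not taken from the PR:
```
// Sketch only: the lane-0 pattern from the diff plus a non-zero-lane pattern
// that first broadcasts the wanted lane to element 0 with an indexed DUP,
// then stores from the low subregister as before.
multiclass SVEVecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
                               ValueType VTy, ValueType STy,
                               ValueType SubRegTy,
                               SubRegIndex SubRegIdx, Operand IndexType,
                               Operand EltTy, Instruction DUP,
                               Instruction STR> {
  def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
                     (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
            (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
                 GPR64sp:$Rn, IndexType:$offset)>;

  def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 EltTy:$idx))),
                     (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
            (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, EltTy:$idx), SubRegIdx)),
                 GPR64sp:$Rn, IndexType:$offset)>;
}
```
With that shape, the defm above would bind sve_elm_idx_extdup_h to EltTy and DUP_ZZI_H to DUP, so any lane reachable by the indexed DUP could be stored directly from an FPR instead of going through a GPR.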
https://github.com/llvm/llvm-project/pull/129756