[llvm] [AArch64] Fix #94909: Optimize vector fmul(sitofp(x), 0.5) -> scvtf(x, 2) (PR #141480)

Tue Jun 10 14:52:17 PDT 2025

================
@@ -5407,6 +5433,102 @@ class BaseIntegerToFPUnscaled<bits<2> rmode, bits<3> opcode,
   let Inst{4-0}   = Rd;
 }
 
+multiclass IntegerToFPVector<
+    bits<2> rmode, bits<3> opcode, string asm, RegisterClass srcRegClass,
+    RegisterClass dstRegClass, Operand imm_op, bits<1> q, bits<2> size,
+    bits<2> srcElemTypeBits, list<Predicate> preds> {
+
+  def _V : BaseIntegerToFP<rmode, opcode, srcRegClass, dstRegClass, imm_op,
+                           asm, []> {
+    let Inst{30} = q;
+    let Inst{23 -22} = size;
+    let Inst{18 -16} = 0b001;
+    let Inst{11 -10} = srcElemTypeBits;
+    let Predicates = preds;
+  }
+}
+
+// SCVTF (Signed Convert To Floating-Point) from Vector 32-bit Integer (vNi32)
+// defm SCVTFv2f16_v2i32 : IntegerToFPVector<0b00, 0b010, "scvtf",
+//                                     FPR64, FPR64,
+//                                     fixedpoint_recip_v2f16_v2i32,
+//                                     0, 0b00, 0b10, [HasFullFP16]>;
+
+// defm SCVTFv4f16_v4i32 : IntegerToFPVector<0b00, 0b010, "scvtf",
+//                                     FPR128, FPR128,
+//                                     fixedpoint_recip_v4f16_v4i32,
+//                                     1, 0b00, 0b10, [HasFullFP16]>;
+
+// defm SCVTFv8f16_v8i32 : IntegerToFPVector<0b00, 0b010, "scvtf",
+//                                     FPR128, FPR128,
+//                                     fixedpoint_recip_v8f16_v8i32,
+//                                     1, 0b00, 0b10, [HasFullFP16]>;
+
+defm SCVTFv2f32_v2i32
+    : IntegerToFPVector<0b00, 0b010, "scvtf", FPR64, FPR64,
+                        fixedpoint_recip_v2f32_v2i32, 0, 0b01, 0b10, []>;
+
+defm SCVTFv4f32_v4i32
+    : IntegerToFPVector<0b00, 0b010, "scvtf", FPR128, FPR128,
+                        fixedpoint_recip_v4f32_v4i32, 1, 0b01, 0b10, []>;
+
+// SCVTF (Signed Convert To Floating-Point) from Vector 64-bit Integer (vNi64)
+// defm SCVTFv2f16_v2i64 : IntegerToFPVector<0b00, 0b010, "scvtf",
+//                                     FPR128, FPR128,
+//                                     fixedpoint_recip_v2f16_v2i64,
+//                                     1, 0b00, 0b11, [HasFullFP16]>;
+
+// defm SCVTFv2f32_v2i64 : IntegerToFPVector<0b00, 0b010, "scvtf",
+//                                     FPR128, FPR128,
+//                                     fixedpoint_recip_v2f32_v2i64,
+//                                     1, 0b01, 0b11, []>;
+
+defm SCVTFv2f64_v2i64
+    : IntegerToFPVector<0b00, 0b010, "scvtf", FPR128, FPR128,
+                        fixedpoint_recip_v2f64_v2i64, 1, 0b10, 0b11, []>;
+
+// def : Pat<
+//   (fmul (sint_to_fp (v2i32 V64:$Rn)),
+//         fixedpoint_recip_v2f32_v2i32:$scale),
+//   (SCVTFv2f16_v2i32_V V64:$Rn, fixedpoint_recip_v2f32_v2i32:$scale)
+// >;
+
+// def : Pat<
+//   (fmul (sint_to_fp (v4i32 FPR128:$Rn)),
+//         fixedpoint_recip_v4f16_v4i32:$scale),
+//   (SCVTFv4f16_v4i32_V FPR128:$Rn, fixedpoint_recip_v4f16_v4i32:$scale)
+// >;
+
+// def : Pat<
+//   (fmul (sint_to_fp (v8i32 FPR128:$Rn)),
+//         fixedpoint_recip_v8f16_v8i32:$scale),
+//   (SCVTFv8f16_v8i32_V FPR128:$Rn, fixedpoint_recip_v8f16_v8i32:$scale)
+// >;
+
+def : Pat<(fmul(sint_to_fp(v2i32 V64:$Rn)),
+              fixedpoint_recip_v2f32_v2i32:$scale),
+          (SCVTFv2f32_v2i32_V V64:$Rn, fixedpoint_recip_v2f32_v2i32:$scale)>;
+
+def : Pat<(fmul(sint_to_fp(v4i32 FPR128:$Rn)),
+              fixedpoint_recip_v4f32_v4i32:$scale),
+          (SCVTFv4f32_v4i32_V FPR128:$Rn, fixedpoint_recip_v4f32_v4i32:$scale)>;
+
+// def : Pat<
+//   (fmul (sint_to_fp (v2i64 FPR128:$Rn)),
+//         fixedpoint_recip_v2f16_v2i64:$scale),
+//   (SCVTFv2f16_v2i64_V FPR128:$Rn, fixedpoint_recip_v2f16_v2i64:$scale)
+// >;
+
+// def : Pat<
+//   (fmul (sint_to_fp (v2i64 FPR128:$Rn)),
+//         fixedpoint_recip_v2f32_v2i64:$scale),
+//   (SCVTFv2f32_v2i64_V FPR128:$Rn, fixedpoint_recip_v2f32_v2i64:$scale)
+// >;
+
+def : Pat<(fmul(sint_to_fp(v2i64 FPR128:$Rn)),
----------------
davemgreen wrote:

When you get this far, the tablegen pattern is pretty gnarly, as it needs to match the v2f32 $scale but use that result as an i32 in the instruction. I'm not sure if there is a better way, but it might need an SDNodeXForm to convert the v2f32->i32, even if the complex pattern returns an i32 SDValue.

Something like this might work, if generalized to more types (and cleaned up a bit):
```
class fixedpoint_recip_vec<ValueType FloatVT> : ComplexPattern<FloatVT, 1, "SelectCVTFixedPosRecipOperandVec<32>", []>;
def fixedpoint_recip_v2f32_v2i32_2 : fixedpoint_recip_vec<v2f32>;
def fixedpoint_recip_vec_xform : SDNodeXForm<timm, [{
  return V;
}]>;
def : Pat<(v2f32 (fmul (sint_to_fp(v2i32 V64:$Rn)), fixedpoint_recip_v2f32_v2i32_2:$scale)),
          (v2f32 (SCVTFv2i32_shift (v2i32 V64:$Rn), (fixedpoint_recip_vec_xform fixedpoint_recip_v2f32_v2i32_2:$scale)))>;
```
The xform is a bit weird, as it just allows the type to change without actually doing anything to the value used. It could probably do with a comment explaining it.

https://github.com/llvm/llvm-project/pull/141480