[llvm] 495bdfc - [AArch64] Lower fcvtl2 (fpext) via tablegen patterns.

Sun Jul 23 11:17:15 PDT 2023

Author: David Green
Date: 2023-07-23T19:17:11+01:00
New Revision: 495bdfc7bb729ba22cf2b51d749f7789b0c2d9af

URL: https://github.com/llvm/llvm-project/commit/495bdfc7bb729ba22cf2b51d749f7789b0c2d9af
DIFF: https://github.com/llvm/llvm-project/commit/495bdfc7bb729ba22cf2b51d749f7789b0c2d9af.diff

LOG: [AArch64] Lower fcvtl2 (fpext) via tablegen patterns.

This patch does two things. First it removes the tryHighFPExt DAG2DAG method
used to select fcvtl2 instructions, using tablegen patterns through
SelectExtractHigh instead. This essentially undoes D71515, in a way that should
hopefully avoid any regressions. The second is that a GI equivalent of
SelectExtractHigh is added in selectExtractHigh, from G_UNMERGE_VALUES. The
end result is that GlobalISel (and some constrained fpext) can now make use of
the fcvtl2 instructions, saving an extra dup/ext.

Differential Revision: https://reviews.llvm.org/D155871

Added: 
    

Modified: 
    llvm/include/llvm/Target/GlobalISel/Target.td
    llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
    llvm/lib/Target/AArch64/AArch64InstrFormats.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
    llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
    llvm/test/CodeGen/AArch64/fpext.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Target/GlobalISel/Target.td b/llvm/include/llvm/Target/GlobalISel/Target.td
index 135d4a5e0dd0bd..4cb3fd1bf79c14 100644

--- a/llvm/include/llvm/Target/GlobalISel/Target.td
+++ b/llvm/include/llvm/Target/GlobalISel/Target.td
@@ -22,6 +22,8 @@ class LLT;
 
 def s32 : LLT;
 def s64 : LLT;
+def v2s32 : LLT;
+def v4s16 : LLT;
 
 // Defines a matcher for complex operands. This is analogous to ComplexPattern
 // from SelectionDAG.

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index c2005c6ff84778..f79d4d1934aaa6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -420,7 +420,6 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   bool tryBitfieldInsertOp(SDNode *N);
   bool tryBitfieldInsertInZeroOp(SDNode *N);
   bool tryShiftAmountMod(SDNode *N);
-  bool tryHighFPExt(SDNode *N);
 
   bool tryReadRegister(SDNode *N);
   bool tryWriteRegister(SDNode *N);
@@ -2470,35 +2469,6 @@ bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
   return true;
 }
 
-/// Try to form fcvtl2 instructions from a floating-point extend of a high-half
-/// extract of a subvector.
-bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
-  assert(N->getOpcode() == ISD::FP_EXTEND);
-
-  // There are 2 forms of fcvtl2 - extend to double or extend to float.
-  SDValue Extract = N->getOperand(0);
-  EVT VT = N->getValueType(0);
-  EVT NarrowVT = Extract.getValueType();
-  if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
-      (VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
-    return false;
-
-  // Optionally look past a bitcast.
-  Extract = peekThroughBitcasts(Extract);
-  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
-    return false;
-
-  // Match extract from start of high half index.
-  // Example: v8i16 -> v4i16 means the extract must begin at index 4.
-  unsigned ExtractIndex = Extract.getConstantOperandVal(1);
-  if (ExtractIndex != Extract.getValueType().getVectorNumElements())
-    return false;
-
-  auto Opcode = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
-  CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0));
-  return true;
-}
-
 static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
                                 SDValue &Opd0, unsigned &Immr, unsigned &Imms,
                                 unsigned NumberOfIgnoredLowBits = 0,
@@ -4272,11 +4242,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
       return;
     break;
 
-  case ISD::FP_EXTEND:
-    if (tryHighFPExt(Node))
-      return;
-    break;
-
   case ISD::OR:
     if (tryBitfieldInsertOp(Node))
       return;

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index ea35219b151911..cd2b9df27a240f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -133,6 +133,18 @@ def extract_high_v4i32 :
 def extract_high_v2i64 :
     ComplexPattern<v1i64, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
 
+def extract_high_v8f16 :
+    ComplexPattern<v4f16, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
+def extract_high_v4f32 :
+    ComplexPattern<v2f32, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>;
+
+def gi_extract_high_v8f16 :
+  GIComplexOperandMatcher<v4s16, "selectExtractHigh">,
+  GIComplexPatternEquiv<extract_high_v8f16>;
+def gi_extract_high_v4f32 :
+  GIComplexOperandMatcher<v2s32, "selectExtractHigh">,
+  GIComplexPatternEquiv<extract_high_v4f32>;
+
 def extract_high_dup_v8i16 :
    BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 4))>;
 def extract_high_dup_v4i32 :

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 2509edc6c82933..3450ed29d1426e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4690,11 +4690,16 @@ defm FCVTL  : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
 def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
           (FCVTLv4i16 V64:$Rn)>;
 def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
-                                                              (i64 4)))),
+                                                                (i64 4)))),
+          (FCVTLv8i16 V128:$Rn)>;
+def : Pat<(v2f64 (any_fpextend (v2f32 V64:$Rn))),
+          (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (any_fpextend (v2f32 (extract_high_v4f32 (v4f32 V128:$Rn))))),
+          (FCVTLv4i32 V128:$Rn)>;
+def : Pat<(v4f32 (any_fpextend (v4f16 V64:$Rn))),
+          (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (any_fpextend (v4f16 (extract_high_v8f16 (v8f16 V128:$Rn))))),
           (FCVTLv8i16 V128:$Rn)>;
-def : Pat<(v2f64 (any_fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
-
-def : Pat<(v4f32 (any_fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
 
 defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
 defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;

diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index b8af6a2be46837..f1ba1aa7ba8968 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -436,6 +436,8 @@ class AArch64InstructionSelector : public InstructionSelector {
 
   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
 
+  ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
+
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                       int OpIdx = -1) const;
   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
@@ -6877,6 +6879,23 @@ AArch64InstructionSelector::selectArithExtendedRegister(
            }}};
 }
 
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
+  if (!Root.isReg())
+    return std::nullopt;
+  MachineRegisterInfo &MRI =
+      Root.getParent()->getParent()->getParent()->getRegInfo();
+
+  MachineInstr *Extract = getDefIgnoringCopies(Root.getReg(), MRI);
+  if (Extract && Extract->getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+      Root.getReg() == Extract->getOperand(1).getReg()) {
+    Register ExtReg = Extract->getOperand(2).getReg();
+    return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
+  }
+
+  return std::nullopt;
+}
+
 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {

diff  --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
index d5a99f2d770890..c1cdfa6419deb6 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
@@ -74,11 +74,10 @@ define <4 x i32> @fptoui_v4i32_v4f32(<4 x float> %x) #0 {
 define <4 x i64> @fptosi_v4i64_v4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: fptosi_v4i64_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    fcvtl2 v1.2d, v0.4s
 ; CHECK-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-NEXT:    fcvtl v1.2d, v1.2s
-; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    ret
   %val = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0
   ret <4 x i64> %val
@@ -87,11 +86,10 @@ define <4 x i64> @fptosi_v4i64_v4f32(<4 x float> %x) #0 {
 define <4 x i64> @fptoui_v4i64_v4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: fptoui_v4i64_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    fcvtl2 v1.2d, v0.4s
 ; CHECK-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-NEXT:    fcvtl v1.2d, v1.2s
-; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-NEXT:    fcvtzu v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
 ; CHECK-NEXT:    ret
   %val = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0
   ret <4 x i64> %val

diff  --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index 1b5a00885b29fe..6d2fd54ce8d254 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -75,9 +75,9 @@ define <4 x double> @fpext_v4f32_v4f64(<4 x float> %a) {
 ;
 ; CHECK-GI-LABEL: fpext_v4f32_v4f64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-GI-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-GI-NEXT:    fcvtl v2.2d, v0.2s
+; CHECK-GI-NEXT:    fcvtl2 v1.2d, v0.4s
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fpext <4 x float> %a to <4 x double>
@@ -224,9 +224,9 @@ define <8 x float> @fpext_v8f16_v8f32(<8 x half> %a) {
 ;
 ; CHECK-GI-LABEL: fpext_v8f16_v8f32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NEXT:    fcvtl2 v1.4s, v0.8h
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fpext <8 x half> %a to <8 x float>