[llvm] [GlobalISel] Commute G_FMUL and G_FADD constant LHS to RHS. (PR #65298)
Amara Emerson via llvm-commits
llvm-commits@lists.llvm.org
Tue Sep 5 08:25:22 PDT 2023
https://github.com/aemerson updated https://github.com/llvm/llvm-project/pull/65298:
From bda7b386d52e44b24a15f2f095ce062f725b114f Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara@apple.com>
Date: Fri, 1 Sep 2023 10:29:38 -0700
Subject: [PATCH 1/2] [GlobalISel] Commute G_FMUL and G_FADD constant LHS to
RHS.
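This mirrors the existing integer combine for the floating-point opcodes
G_FADD and G_FMUL, so that later combines only need to look for constants
on the RHS. A sketch of the transform in MIR (register names are
illustrative, not taken from the patch):

  %cst:_(s32) = G_FCONSTANT float 2.000000e+00
  %mul:_(s32) = G_FMUL %cst, %x
  ; after the combine, operands 1 and 2 are swapped:
  %mul:_(s32) = G_FMUL %x, %cst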
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 6 +++
.../include/llvm/Target/GlobalISel/Combine.td | 23 ++++++----
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 16 +++++++
.../combine-commute-fp-const-lhs.mir | 45 +++++++++++++++++++
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 4 +-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll | 18 ++++----
llvm/test/CodeGen/AMDGPU/rsq.f64.ll | 44 +++++++++---------
7 files changed, 114 insertions(+), 42 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 4b10ca3b7eb69a..4f87590b9cbf97 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -787,6 +787,12 @@ class CombinerHelper {
/// Match constant LHS ops that should be commuted.
bool matchCommuteConstantToRHS(MachineInstr &MI);
+ /// Match constant LHS FP ops that should be commuted.
+ bool matchCommuteFPConstantToRHS(MachineInstr &MI);
+
+ /// Given a binop \p MI, commute operands 1 and 2.
+ void applyCommuteBinOpOperands(MachineInstr &MI);
+
private:
/// Given a non-indexed load or store instruction \p MI, find an offset that
/// can be usefully and legally folded into it as a post-indexing operation.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index b76f739cdcaa22..3742cd2b6fc0b3 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -396,20 +396,25 @@ def select_to_logical : GICombineRule<
// Fold (C op x) -> (x op C)
// TODO: handle more isCommutable opcodes
// TODO: handle compares (currently not marked as isCommutable)
-def commute_constant_to_rhs : GICombineRule<
+def commute_int_constant_to_rhs : GICombineRule<
(defs root:$root),
(match (wip_match_opcode G_ADD, G_MUL, G_AND, G_OR, G_XOR):$root,
[{ return Helper.matchCommuteConstantToRHS(*${root}); }]),
- (apply [{
- Observer.changingInstr(*${root});
- Register LHSReg = ${root}->getOperand(1).getReg();
- Register RHSReg = ${root}->getOperand(2).getReg();
- ${root}->getOperand(1).setReg(RHSReg);
- ${root}->getOperand(2).setReg(LHSReg);
- Observer.changedInstr(*${root});
- }])
+ (apply [{ Helper.applyCommuteBinOpOperands(*${root}); }])
+>;
+
+def commute_fp_constant_to_rhs : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_FADD, G_FMUL):$root,
+ [{ return Helper.matchCommuteFPConstantToRHS(*${root}); }]),
+ (apply [{ Helper.applyCommuteBinOpOperands(*${root}); }])
>;
+def commute_constant_to_rhs : GICombineGroup<[
+ commute_int_constant_to_rhs,
+ commute_fp_constant_to_rhs
+]>;
+
// Fold x op 0 -> x
def right_identity_zero_frags : GICombinePatFrag<
(outs root:$dst), (ins $x),
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1604de13b49403..9e1fe6f0a16b56 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6026,6 +6026,22 @@ bool CombinerHelper::matchCommuteConstantToRHS(MachineInstr &MI) {
!getIConstantVRegVal(RHS, MRI);
}
+bool CombinerHelper::matchCommuteFPConstantToRHS(MachineInstr &MI) {
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ return getFConstantVRegValWithLookThrough(LHS, MRI, false).has_value() &&
+ !getFConstantVRegValWithLookThrough(RHS, MRI, false).has_value();
+}
+
+void CombinerHelper::applyCommuteBinOpOperands(MachineInstr &MI) {
+ Observer.changingInstr(MI);
+ Register LHSReg = MI.getOperand(1).getReg();
+ Register RHSReg = MI.getOperand(2).getReg();
+ MI.getOperand(1).setReg(RHSReg);
+ MI.getOperand(2).setReg(LHSReg);
+ Observer.changedInstr(MI);
+}
+
bool CombinerHelper::tryCombine(MachineInstr &MI) {
if (tryCombineCopy(MI))
return true;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir
new file mode 100644
index 00000000000000..9f5b402d32d6a6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir
@@ -0,0 +1,45 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: fadd
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $s0
+
+ ; CHECK-LABEL: name: fadd
+ ; CHECK: liveins: $s0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: %add:_(s32) = G_FADD [[COPY]], %cst
+ ; CHECK-NEXT: $s0 = COPY %add(s32)
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:_(s32) = COPY $s0
+ %cst:_(s32) = G_FCONSTANT float 1.000000e+00
+ %add:_(s32) = G_FADD %cst, %0
+ $s0 = COPY %add
+ RET_ReallyLR
+
+...
+---
+name: fmul
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $s0
+
+ ; CHECK-LABEL: name: fmul
+ ; CHECK: liveins: $s0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: %mul:_(s32) = G_FMUL [[COPY]], %cst
+ ; CHECK-NEXT: $s0 = COPY %mul(s32)
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:_(s32) = COPY $s0
+ %cst:_(s32) = G_FCONSTANT float 2.000000e+00
+ %mul:_(s32) = G_FMUL %cst, %0
+ $s0 = COPY %mul
+ RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index be3cfa7d88c4cd..33a2c9b2ce8bd3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -1728,7 +1728,7 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_mov_b32_e32 v2, 0x3c00
; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1
-; GFX8-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1789,7 +1789,7 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_mov_b32_e32 v2, 0x3c00
; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1
-; GFX8-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index 621badb4d395ef..a7eab3105a5252 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -440,7 +440,7 @@ define double @v_rcp_f64_arcp_afn(double %x) {
; GCN-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GCN-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GCN-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GCN-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; GCN-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -453,7 +453,7 @@ define double @v_rcp_f64_arcp_afn(double %x) {
; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; GFX10-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -469,7 +469,7 @@ define double @v_rcp_f64_arcp_afn(double %x) {
; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; GFX11-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
@@ -1436,8 +1436,8 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
; GCN-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GCN-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GCN-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; GCN-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
-; GCN-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
+; GCN-NEXT: v_mul_f64 v[8:9], v[4:5], 1.0
+; GCN-NEXT: v_mul_f64 v[10:11], v[6:7], 1.0
; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
; GCN-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
@@ -1457,8 +1457,8 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
; GFX10-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; GFX10-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
-; GFX10-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
+; GFX10-NEXT: v_mul_f64 v[8:9], v[4:5], 1.0
+; GFX10-NEXT: v_mul_f64 v[10:11], v[6:7], 1.0
; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
@@ -1483,8 +1483,8 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GFX11-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
-; GFX11-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
+; GFX11-NEXT: v_mul_f64 v[8:9], v[4:5], 1.0
+; GFX11-NEXT: v_mul_f64 v[10:11], v[6:7], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
; GFX11-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 9caea1b3b3853d..046df70e95240d 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -3132,7 +3132,7 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3203,7 +3203,7 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3279,7 +3279,7 @@ define double @v_rsq_f64__afn(double %x) {
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3350,7 +3350,7 @@ define double @v_rsq_f64__afn(double %x) {
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3427,7 +3427,7 @@ define double @v_neg_rsq_f64__afn(double %x) {
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3499,7 +3499,7 @@ define double @v_neg_rsq_f64__afn(double %x) {
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3575,7 +3575,7 @@ define double @v_rsq_f64__afn_ninf(double %x) {
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3646,7 +3646,7 @@ define double @v_rsq_f64__afn_ninf(double %x) {
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3722,7 +3722,7 @@ define double @v_rsq_f64__afn_nnan(double %x) {
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3793,7 +3793,7 @@ define double @v_rsq_f64__afn_nnan(double %x) {
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3869,7 +3869,7 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3940,7 +3940,7 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -4017,7 +4017,7 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -4089,7 +4089,7 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -4383,8 +4383,8 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
-; SI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
+; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 1.0
+; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
@@ -4506,8 +4506,8 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; VI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
-; VI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
+; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 1.0
+; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
@@ -4587,7 +4587,7 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
@@ -4662,7 +4662,7 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0
@@ -4747,7 +4747,7 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -4818,7 +4818,7 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; VI-GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
From 22a39d676f2f11ca6409f2a1bdd8bd9367469517 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara@apple.com>
Date: Tue, 5 Sep 2023 08:16:45 -0700
Subject: [PATCH 2/2] Also match vector splats and add tests.
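Using m_GFCstOrSplat, the match now also fires when the LHS is a splat
build vector of an FP constant. A sketch (illustrative register names):

  %c:_(s32) = G_FCONSTANT float 2.000000e+00
  %cst:_(<4 x s32>) = G_BUILD_VECTOR %c, %c, %c, %c
  %mul:_(<4 x s32>) = G_FMUL %cst, %x
  ; commuted to: %mul:_(<4 x s32>) = G_FMUL %x, %cst

Non-splat build vectors are not commuted; see the fmul_vector_nonsplat
test below.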
---
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 6 +-
.../combine-commute-fp-const-lhs.mir | 73 +++++++++++++++++++
2 files changed, 77 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 9e1fe6f0a16b56..e7862b7a26211e 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6029,8 +6029,10 @@ bool CombinerHelper::matchCommuteConstantToRHS(MachineInstr &MI) {
bool CombinerHelper::matchCommuteFPConstantToRHS(MachineInstr &MI) {
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
- return getFConstantVRegValWithLookThrough(LHS, MRI, false).has_value() &&
- !getFConstantVRegValWithLookThrough(RHS, MRI, false).has_value();
+ std::optional<FPValueAndVReg> ValAndVReg;
+ if (!mi_match(LHS, MRI, m_GFCstOrSplat(ValAndVReg)))
+ return false;
+ return !mi_match(RHS, MRI, m_GFCstOrSplat(ValAndVReg));
}
void CombinerHelper::applyCommuteBinOpOperands(MachineInstr &MI) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir
index 9f5b402d32d6a6..76d82884a7b1f1 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir
@@ -43,3 +43,76 @@ body: |
$s0 = COPY %mul
RET_ReallyLR
...
+---
+name: fmul_vector
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $q0
+
+ ; CHECK-LABEL: name: fmul_vector
+ ; CHECK: liveins: $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+ ; CHECK-NEXT: %cst_scalar:_(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: %cst:_(<4 x s32>) = G_BUILD_VECTOR %cst_scalar(s32), %cst_scalar(s32), %cst_scalar(s32), %cst_scalar(s32)
+ ; CHECK-NEXT: %mul:_(<4 x s32>) = G_FMUL [[COPY]], %cst
+ ; CHECK-NEXT: $q0 = COPY %mul(<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:_(<4 x s32>) = COPY $q0
+ %cst_scalar:_(s32) = G_FCONSTANT float 2.000000e+00
+ %cst:_(<4 x s32>) = G_BUILD_VECTOR %cst_scalar, %cst_scalar, %cst_scalar, %cst_scalar
+ %mul:_(<4 x s32>) = G_FMUL %cst, %0
+ $q0 = COPY %mul
+ RET_ReallyLR
+...
+---
+name: fmul_splat_with_undef
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $q0
+
+ ; CHECK-LABEL: name: fmul_splat_with_undef
+ ; CHECK: liveins: $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+ ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: %cst_scalar:_(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: %cst:_(<4 x s32>) = G_BUILD_VECTOR %undef(s32), %undef(s32), %cst_scalar(s32), %cst_scalar(s32)
+ ; CHECK-NEXT: %mul:_(<4 x s32>) = G_FMUL [[COPY]], %cst
+ ; CHECK-NEXT: $q0 = COPY %mul(<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:_(<4 x s32>) = COPY $q0
+ %undef:_(s32) = G_IMPLICIT_DEF
+ %cst_scalar:_(s32) = G_FCONSTANT float 2.000000e+00
+ %cst:_(<4 x s32>) = G_BUILD_VECTOR %undef, %undef, %cst_scalar, %cst_scalar
+ %mul:_(<4 x s32>) = G_FMUL %cst, %0
+ $q0 = COPY %mul
+ RET_ReallyLR
+...
+---
+name: fmul_vector_nonsplat
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $q0, $s0
+
+ ; CHECK-LABEL: name: fmul_vector_nonsplat
+ ; CHECK: liveins: $q0, $s0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+ ; CHECK-NEXT: %scalar:_(s32) = COPY $s0
+ ; CHECK-NEXT: %cst_scalar:_(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: %cst:_(<4 x s32>) = G_BUILD_VECTOR %cst_scalar(s32), %cst_scalar(s32), %cst_scalar(s32), %scalar(s32)
+ ; CHECK-NEXT: %mul:_(<4 x s32>) = G_FMUL %cst, [[COPY]]
+ ; CHECK-NEXT: $q0 = COPY %mul(<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:_(<4 x s32>) = COPY $q0
+ %scalar:_(s32) = COPY $s0
+ %cst_scalar:_(s32) = G_FCONSTANT float 2.000000e+00
+ %cst:_(<4 x s32>) = G_BUILD_VECTOR %cst_scalar, %cst_scalar, %cst_scalar, %scalar
+ %mul:_(<4 x s32>) = G_FMUL %cst, %0
+ $q0 = COPY %mul
+ RET_ReallyLR
+...