[llvm] [AMDGPU] Added isCommutable attribute to V_ADD_NC_U16 (PR #111789)

Sun Oct 20 00:59:58 PDT 2024

https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/111789

>From 0f38c4defe03b3fb00b4fade0d37975219389b6d Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Mon, 7 Oct 2024 12:03:22 +0530
Subject: [PATCH 1/7] added isCommutable attribute to V_ADD_NC_U16

---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 78ca7a2f258cb3..69a7a77f5ee8eb 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -870,9 +870,11 @@ let SubtargetPredicate = isGFX10Plus in {
     def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
     def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
   }
-
-  defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
-  defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
+  
+  let isCommutable = 1 in {
+    defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
+    defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
+  } // End isCommutable = 1
 
   def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
   def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;

>From 93a657c7f257726578a1491517c9ac8ef22e20f1 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 10 Oct 2024 13:48:56 +0530
Subject: [PATCH 2/7] added swap for imm values and global values

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 48 +++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0d153df5c3977c..4d7daa21ffa033 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2742,6 +2742,50 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
   return &MI;
 }
 
+static MachineInstr *swapNonRegOperands(MachineInstr &MI,
+                                             MachineOperand &NonRegOp1,
+                                             MachineOperand &NonRegOp2) {
+  if (NonRegOp1.isImm() && NonRegOp2.isImm()){
+    auto TargetFlags = NonRegOp1.getTargetFlags();
+    auto NonRegVal = NonRegOp1.getImm();
+
+    NonRegOp1.setImm(NonRegOp2.getImm());
+    NonRegOp2.setImm(NonRegVal);
+    NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
+    NonRegOp2.setTargetFlags(TargetFlags);
+  }
+  // --> Still working on the FrameInfo case.
+  // else if (NonRegOp1.isFI() && NonRegOp2.isFI()){
+  //   auto TargetFlags = NonRegOp1.getTargetFlags();
+  //   auto FrameIndex = NonRegOp1.getIndex();  
+  //   NonRegOp1.ChangeToFrameIndex(NonRegOp2.getIndex());  
+  //   NonRegOp2.ChangeToFrameIndex(FrameIndex);  
+  //   NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
+  //   NonRegOp2.setTargetFlags(TargetFlags);
+  // }
+  else if (NonRegOp1.isGlobal() && NonRegOp2.isImm()){
+    auto TargetFlags = NonRegOp1.getTargetFlags();
+    auto GlobalVal = NonRegOp1.getGlobal();  
+    auto GlobalOffset = NonRegOp1.getOffset();  
+    NonRegOp1.ChangeToImmediate(NonRegOp2.getImm());  
+    NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
+    NonRegOp2.ChangeToGA(GlobalVal, GlobalOffset, TargetFlags);  
+    NonRegOp2.setTargetFlags(TargetFlags);
+  }
+  else if (NonRegOp1.isImm() && NonRegOp2.isGlobal()){
+    auto TargetFlags = NonRegOp2.getTargetFlags();
+    auto GlobalVal = NonRegOp2.getGlobal();  
+    auto GlobalOffset = NonRegOp2.getOffset();  
+    NonRegOp2.ChangeToImmediate(NonRegOp1.getImm());  
+    NonRegOp2.setTargetFlags(NonRegOp1.getTargetFlags());
+    NonRegOp1.ChangeToGA(GlobalVal, GlobalOffset, TargetFlags);  
+    NonRegOp1.setTargetFlags(TargetFlags);
+  }
+  else 
+    return nullptr;
+  return &MI;
+}
+
 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                   unsigned Src0Idx,
                                                   unsigned Src1Idx) const {
@@ -2780,8 +2824,10 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     if (isOperandLegal(MI, Src1Idx, &Src0))
       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
   } else {
+      CommutedMI = swapNonRegOperands(MI, Src1, Src0);
+    
     // FIXME: Found two non registers to commute. This does happen.
-    return nullptr;
+    // return nullptr;
   }
 
   if (CommutedMI) {

>From 58cd70f4c66c280b0399b6afe345557af16eb07c Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 10 Oct 2024 14:41:58 +0530
Subject: [PATCH 3/7] Modified test case, will be reverted back

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp      |  5 +--
 llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 37 +++++++++++++++++++--
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4d7daa21ffa033..af9224baf8d7b2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2754,7 +2754,7 @@ static MachineInstr *swapNonRegOperands(MachineInstr &MI,
     NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
     NonRegOp2.setTargetFlags(TargetFlags);
   }
-  // --> Still working on the FrameInfo case.
+  // --> Still working on the FrameInfo case :)
   // else if (NonRegOp1.isFI() && NonRegOp2.isFI()){
   //   auto TargetFlags = NonRegOp1.getTargetFlags();
   //   auto FrameIndex = NonRegOp1.getIndex();  
@@ -2825,9 +2825,6 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
   } else {
       CommutedMI = swapNonRegOperands(MI, Src1, Src0);
-    
-    // FIXME: Found two non registers to commute. This does happen.
-    // return nullptr;
   }
 
   if (CommutedMI) {
diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
index b9397f9d5d4ddc..b86778fe49e5eb 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
+++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
@@ -4,14 +4,47 @@
 # GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
 # GCN: %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
 # GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
+# ---
+# name: test_machine_cse_op_sel
+# body: |
+#   bb.0:
+#     %0:vgpr_32 = IMPLICIT_DEF
+#     %1:vgpr_32 = IMPLICIT_DEF
+#     %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
+#     %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
+#     DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
+# ...
+
+--- |
+  @bar = internal global i32 10, align 4
+  @foo = internal global i32 10, align 4
+  
+  define i32 @test_machine_cse_op_sel() {
+  entry:
+    %0 = load i32, ptr @bar, align 4
+    ret i32 %0
+  }
+...
 ---
 name: test_machine_cse_op_sel
 body: |
   bb.0:
     %0:vgpr_32 = IMPLICIT_DEF
     %1:vgpr_32 = IMPLICIT_DEF
-    %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
-    %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
+
+    ; Case 1:
+    %2:vgpr_32 = V_ADD_NC_U16_e64 0, 1, 0, 2, 0, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_ADD_NC_U16_e64 0, 2, 0, 1, 0, 0, implicit $mode, implicit $exec
     DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
+
+    ; Case 2:
+    %4:vgpr_32 = V_ADD_NC_U16_e64 0, 1, 0, @bar, 0, 0, implicit $mode, implicit $exec
+    %5:vgpr_32 = V_ADD_NC_U16_e64 0, @bar, 0, 1, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE2_B32_gfx9 undef %6:vgpr_32, %4, %5, 0, 1, 0, implicit $exec
+
+    ; Case 3:
+    ;%7:vgpr_32 = V_ADD_NC_U16_e64 0, @foo, 0, @bar, 0, 0, implicit $mode, implicit $exec
+    ;%8:vgpr_32 = V_ADD_NC_U16_e64 0, @bar, 0, @foo, 0, 0, implicit $mode, implicit $exec
+    ;DS_WRITE2_B32_gfx9 undef %9:vgpr_32, %7, %8, 0, 1, 0, implicit $exec
 ...
 

>From 34f871f30da85bd657137b52df4258b9e8edb806 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Mon, 14 Oct 2024 10:52:30 +0530
Subject: [PATCH 4/7] temp commit

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index af9224baf8d7b2..38970a7a0ef86b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2756,7 +2756,7 @@ static MachineInstr *swapNonRegOperands(MachineInstr &MI,
   }
   // --> Still working on the FrameInfo case :)
   // else if (NonRegOp1.isFI() && NonRegOp2.isFI()){
-  //   auto TargetFlags = NonRegOp1.getTargetFlags();
+  //   auto TargetFlags = NonRegOp 1.getTargetFlags();
   //   auto FrameIndex = NonRegOp1.getIndex();  
   //   NonRegOp1.ChangeToFrameIndex(NonRegOp2.getIndex());  
   //   NonRegOp2.ChangeToFrameIndex(FrameIndex);  

>From 5d592ba7302dd961f4d3e24fd710abbc30139d18 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Sun, 20 Oct 2024 13:26:02 +0530
Subject: [PATCH 5/7] temp commit

---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 69a7a77f5ee8eb..806f7703b0bf5f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -871,10 +871,8 @@ let SubtargetPredicate = isGFX10Plus in {
     def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
   }
   
-  let isCommutable = 1 in {
-    defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
-    defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
-  } // End isCommutable = 1
+  defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
+  defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
 
   def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
   def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;

>From 69a7caacae54fc1b0499515d60d64fa3826231de Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Sun, 20 Oct 2024 13:27:53 +0530
Subject: [PATCH 6/7] temp commit

---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 806f7703b0bf5f..78ca7a2f258cb3 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -870,7 +870,7 @@ let SubtargetPredicate = isGFX10Plus in {
     def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
     def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
   }
-  
+
   defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
   defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
 

>From a6e4a13f0cb5a36b73508cbbc9eef67c7dfba411 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Sun, 20 Oct 2024 13:29:42 +0530
Subject: [PATCH 7/7] temp

---
 llvm/test/CodeGen/AMDGPU/commute-op-sel.mir | 40 ++-------------------
 1 file changed, 3 insertions(+), 37 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
index b86778fe49e5eb..8eee879d302949 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
+++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
@@ -5,46 +5,12 @@
 # GCN: %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
 # GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
 # ---
-# name: test_machine_cse_op_sel
-# body: |
-#   bb.0:
-#     %0:vgpr_32 = IMPLICIT_DEF
-#     %1:vgpr_32 = IMPLICIT_DEF
-#     %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
-#     %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
-#     DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
-# ...
-
---- |
-  @bar = internal global i32 10, align 4
-  @foo = internal global i32 10, align 4
-  
-  define i32 @test_machine_cse_op_sel() {
-  entry:
-    %0 = load i32, ptr @bar, align 4
-    ret i32 %0
-  }
-...
----
 name: test_machine_cse_op_sel
 body: |
   bb.0:
     %0:vgpr_32 = IMPLICIT_DEF
     %1:vgpr_32 = IMPLICIT_DEF
-
-    ; Case 1:
-    %2:vgpr_32 = V_ADD_NC_U16_e64 0, 1, 0, 2, 0, 0, implicit $mode, implicit $exec
-    %3:vgpr_32 = V_ADD_NC_U16_e64 0, 2, 0, 1, 0, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
     DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
-
-    ; Case 2:
-    %4:vgpr_32 = V_ADD_NC_U16_e64 0, 1, 0, @bar, 0, 0, implicit $mode, implicit $exec
-    %5:vgpr_32 = V_ADD_NC_U16_e64 0, @bar, 0, 1, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE2_B32_gfx9 undef %6:vgpr_32, %4, %5, 0, 1, 0, implicit $exec
-
-    ; Case 3:
-    ;%7:vgpr_32 = V_ADD_NC_U16_e64 0, @foo, 0, @bar, 0, 0, implicit $mode, implicit $exec
-    ;%8:vgpr_32 = V_ADD_NC_U16_e64 0, @bar, 0, @foo, 0, 0, implicit $mode, implicit $exec
-    ;DS_WRITE2_B32_gfx9 undef %9:vgpr_32, %7, %8, 0, 1, 0, implicit $exec
-...
-
+...
\ No newline at end of file