[llvm] Test branch wave reduce (PR #111366)

via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 7 22:28:36 PDT 2024


https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/111366

From 30454d1ceb4e126d08ed01d19fbcd7cc513ecec6 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 24 Sep 2024 15:35:42 +0530
Subject: [PATCH 01/13] Added wave reduce intrinsics for int
 add,sub,or,xor,and. Still have to extend for unsigned sub and floats.

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   7 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   9 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 236 +++++++++++++++++-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  35 +++
 .../global_atomics_iterative_scan_fp.ll       |   2 +-
 5 files changed, 281 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4cd32a0502c66d..097a074859ca10 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2121,6 +2121,13 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
 
 def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
 
 def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f2c9619cb8276a..c5ee2944e3015e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4851,7 +4851,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       break;
     }
     case Intrinsic::amdgcn_wave_reduce_umin:
-    case Intrinsic::amdgcn_wave_reduce_umax: {
+    case Intrinsic::amdgcn_wave_reduce_umax: 
+    case Intrinsic::amdgcn_wave_reduce_and: 
+    case Intrinsic::amdgcn_wave_reduce_or: 
+    case Intrinsic::amdgcn_wave_reduce_xor: 
+    case Intrinsic::amdgcn_wave_reduce_usub: 
+    case Intrinsic::amdgcn_wave_reduce_sub: 
+    case Intrinsic::amdgcn_wave_reduce_uadd: 
+    case Intrinsic::amdgcn_wave_reduce_add: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 10108866a7005a..f787f3d71fc045 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4859,10 +4859,220 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    // These operations with a uniform value i.e. SGPR are idempotent.
-    // Reduced value will be same as given sgpr.
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
-    RetBB = &BB;
+    switch(Opc){
+      case AMDGPU::S_MIN_U32:
+      case AMDGPU::S_MAX_U32:
+      case AMDGPU::S_AND_B32:
+      case AMDGPU::S_OR_B32:
+        // These operations with a uniform value i.e. SGPR are idempotent.
+        // Reduced value will be same as given sgpr.
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+        RetBB = &BB;
+        break;
+      // TODO --> add support for Unsigned ADD and unsigned SUB.
+      case AMDGPU::S_XOR_B32:
+      case AMDGPU::S_ADD_U32:
+      case AMDGPU::S_ADD_I32:
+      // case AMDGPU::S_SUB_U32:
+      case AMDGPU::S_SUB_I32:{
+        MachineBasicBlock::iterator I = BB.end();
+        Register SrcReg = MI.getOperand(1).getReg();
+
+        // Create Control flow for loop
+        // Split MI's Machine Basic block into For loop
+        auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+        // Create virtual registers required for lowering.
+        const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+        const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+        Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
+        Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+
+        Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+        Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+
+        Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+        Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
+
+        bool IsWave32 = ST.isWave32();
+        unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+        unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+        // Create initial values of induction variable from Exec, Accumulator and
+        // insert branch instr to newly created ComputeBlock
+        uint32_t InitalValue = 0;
+        
+        auto TmpSReg =
+            BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
+        BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
+            .addImm(InitalValue);
+        BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+        // Start constructing ComputeLoop
+        I = ComputeLoop->end();
+        auto Accumulator =
+            BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+                .addReg(InitalValReg)
+                .addMBB(&BB);
+        auto ActiveBits =
+            BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+                .addReg(TmpSReg->getOperand(0).getReg())
+                .addMBB(&BB);
+
+        // Perform the computations
+        unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+        auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+                      .addReg(ActiveBits->getOperand(0).getReg());
+        auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), CountOfActiveLanesReg)
+                                  .addReg(Accumulator->getOperand(0).getReg())
+                                  .addImm(1);
+
+        // Manipulate the iterator to get the next active lane
+        unsigned BITSETOpc =
+            IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+        auto NewActiveBits =
+            BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+                .addReg(FF1->getOperand(0).getReg())
+                .addReg(ActiveBits->getOperand(0).getReg());
+
+        // Add phi nodes
+        Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+            .addMBB(ComputeLoop);
+        ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+            .addMBB(ComputeLoop);
+
+        // Creating branching
+        unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+        BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+            .addReg(NewActiveBits->getOperand(0).getReg())
+            .addImm(0);
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+            .addMBB(ComputeLoop);
+
+        I = ComputeEnd->begin();
+        switch(Opc){
+          case AMDGPU::S_XOR_B32:{
+            // Performing an XOR operation on a uniform value
+            // depends on the number of active lanes. If there
+            // is an even number of active lanes, the XOR
+            // will result in 0. If there is an odd number
+            // of active lanes, the XOR will result in the
+            // same value as that in the SGPR. This comes from 
+            // the fact that A^A = 0 and A^0 = A.
+
+            // Create basic block to check the parity.
+            // MachineFunction &MF = *ComputeEnd->getParent();
+            // MachineBasicBlock *CheckParity = MF.CreateMachineBasicBlock();
+            // MachineFunction::iterator It = ComputeEnd->getIterator();
+            // MF.insert(It, CheckParity);
+            // ComputeLoop->addSuccessor(CheckParity);
+            // ComputeLoop->removeSuccessor(ComputeEnd);
+
+            Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+            // Register Product = MRI.createVirtualRegister(DstRegClass);
+            // Register OddResult = MRI.createVirtualRegister(DstRegClass);
+            // MachineBasicBlock *Even = MF.CreateMachineBasicBlock();  
+            // MachineBasicBlock *Odd = MF.CreateMachineBasicBlock();  
+            // MF.push_back(Even);  
+            // MF.push_back(Odd); 
+            // CheckParity->addSuccessor(Even);  
+            // CheckParity->addSuccessor(Odd);  
+            // Even->addSuccessor(ComputeEnd);  
+            // Odd->addSuccessor(ComputeEnd);     
+
+            // If the LSB is set, the number is odd, else it is even.
+            // TODO --> is FF0 faster or left-shift by 31 faster or AND 0xfffffffe??
+            // I = CheckParity->begin();
+            auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+                .addReg(NewAccumulator->getOperand(0).getReg())
+                .addImm(1);
+
+            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)  
+                .addReg(ParityReg->getOperand(0).getReg())  
+                .addImm(SrcReg);
+            // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))  
+            //     .addMBB(Even);  
+            // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_BRANCH))  
+            //     .addMBB(Odd); 
+
+            // If there are an even number of active lanes, the result is 0.
+            // I = Even->begin();  
+            // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_MOV_B32), EvenResult).addImm(0); 
+            // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_BRANCH))  
+            //     .addMBB(ComputeEnd); 
+  
+            // If there are an odd number of active lanes, the result is the value itself.
+            // I = Odd->begin(); 
+            // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_MOV_B32), OddResult).addReg(SrcReg);  
+            // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_BRANCH))  
+            //     .addMBB(ComputeEnd);  
+
+            // Add PHI node to get the appropriate result.
+            // I = ComputeEnd->begin();
+            // auto PhiNode =
+            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::PHI), DstReg)
+            //         .addReg(EvenResult)
+            //         .addMBB(Even);
+            // PhiNode.addReg(OddResult)
+            //     .addMBB(Odd);
+            break;
+          }
+          case AMDGPU::S_SUB_U32:{
+            // Doubt --> how can you have a negative unsigned value?? 
+            break;
+          }
+          case AMDGPU::S_SUB_I32:{
+            // TODO --> use 2's complement or subtract from 0 to find the negation of the number.
+            Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+            // Take the negation of the source operand.
+            auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+            // Multiply the negated value with the number of active lanes.
+            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(InvertedValReg->getOperand(0).getReg()).addReg(NewAccumulator->getOperand(0).getReg());
+            break;
+          }
+          // Doubt --> does SSA form still have to be followed for MIR?
+          case AMDGPU::S_ADD_U32:{
+            // For unsigned multiplication, zero extend the inputs to 64bits,
+            // perform an unsigned multiplication on them and then store the 
+            // 32 lower order bits as the result. 
+            Register ExtendedInput = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);  
+            Register ZeroExtension = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);  
+            Register ExtendedCount = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);  
+            Register UnsignedProduct = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);  
+
+            auto ZeroExtented = 
+                BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MOV_B32), ZeroExtension)  
+                    .addImm(0);  
+
+            // Zero extend the input to 64bits.
+            auto Input_64 = 
+            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedInput)  
+                .addReg(SrcReg).addImm(AMDGPU::sub0)  
+                .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1); 
+            
+            // Zero extend the number of active lanes to 64bits.
+            auto Count_64 = 
+            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedCount)  
+                .addReg(NewAccumulator->getOperand(0).getReg()).addImm(AMDGPU::sub0)  
+                .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1); 
+
+            auto Product = 
+            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_U64), UnsignedProduct)  
+                .addReg(Input_64->getOperand(0).getReg())
+                .addReg(Count_64->getOperand(0).getReg()); 
+
+            // Store the lower 32bits of the product as the result.
+            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(Product->getOperand(0).getReg(), 0, AMDGPU::sub0);
+            break;
+          }
+          case AMDGPU::S_ADD_I32:
+            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(SrcReg).addReg(NewAccumulator->getOperand(0).getReg());
+        }
+
+        RetBB = ComputeEnd;
+      }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4898,7 +5108,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
     // Create initail values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
+    // insert branch instr to newly created ComputeBlock
     uint32_t InitalValue =
         (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
     auto TmpSReg =
@@ -4970,6 +5180,20 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
@@ -6771,7 +6995,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
-  // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
+  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64  // TODO --> `..are zero-extended to 32-bits, then we ..` , should this be zero-extended from 32 bits?
   // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
   // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
   KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
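
A note on the scalar (SGPR) fast path added above: it reduces a wave-uniform
value v over n active lanes using a few closed-form identities. A minimal
standalone sketch of that arithmetic in plain C++ (the function names are
illustrative only and are not part of the patch):

    #include <cstdint>

    // Reducing a wave-uniform 32-bit value v over n active lanes.
    uint32_t reduce_uniform_add(uint32_t v, uint32_t n) { return v * n; }        // v added n times
    uint32_t reduce_uniform_sub(uint32_t v, uint32_t n) { return (0u - v) * n; } // (-v) scaled by n
    uint32_t reduce_uniform_xor(uint32_t v, uint32_t n) { return v * (n & 1u); } // since A^A = 0 and A^0 = A
    uint32_t reduce_uniform_and(uint32_t v, uint32_t /*n*/) { return v; }        // idempotent (likewise or/min/max)

This mirrors the S_MUL_I32 / S_AND_B32 sequences emitted in ComputeEnd for the
signed cases: a multiply of the (possibly negated) source with the active-lane
count, plus an AND with 1 to pick the parity for the XOR case.
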
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9afb29d95abd7d..b61094cd5f6309 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -264,6 +264,41 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
   }
+
+  def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
+  }
+  
+  def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> {
+  }
+  
+  def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> {
+  }
+  
+  def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
+  }
+  
+  def WAVE_REDUCE_AND_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
+  }
+  
+  def WAVE_REDUCE_OR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
+  }
+  
+  def WAVE_REDUCE_XOR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> {
+  }
 }
 
 let usesCustomInserter = 1, Defs = [VCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index d1e50bd560cb23..02942254cc555b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -156,7 +156,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #
 ; IR-DPP:       14:
 ; IR-DPP-NEXT:    ret void
 ;
-  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst
   ret void
 }
 

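As an aside, the ComputeLoop constructed for the SGPR path above has the same
shape as the existing iterative lowering further down in lowerWaveReduce: walk
the EXEC mask one set bit at a time (S_FF1_I32_B32/B64 plus S_BITSET0_B32/B64)
and update the accumulator until the mask is empty, so for the add/sub cases
the accumulator ends up holding the number of active lanes. A rough C++ model
of that control flow, assuming a wave64 mask (illustrative only, not the
emitted MIR):

    #include <cstdint>

    // Count active lanes the way ComputeLoop does: repeatedly clear the
    // lowest set bit of the EXEC copy and bump the accumulator until no
    // bits remain (S_CMP_LG + S_CBRANCH_SCC1 close the loop).
    uint32_t count_active_lanes(uint64_t exec_mask) {
      uint32_t accum = 0;          // InitalValReg / AccumulatorReg
      uint64_t active = exec_mask; // LoopIterator, a copy of EXEC
      while (active != 0) {
        active &= active - 1;      // clears the lane found by S_FF1 (S_BITSET0)
        accum += 1;                // the loop body applies Opc with immediate 1
      }
      return accum;                // feeds the multiply in ComputeEnd
    }
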
From 2ea5ae516d888e1c3c302f25a8297f6e49c46fd4 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 26 Sep 2024 13:33:57 +0530
Subject: [PATCH 02/13] S_MUL fiasciao

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  15 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 180 +++++++++++-----------
 2 files changed, 104 insertions(+), 91 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 097a074859ca10..c80168c01bc9ad 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2109,7 +2109,7 @@ def int_amdgcn_s_quadmask :
 def int_amdgcn_s_wqm :
   DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
 
-class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
     [data_ty],
     [
       LLVMMatchType<0>,   // llvm value to reduce (SGPR/VGPR)
@@ -2119,6 +2119,19 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
 
+//multiclass AMDGPUWaveReducee {
+//  foreach Opcode = ["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"] in 
+//   def int_amdgcn_wave_reduce_#Opcode : AMDGPUWaveReduce;
+//}
+
+//multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
+//  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
+//    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
+//}
+
+//WaveReduceDefs<["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"]>;
+//list<string> Operations
+
 def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9b4f25ba10d42b..646e7a3d415f72 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4875,7 +4875,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       case AMDGPU::S_XOR_B32:
       case AMDGPU::S_ADD_U32:
       case AMDGPU::S_ADD_I32:
-      // case AMDGPU::S_SUB_U32:
+      case AMDGPU::S_SUB_U32:
       case AMDGPU::S_SUB_I32:{
         MachineBasicBlock::iterator I = BB.end();
         Register SrcReg = MI.getOperand(1).getReg();
@@ -4963,115 +4963,115 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             // same value as that in the SGPR. This comes from 
             // the fact that A^A = 0 and A^0 = A.
 
-            // Create basic block to check the parity.
-            // MachineFunction &MF = *ComputeEnd->getParent();
-            // MachineBasicBlock *CheckParity = MF.CreateMachineBasicBlock();
-            // MachineFunction::iterator It = ComputeEnd->getIterator();
-            // MF.insert(It, CheckParity);
-            // ComputeLoop->addSuccessor(CheckParity);
-            // ComputeLoop->removeSuccessor(ComputeEnd);
-
             Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-            // Register Product = MRI.createVirtualRegister(DstRegClass);
-            // Register OddResult = MRI.createVirtualRegister(DstRegClass);
-            // MachineBasicBlock *Even = MF.CreateMachineBasicBlock();  
-            // MachineBasicBlock *Odd = MF.CreateMachineBasicBlock();  
-            // MF.push_back(Even);  
-            // MF.push_back(Odd); 
-            // CheckParity->addSuccessor(Even);  
-            // CheckParity->addSuccessor(Odd);  
-            // Even->addSuccessor(ComputeEnd);  
-            // Odd->addSuccessor(ComputeEnd);     
-
-            // If the LSB is set, the number is odd, else it is even.
-            // TODO --> is FF0 faster or left-shift by 31 faster or AND 0xfffffffe??
-            // I = CheckParity->begin();
+
             auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1);
 
             BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)  
                 .addReg(ParityReg->getOperand(0).getReg())  
-                .addImm(SrcReg);
-            // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))  
-            //     .addMBB(Even);  
-            // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_BRANCH))  
-            //     .addMBB(Odd); 
-
-            // If there are an even number of active lanes, the result is 0.
-            // I = Even->begin();  
-            // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_MOV_B32), EvenResult).addImm(0); 
-            // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_BRANCH))  
-            //     .addMBB(ComputeEnd); 
-  
-            // If there are an odd number of active lanes, the result is the value itself.
-            // I = Odd->begin(); 
-            // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_MOV_B32), OddResult).addReg(SrcReg);  
-            // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_BRANCH))  
-            //     .addMBB(ComputeEnd);  
-
-            // Add PHI node to get the appropriate result.
-            // I = ComputeEnd->begin();
-            // auto PhiNode =
-            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::PHI), DstReg)
-            //         .addReg(EvenResult)
-            //         .addMBB(Even);
-            // PhiNode.addReg(OddResult)
-            //     .addMBB(Odd);
-            break;
-          }
-          case AMDGPU::S_SUB_U32:{
-            // Doubt --> how can you have a negative unsigned value?? 
+                .addReg(SrcReg);
             break;
           }
+          // case AMDGPU::S_SUB_U32:{
+          // //   // Doubt --> how can you have a negative unsigned value?? 
+          //   Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+          //   Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          //   Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+          //   // Take the negation of the source operand.
+          //   auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+
+          //   auto V_SrcReg = 
+          //       BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
+          //           .addReg(InvertedValReg->getOperand(0).getReg());
+            
+          //   auto ProductVal = 
+          //       BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)  
+          //           .addReg(V_SrcReg->getOperand(0).getReg())
+          //           .addReg(NewAccumulator->getOperand(0).getReg())  
+          //           .addReg(AMDGPU::EXEC, RegState::Implicit);
+
+          //   BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+          //       .addReg(ProductVal->getOperand(0).getReg());
+
+          //   break;
+          // }
+          case AMDGPU::S_SUB_U32:
           case AMDGPU::S_SUB_I32:{
             // TODO --> use 2's complement or subtract from 0 to find the negation of the number.
             Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+            // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+            // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
             // Take the negation of the source operand.
             auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
-            // Multiply the negated value with the number of active lanes.
-            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(InvertedValReg->getOperand(0).getReg()).addReg(NewAccumulator->getOperand(0).getReg());
-            break;
+            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+                .addReg(InvertedValReg->getOperand(0).getReg())
+                .addReg(NewAccumulator->getOperand(0).getReg());
+
+            // auto V_SrcReg = 
+            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
+            //         .addReg(InvertedValReg->getOperand(0).getReg());
+            
+            // auto ProductVal = 
+            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)  
+            //         .addReg(V_SrcReg->getOperand(0).getReg())
+            //         .addReg(NewAccumulator->getOperand(0).getReg())  
+            //         .addReg(AMDGPU::EXEC, RegState::Implicit)  
+            //         .setMIFlag(MachineInstr::MIFlag::NoUWrap)  
+            //         .setMIFlag(MachineInstr::MIFlag::NoSWrap);
+
+            // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+            //     .addReg(ProductVal->getOperand(0).getReg());
+
+            // break;
           }
           // Doubt --> does SSA form still have to be followed for MIR?
-          case AMDGPU::S_ADD_U32:{
-            // For unsigned multiplication, zero extend the inputs to 64bits,
-            // perform an unsigned multiplication on them and then store the 
-            // 32 lower order bits as the result. 
-            Register ExtendedInput = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);  
-            Register ZeroExtension = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);  
-            Register ExtendedCount = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);  
-            Register UnsignedProduct = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);  
-
-            auto ZeroExtented = 
-                BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MOV_B32), ZeroExtension)  
-                    .addImm(0);  
-
-            // Zero extend the input to 64bits.
-            auto Input_64 = 
-            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedInput)  
-                .addReg(SrcReg).addImm(AMDGPU::sub0)  
-                .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1); 
+          case AMDGPU::S_ADD_U32:
+          case AMDGPU::S_ADD_I32:{
+            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+                .addReg(SrcReg)
+                .addReg(NewAccumulator->getOperand(0).getReg());
+            // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+            // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+            // auto V_SrcReg = 
+            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
+            //         .addReg(SrcReg);
             
-            // Zero extend the number of active lanes to 64bits.
-            auto Count_64 = 
-            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedCount)  
-                .addReg(NewAccumulator->getOperand(0).getReg()).addImm(AMDGPU::sub0)  
-                .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1); 
-
-            auto Product = 
-            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_U64), UnsignedProduct)  
-                .addReg(Input_64->getOperand(0).getReg())
-                .addReg(Count_64->getOperand(0).getReg()); 
-
-            // Store the lower 32bits of the product as the result.
-            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(Product->getOperand(0).getReg(), 0, AMDGPU::sub0);
+            // auto ProductVal = 
+            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)  
+            //         .addReg(V_SrcReg->getOperand(0).getReg())
+            //         .addReg(NewAccumulator->getOperand(0).getReg())  
+            //         .addReg(AMDGPU::EXEC, RegState::Implicit)  
+            //         .setMIFlag(MachineInstr::MIFlag::NoUWrap)  
+            //         .setMIFlag(MachineInstr::MIFlag::NoSWrap);
+
+            // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+            //     .addReg(ProductVal->getOperand(0).getReg());
             break;
           }
-          case AMDGPU::S_ADD_I32:
-            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(SrcReg).addReg(NewAccumulator->getOperand(0).getReg());
+          // case AMDGPU::S_ADD_U32:{
+          //   Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          //   Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+          //   auto V_SrcReg = 
+          //       BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
+          //           .addReg(SrcReg);
+
+          //   auto ProductVal = 
+          //       BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)  
+          //           .addReg(V_SrcReg->getOperand(0).getReg())
+          //           .addReg(NewAccumulator->getOperand(0).getReg())  
+          //           .addReg(AMDGPU::EXEC, RegState::Implicit);
+
+          //   BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+          //       .addReg(ProductVal->getOperand(0).getReg());
+          //   break;
+          // }
         }
-
         RetBB = ComputeEnd;
       }
     }

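A note on why a single S_MUL_I32 can serve both the signed and the unsigned
add/sub reductions merged in this revision: the low 32 bits of a product do
not depend on whether the operands are interpreted as signed or unsigned, so
multiplying the value by the active-lane count gives the same bit pattern
either way. A small self-contained check of that property (plain C++,
illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t v = 0xFFFFFFF5u;                           // bit pattern of -11
      uint32_t n = 7;                                     // active-lane count
      uint32_t unsigned_prod = v * n;                     // multiply modulo 2^32
      int32_t  signed_prod   = (int32_t)-11 * (int32_t)n; // -77
      assert(unsigned_prod == (uint32_t)signed_prod);     // same low 32 bits
      return 0;
    }
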
From c0dd1471380cb33344aa358b12bc13583d0e1cc6 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 26 Sep 2024 14:11:51 +0530
Subject: [PATCH 03/13] temp commit

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 646e7a3d415f72..5a06070dd64feb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4868,7 +4868,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       case AMDGPU::S_OR_B32:
         // These operations with a uniform value i.e. SGPR are idempotent.
         // Reduced value will be same as given sgpr.
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+        // bool IsWave32 = ST.isWave32();
+        unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+        BuildMI(BB, MI, DL, TII->get(MovOpc), DstReg).addReg(SrcReg);
         RetBB = &BB;
         break;
       // TODO --> add support for Unsigned ADD and unsigned SUB.

From 93e8802817dee2b79258732d3ac01e0a92ba6625 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 26 Sep 2024 17:34:14 +0530
Subject: [PATCH 04/13] temp commit

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  10 ++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 130 +++++++---------------
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  50 +++++++++
 3 files changed, 102 insertions(+), 88 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c80168c01bc9ad..299513a95e2989 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2133,13 +2133,23 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
 //list<string> Operations
 
 def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_min : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fmin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_max : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fmax : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fadd : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fsub : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_uand : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fand : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_uor : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_for : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
 
 def int_amdgcn_readfirstlane :
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5a06070dd64feb..cd057702d6072d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4863,9 +4863,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   if (isSGPR) {
     switch(Opc){
       case AMDGPU::S_MIN_U32:
+      case AMDGPU::S_MIN_I32:
+      case AMDGPU::S_MIN_F32:
       case AMDGPU::S_MAX_U32:
+      case AMDGPU::S_MAX_I32:
+      case AMDGPU::S_MAX_F32:
       case AMDGPU::S_AND_B32:
-      case AMDGPU::S_OR_B32:
+      case AMDGPU::S_OR_B32:{
         // These operations with a uniform value i.e. SGPR are idempotent.
         // Reduced value will be same as given sgpr.
         // bool IsWave32 = ST.isWave32();
@@ -4873,12 +4877,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         BuildMI(BB, MI, DL, TII->get(MovOpc), DstReg).addReg(SrcReg);
         RetBB = &BB;
         break;
-      // TODO --> add support for Unsigned ADD and unsigned SUB.
+      }
       case AMDGPU::S_XOR_B32:
       case AMDGPU::S_ADD_U32:
       case AMDGPU::S_ADD_I32:
+      case AMDGPU::S_ADD_F32:
       case AMDGPU::S_SUB_U32:
-      case AMDGPU::S_SUB_I32:{
+      case AMDGPU::S_SUB_I32:
+      case AMDGPU::S_SUB_F32:{
         MachineBasicBlock::iterator I = BB.end();
         Register SrcReg = MI.getOperand(1).getReg();
 
@@ -4970,109 +4976,37 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1);
-
+// S_MUL_I32
+            // auto MulOp = 
+            // Can you have one float and one int op? I don't think you can; need to handle the float case separately.
             BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)  
-                .addReg(ParityReg->getOperand(0).getReg())  
-                .addReg(SrcReg);
+                .addReg(SrcReg)
+                .addReg(ParityReg->getOperand(0).getReg());
             break;
           }
-          // case AMDGPU::S_SUB_U32:{
-          // //   // Doubt --> how can you have a negative unsigned value?? 
-          //   Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
-          //   Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-          //   Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-          //   // Take the negation of the source operand.
-          //   auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
-
-          //   auto V_SrcReg = 
-          //       BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
-          //           .addReg(InvertedValReg->getOperand(0).getReg());
-            
-          //   auto ProductVal = 
-          //       BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)  
-          //           .addReg(V_SrcReg->getOperand(0).getReg())
-          //           .addReg(NewAccumulator->getOperand(0).getReg())  
-          //           .addReg(AMDGPU::EXEC, RegState::Implicit);
-
-          //   BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
-          //       .addReg(ProductVal->getOperand(0).getReg());
-
-          //   break;
-          // }
           case AMDGPU::S_SUB_U32:
-          case AMDGPU::S_SUB_I32:{
+          case AMDGPU::S_SUB_I32:
+          case AMDGPU::S_SUB_F32:{
             // TODO --> use 2's complement or subtract from 0 to find the negation of the number.
             Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
-            // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-            // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
+            
             // Take the negation of the source operand.
             auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
             BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
                 .addReg(InvertedValReg->getOperand(0).getReg())
                 .addReg(NewAccumulator->getOperand(0).getReg());
-
-            // auto V_SrcReg = 
-            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
-            //         .addReg(InvertedValReg->getOperand(0).getReg());
-            
-            // auto ProductVal = 
-            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)  
-            //         .addReg(V_SrcReg->getOperand(0).getReg())
-            //         .addReg(NewAccumulator->getOperand(0).getReg())  
-            //         .addReg(AMDGPU::EXEC, RegState::Implicit)  
-            //         .setMIFlag(MachineInstr::MIFlag::NoUWrap)  
-            //         .setMIFlag(MachineInstr::MIFlag::NoSWrap);
-
-            // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
-            //     .addReg(ProductVal->getOperand(0).getReg());
-
-            // break;
+            break;
           }
           // Doubt --> does SSA form still have to be followed for MIR?
           case AMDGPU::S_ADD_U32:
-          case AMDGPU::S_ADD_I32:{
-            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+          case AMDGPU::S_ADD_I32:
+          case AMDGPU::S_ADD_F32:{
+            auto Opcode = Opc == AMDGPU::S_ADD_U32 || Opc == AMDGPU::S_ADD_I32 ? AMDGPU::S_MUL_I32 : AMDGPU::S_MUL_F32;  
+            BuildMI(*ComputeEnd, I, DL, TII->get(Opcode), DstReg)
                 .addReg(SrcReg)
                 .addReg(NewAccumulator->getOperand(0).getReg());
-            // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-            // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-            // auto V_SrcReg = 
-            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
-            //         .addReg(SrcReg);
-            
-            // auto ProductVal = 
-            //     BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)  
-            //         .addReg(V_SrcReg->getOperand(0).getReg())
-            //         .addReg(NewAccumulator->getOperand(0).getReg())  
-            //         .addReg(AMDGPU::EXEC, RegState::Implicit)  
-            //         .setMIFlag(MachineInstr::MIFlag::NoUWrap)  
-            //         .setMIFlag(MachineInstr::MIFlag::NoSWrap);
-
-            // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
-            //     .addReg(ProductVal->getOperand(0).getReg());
             break;
           }
-          // case AMDGPU::S_ADD_U32:{
-          //   Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-          //   Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-          //   auto V_SrcReg = 
-          //       BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
-          //           .addReg(SrcReg);
-
-          //   auto ProductVal = 
-          //       BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)  
-          //           .addReg(V_SrcReg->getOperand(0).getReg())
-          //           .addReg(NewAccumulator->getOperand(0).getReg())  
-          //           .addReg(AMDGPU::EXEC, RegState::Implicit);
-
-          //   BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
-          //       .addReg(ProductVal->getOperand(0).getReg());
-          //   break;
-          // }
         }
         RetBB = ComputeEnd;
       }
@@ -5182,20 +5116,40 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_F32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_F32);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_F32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_F32);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_F32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
   case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b61094cd5f6309..534b4d2c052482 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -260,15 +260,40 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
     [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
   }
 
+  def WAVE_REDUCE_UMIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> {
+  }
+
+  def WAVE_REDUCE_UMIN_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set f32:$sdst, (int_amdgcn_wave_reduce_fmin f32:$src, i32:$strategy))]> {
+  }
+
   def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
   }
 
+  def WAVE_REDUCE_UMAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> {
+  }
+
+  def WAVE_REDUCE_UMAX_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set f32:$sdst, (int_amdgcn_wave_reduce_fmax f32:$src, i32:$strategy))]> {
+  }
+
   def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
   }
+
+  def WAVE_REDUCE_ADD_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set f32:$sdst, (int_amdgcn_wave_reduce_add f32:$src, i32:$strategy))]> {
+  }
   
   def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
@@ -284,16 +309,41 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
   }
+
+  def WAVE_REDUCE_SUB_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set f32:$sdst, (int_amdgcn_wave_reduce_fsub f32:$src, i32:$strategy))]> {
+  }
   
   def WAVE_REDUCE_AND_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_uand i32:$src, i32:$strategy))]> {
+  }
+
+  def WAVE_REDUCE_AND_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
   }
+
+  def WAVE_REDUCE_AND_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set f32:$sdst, (int_amdgcn_wave_reduce_fand f32:$src, i32:$strategy))]> {
+  }
   
   def WAVE_REDUCE_OR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set i32:$sdst, (int_amdgcn_wave_reduce_uor i32:$src, i32:$strategy))]> {
+  }
+
+  def WAVE_REDUCE_OR_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
   }
+
+  def WAVE_REDUCE_OR_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32: $src, VSrc_b32:$strategy),
+    [(set f32:$sdst, (int_amdgcn_wave_reduce_for f32:$src, i32:$strategy))]> {
+  }
   
   def WAVE_REDUCE_XOR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),

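For the float variants introduced in this revision, the uniform-value fast
path again comes down to scaling by the active-lane count (S_MUL_F32 is
selected for the S_ADD_F32 case). A tiny illustration of the intended
identities (plain C++, illustrative only; it ignores rounding differences
between repeated addition and a single multiply):

    #include <cstdint>

    // Reducing a wave-uniform float v over n active lanes.
    float reduce_uniform_fadd(float v, uint32_t n) { return v * (float)n; }  // ~ v + v + ... + v
    float reduce_uniform_fsub(float v, uint32_t n) { return -v * (float)n; } // negate, then scale
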
From d6dc7a5e23e2c5b839fabfbb85259cb3efdd0197 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Fri, 27 Sep 2024 10:03:36 +0530
Subject: [PATCH 05/13] Changed atomic optimizer to emit wave.reduce intrinsic.

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  12 -
 .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp   | 346 +++++++++---------
 2 files changed, 177 insertions(+), 181 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 299513a95e2989..fc6d13899a809d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2119,18 +2119,6 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
 
-//multiclass AMDGPUWaveReducee {
-//  foreach Opcode = ["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"] in 
-//   def int_amdgcn_wave_reduce_#Opcode : AMDGPUWaveReduce;
-//}
-
-//multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
-//  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
-//    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
-//}
-
-//WaveReduceDefs<["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"]>;
-//list<string> Operations
 
 def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_min : AMDGPUWaveReduce;
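
The experimental rewrite of the atomic optimizer below disables the
ballot/mbcnt bookkeeping and instead emits a wave-reduce intrinsic call
(hard-coded to umin in this work-in-progress state). For a wave-uniform
input, the values the now commented-out code derived from the ballot reduce
to the following; a wave_reduce_* call is meant to produce the same reduction
directly. A compact C++ restatement (illustrative only, not the optimizer's
actual API):

    #include <bitset>
    #include <cstdint>

    // Combined per-wave value the optimizer used to build from a ballot of
    // active lanes when the input v is uniform across the wave.
    uint32_t combined_add(uint32_t v, uint64_t ballot) {
      return v * (uint32_t)std::bitset<64>(ballot).count();        // v times #active lanes
    }
    uint32_t combined_xor(uint32_t v, uint64_t ballot) {
      return v * ((uint32_t)std::bitset<64>(ballot).count() & 1u); // v times parity of lane count
    }
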
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index f408a013d7a379..8bc3bc81002adf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -682,7 +682,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // lanes that are around only for the purposes of derivatives to take part
   // in any cross-lane communication, and we use a branch on whether the lane is
   // live to do this.
-  if (IsPixelShader) {
+  if (false) {
     // Record I's original position as the entry block.
     PixelEntryBB = I.getParent();
 
@@ -705,29 +705,35 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
   Value *V = I.getOperand(ValIdx);
+  // ------------------------------------
+  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  CallInst *const WaveRed =
+      B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
+
+  // ------------------------------------
 
   // We need to know how many lanes are active within the wavefront, and we do
   // this by doing a ballot of active lanes.
-  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
-  CallInst *const Ballot =
-      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+  // Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  // CallInst *const Ballot =
+  //     B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
 
   // We need to know how many lanes are active within the wavefront that are
   // below us. If we counted each lane linearly starting from 0, a lane is
   // below us only if its associated index was less than ours. We do this by
   // using the mbcnt intrinsic.
-  Value *Mbcnt;
-  if (ST->isWave32()) {
-    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
-                              {Ballot, B.getInt32(0)});
-  } else {
-    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
-    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
-    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
-                              {ExtractLo, B.getInt32(0)});
-    Mbcnt =
-        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
-  }
+  // Value *Mbcnt;
+  // if (ST->isWave32()) {
+  //   Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+  //                             {Ballot, B.getInt32(0)});
+  // } else {
+  //   Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
+  //   Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
+  //   Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+  //                             {ExtractLo, B.getInt32(0)});
+  //   Mbcnt =
+  //       B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
+  // }
 
   Function *F = I.getFunction();
   LLVMContext &C = F->getContext();
@@ -745,13 +751,14 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   Value *ExclScan = nullptr;
   Value *NewV = nullptr;
 
-  const bool NeedResult = !I.use_empty();
+  // const bool NeedResult = !I.use_empty();
+  const bool NeedResult = false;
 
   BasicBlock *ComputeLoop = nullptr;
   BasicBlock *ComputeEnd = nullptr;
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
-  if (ValDivergent) {
+  if (false) {
     if (ScanImpl == ScanOptions::DPP) {
       // First we need to set all inactive invocations to the identity value, so
       // that they can correctly contribute to the final result.
@@ -785,54 +792,55 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       llvm_unreachable("Atomic Optimzer is disabled for None strategy");
     }
   } else {
-    switch (Op) {
-    default:
-      llvm_unreachable("Unhandled atomic op");
-
-    case AtomicRMWInst::Add:
-    case AtomicRMWInst::Sub: {
-      // The new value we will be contributing to the atomic operation is the
-      // old value times the number of active lanes.
-      Value *const Ctpop = B.CreateIntCast(
-          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
-      NewV = buildMul(B, V, Ctpop);
-      break;
-    }
-    case AtomicRMWInst::FAdd:
-    case AtomicRMWInst::FSub: {
-      Value *const Ctpop = B.CreateIntCast(
-          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
-      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
-      NewV = B.CreateFMul(V, CtpopFP);
-      break;
-    }
-    case AtomicRMWInst::And:
-    case AtomicRMWInst::Or:
-    case AtomicRMWInst::Max:
-    case AtomicRMWInst::Min:
-    case AtomicRMWInst::UMax:
-    case AtomicRMWInst::UMin:
-    case AtomicRMWInst::FMin:
-    case AtomicRMWInst::FMax:
-      // These operations with a uniform value are idempotent: doing the atomic
-      // operation multiple times has the same effect as doing it once.
-      NewV = V;
-      break;
-
-    case AtomicRMWInst::Xor:
-      // The new value we will be contributing to the atomic operation is the
-      // old value times the parity of the number of active lanes.
-      Value *const Ctpop = B.CreateIntCast(
-          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
-      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
-      break;
-    }
+  //   switch (Op) {
+  //   default:
+  //     llvm_unreachable("Unhandled atomic op");
+
+  //   case AtomicRMWInst::Add:
+  //   case AtomicRMWInst::Sub: {
+  //     // The new value we will be contributing to the atomic operation is the
+  //     // old value times the number of active lanes.
+  //     // Value *const Ctpop = B.CreateIntCast(
+  //     //     B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+  //     // NewV = buildMul(B, V, Ctpop);
+  //     break;
+  //   }
+  //   case AtomicRMWInst::FAdd:
+  //   case AtomicRMWInst::FSub: {
+  //     // Value *const Ctpop = B.CreateIntCast(
+  //     //     B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
+  //     // Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+  //     // NewV = B.CreateFMul(V, CtpopFP);
+  //     break;
+  //   }
+  //   case AtomicRMWInst::And:
+  //   case AtomicRMWInst::Or:
+  //   case AtomicRMWInst::Max:
+  //   case AtomicRMWInst::Min:
+  //   case AtomicRMWInst::UMax:
+  //   case AtomicRMWInst::UMin:
+  //   case AtomicRMWInst::FMin:
+  //   case AtomicRMWInst::FMax:
+  //     // These operations with a uniform value are idempotent: doing the atomic
+  //     // operation multiple times has the same effect as doing it once.
+  //     NewV = V;
+  //     break;
+
+  //   case AtomicRMWInst::Xor:
+  //     // The new value we will be contributing to the atomic operation is the
+  //     // old value times the parity of the number of active lanes.
+  //     Value *const Ctpop = B.CreateIntCast(
+  //         B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+  //     NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
+  //     break;
+  //   }
+  // }
   }
 
   // We only want a single lane to enter our new control flow, and we do this
   // by checking if there are any active lanes below us. Only one lane will
   // have 0 active lanes below us, so that will be the only one to progress.
-  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
+  // Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
 
   // Store I's original basic block before we split the block.
   BasicBlock *const OriginalBB = I.getParent();
@@ -842,8 +850,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // new block such that:
   // entry --> single_lane -\
   //       \------------------> exit
-  Instruction *const SingleLaneTerminator =
-      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
+  // Instruction *const SingleLaneTerminator =
+  //     SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
 
   // At this point, we have split the I's block to allow one lane in wavefront
   // to update the precomputed reduced value. Also, completed the codegen for
@@ -854,135 +862,135 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // ComputeEnd block. We also need to set up predecessor to next block when
   // single lane done updating the final reduced value.
   BasicBlock *Predecessor = nullptr;
-  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
+  // if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
     // Move terminator from I's block to ComputeEnd block.
     //
     // OriginalBB is known to have a branch as terminator because
     // SplitBlockAndInsertIfThen will have inserted one.
-    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
-    B.SetInsertPoint(ComputeEnd);
-    Terminator->removeFromParent();
-    B.Insert(Terminator);
+    // BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
+    // B.SetInsertPoint(ComputeEnd);
+    // Terminator->removeFromParent();
+    // B.Insert(Terminator);
 
     // Branch to ComputeLoop Block unconditionally from the I's block for
     // iterative approach.
-    B.SetInsertPoint(OriginalBB);
-    B.CreateBr(ComputeLoop);
+    // B.SetInsertPoint(OriginalBB);
+    // B.CreateBr(ComputeLoop);
 
     // Update the dominator tree for new control flow.
-    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
-        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
-         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});
+    // SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
+    //     {{DominatorTree::Insert, OriginalBB, ComputeLoop},
+    //      {DominatorTree::Insert, ComputeLoop, ComputeEnd}});
 
     // We're moving the terminator from EntryBB to ComputeEnd, make sure we move
     // the DT edges as well.
-    for (auto *Succ : Terminator->successors()) {
-      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
-      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
-    }
+  //   for (auto *Succ : Terminator->successors()) {
+  //     DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
+  //     DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
+  //   }
 
-    DTU.applyUpdates(DomTreeUpdates);
+  //   DTU.applyUpdates(DomTreeUpdates);
 
-    Predecessor = ComputeEnd;
-  } else {
-    Predecessor = OriginalBB;
-  }
+  //   Predecessor = ComputeEnd;
+  // } else {
+  //   Predecessor = OriginalBB;
+  // }
   // Move the IR builder into single_lane next.
-  B.SetInsertPoint(SingleLaneTerminator);
+  // B.SetInsertPoint(SingleLaneTerminator);
 
   // Clone the original atomic operation into single lane, replacing the
   // original value with our newly created one.
   Instruction *const NewI = I.clone();
   B.Insert(NewI);
-  NewI->setOperand(ValIdx, NewV);
+  NewI->setOperand(ValIdx, WaveRed);
 
   // Move the IR builder into exit next, and start inserting just before the
   // original instruction.
-  B.SetInsertPoint(&I);
-
-  if (NeedResult) {
-    // Create a PHI node to get our new atomic result into the exit block.
-    PHINode *const PHI = B.CreatePHI(Ty, 2);
-    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
-    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
-
-    // We need to broadcast the value who was the lowest active lane (the first
-    // lane) to all other lanes in the wavefront.
-    Value *BroadcastI = nullptr;
-    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
-
-    // Now that we have the result of our single atomic operation, we need to
-    // get our individual lane's slice into the result. We use the lane offset
-    // we previously calculated combined with the atomic result value we got
-    // from the first lane, to get our lane's index into the atomic result.
-    Value *LaneOffset = nullptr;
-    if (ValDivergent) {
-      if (ScanImpl == ScanOptions::DPP) {
-        LaneOffset =
-            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
-      } else if (ScanImpl == ScanOptions::Iterative) {
-        LaneOffset = ExclScan;
-      } else {
-        llvm_unreachable("Atomic Optimzer is disabled for None strategy");
-      }
-    } else {
-      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
-                                      : B.CreateIntCast(Mbcnt, Ty, false);
-      switch (Op) {
-      default:
-        llvm_unreachable("Unhandled atomic op");
-      case AtomicRMWInst::Add:
-      case AtomicRMWInst::Sub:
-        LaneOffset = buildMul(B, V, Mbcnt);
-        break;
-      case AtomicRMWInst::And:
-      case AtomicRMWInst::Or:
-      case AtomicRMWInst::Max:
-      case AtomicRMWInst::Min:
-      case AtomicRMWInst::UMax:
-      case AtomicRMWInst::UMin:
-      case AtomicRMWInst::FMin:
-      case AtomicRMWInst::FMax:
-        LaneOffset = B.CreateSelect(Cond, Identity, V);
-        break;
-      case AtomicRMWInst::Xor:
-        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
-        break;
-      case AtomicRMWInst::FAdd:
-      case AtomicRMWInst::FSub: {
-        LaneOffset = B.CreateFMul(V, Mbcnt);
-        break;
-      }
-      }
-    }
-    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
-    if (isAtomicFloatingPointTy) {
-      // For fadd/fsub the first active lane of LaneOffset should be the
-      // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
-      // is V * +0.0 which might have the wrong sign or might be nan (if V is
-      // inf or nan).
-      //
-      // For all floating point ops if the in-memory value was a nan then the
-      // binop we just built might have quieted it or changed its payload.
-      //
-      // Correct all these problems by using BroadcastI as the result in the
-      // first active lane.
-      Result = B.CreateSelect(Cond, BroadcastI, Result);
-    }
-
-    if (IsPixelShader) {
-      // Need a final PHI to reconverge to above the helper lane branch mask.
-      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
-
-      PHINode *const PHI = B.CreatePHI(Ty, 2);
-      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
-      PHI->addIncoming(Result, I.getParent());
-      I.replaceAllUsesWith(PHI);
-    } else {
-      // Replace the original atomic instruction with the new one.
-      I.replaceAllUsesWith(Result);
-    }
-  }
+  // B.SetInsertPoint(&I);
+
+  // if (NeedResult) {
+  //   // Create a PHI node to get our new atomic result into the exit block.
+  //   PHINode *const PHI = B.CreatePHI(Ty, 2);
+  //   PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
+  //   PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
+
+  //   // We need to broadcast the value who was the lowest active lane (the first
+  //   // lane) to all other lanes in the wavefront.
+  //   Value *BroadcastI = nullptr;
+  //   BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
+
+  //   // Now that we have the result of our single atomic operation, we need to
+  //   // get our individual lane's slice into the result. We use the lane offset
+  //   // we previously calculated combined with the atomic result value we got
+  //   // from the first lane, to get our lane's index into the atomic result.
+  //   Value *LaneOffset = nullptr;
+  //   if (ValDivergent) {
+  //     if (ScanImpl == ScanOptions::DPP) {
+  //       LaneOffset =
+  //           B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+  //     } else if (ScanImpl == ScanOptions::Iterative) {
+  //       LaneOffset = ExclScan;
+  //     } else {
+  //       llvm_unreachable("Atomic Optimzer is disabled for None strategy");
+  //     }
+  //   } else {
+  //     Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
+  //                                     : B.CreateIntCast(Mbcnt, Ty, false);
+  //     switch (Op) {
+  //     default:
+  //       llvm_unreachable("Unhandled atomic op");
+  //     case AtomicRMWInst::Add:
+  //     case AtomicRMWInst::Sub:
+  //       LaneOffset = buildMul(B, V, Mbcnt);
+  //       break;
+  //     case AtomicRMWInst::And:
+  //     case AtomicRMWInst::Or:
+  //     case AtomicRMWInst::Max:
+  //     case AtomicRMWInst::Min:
+  //     case AtomicRMWInst::UMax:
+  //     case AtomicRMWInst::UMin:
+  //     case AtomicRMWInst::FMin:
+  //     case AtomicRMWInst::FMax:
+  //       LaneOffset = B.CreateSelect(Cond, Identity, V);
+  //       break;
+  //     case AtomicRMWInst::Xor:
+  //       LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
+  //       break;
+  //     case AtomicRMWInst::FAdd:
+  //     case AtomicRMWInst::FSub: {
+  //       LaneOffset = B.CreateFMul(V, Mbcnt);
+  //       break;
+  //     }
+  //     }
+  //   }
+  //   Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
+  //   if (isAtomicFloatingPointTy) {
+  //     // For fadd/fsub the first active lane of LaneOffset should be the
+  //     // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
+  //     // is V * +0.0 which might have the wrong sign or might be nan (if V is
+  //     // inf or nan).
+  //     //
+  //     // For all floating point ops if the in-memory value was a nan then the
+  //     // binop we just built might have quieted it or changed its payload.
+  //     //
+  //     // Correct all these problems by using BroadcastI as the result in the
+  //     // first active lane.
+  //     Result = B.CreateSelect(Cond, BroadcastI, Result);
+  //   }
+
+  //   if (IsPixelShader) {
+  //     // Need a final PHI to reconverge to above the helper lane branch mask.
+  //     B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+
+  //     PHINode *const PHI = B.CreatePHI(Ty, 2);
+  //     PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
+  //     PHI->addIncoming(Result, I.getParent());
+  //     I.replaceAllUsesWith(PHI);
+  //   } else {
+  //     // Replace the original atomic instruction with the new one.
+  //     I.replaceAllUsesWith(Result);
+  //   }
+  // }
 
   // And delete the original.
   I.eraseFromParent();

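For reference, here is a minimal sketch of what the experimental path in the hunk above reduces to once the ballot/mbcnt and scan machinery is commented out. The helper name is invented for illustration; B, I, ValIdx and Int32Ty are the names already in scope in optimizeAtomic, the headers are the ones AMDGPUAtomicOptimizer.cpp already includes, and umin is the placeholder intrinsic this WIP stage uses regardless of the actual atomic op.

    static void reduceThenAtomic(IRBuilder<> &B, Instruction &I,
                                 unsigned ValIdx, Type *Int32Ty) {
      // Pre-reduce the per-lane value to a single wave-uniform value; the
      // second operand is the strategy selector, passed as 0 throughout
      // this series.
      Value *V = I.getOperand(ValIdx);
      Value *WaveRed = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin,
                                         Int32Ty, {V, B.getInt32(0)});
      // Re-issue the atomic with the pre-reduced value and drop the original.
      Instruction *NewI = I.clone();
      B.Insert(NewI);
      NewI->setOperand(ValIdx, WaveRed);
      I.eraseFromParent();
    }

With the value reduced up front, the only remaining job of the old control flow is to restrict the atomic itself to one lane, which the next patch restores.
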
>From dd799eedb86c3e6f76bc27a0f37b36375f5a9e62 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Mon, 30 Sep 2024 10:33:45 +0530
Subject: [PATCH 06/13] Working test module, but atomicAdd still lowers through
 the umin intrinsic

---
 .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp   | 345 +++++++++---------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   4 +-
 2 files changed, 172 insertions(+), 177 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 8bc3bc81002adf..e4782cb5d9c131 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -682,7 +682,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // lanes that are around only for the purposes of derivatives to take part
   // in any cross-lane communication, and we use a branch on whether the lane is
   // live to do this.
-  if (false) {
+  if (IsPixelShader) {
     // Record I's original position as the entry block.
     PixelEntryBB = I.getParent();
 
@@ -705,35 +705,29 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
   Value *V = I.getOperand(ValIdx);
-  // ------------------------------------
-  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
-  CallInst *const WaveRed =
-      B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
-
-  // ------------------------------------
 
   // We need to know how many lanes are active within the wavefront, and we do
   // this by doing a ballot of active lanes.
-  // Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
-  // CallInst *const Ballot =
-  //     B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  CallInst *const Ballot =
+      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
 
   // We need to know how many lanes are active within the wavefront that are
   // below us. If we counted each lane linearly starting from 0, a lane is
   // below us only if its associated index was less than ours. We do this by
   // using the mbcnt intrinsic.
-  // Value *Mbcnt;
-  // if (ST->isWave32()) {
-  //   Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
-  //                             {Ballot, B.getInt32(0)});
-  // } else {
-  //   Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
-  //   Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
-  //   Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
-  //                             {ExtractLo, B.getInt32(0)});
-  //   Mbcnt =
-  //       B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
-  // }
+  Value *Mbcnt;
+  if (ST->isWave32()) {
+    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+                              {Ballot, B.getInt32(0)});
+  } else {
+    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
+    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
+    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+                              {ExtractLo, B.getInt32(0)});
+    Mbcnt =
+        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
+  }
 
   Function *F = I.getFunction();
   LLVMContext &C = F->getContext();
@@ -751,47 +745,46 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   Value *ExclScan = nullptr;
   Value *NewV = nullptr;
 
-  // const bool NeedResult = !I.use_empty();
-  const bool NeedResult = false;
+  const bool NeedResult = !I.use_empty();
 
   BasicBlock *ComputeLoop = nullptr;
   BasicBlock *ComputeEnd = nullptr;
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
-  if (false) {
-    if (ScanImpl == ScanOptions::DPP) {
-      // First we need to set all inactive invocations to the identity value, so
-      // that they can correctly contribute to the final result.
-      NewV =
-          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-      if (!NeedResult && ST->hasPermLaneX16()) {
-        // On GFX10 the permlanex16 instruction helps us build a reduction
-        // without too many readlanes and writelanes, which are generally bad
-        // for performance.
-        NewV = buildReduction(B, ScanOp, NewV, Identity);
-      } else {
-        NewV = buildScan(B, ScanOp, NewV, Identity);
-        if (NeedResult)
-          ExclScan = buildShiftRight(B, NewV, Identity);
-        // Read the value from the last lane, which has accumulated the values
-        // of each active lane in the wavefront. This will be our new value
-        // which we will provide to the atomic operation.
-        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
-                                 {NewV, LastLaneIdx});
-      }
-      // Finally mark the readlanes in the WWM section.
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
-    } else if (ScanImpl == ScanOptions::Iterative) {
-      // Alternative implementation for scan
-      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
-      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
-      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
-                                                      ComputeLoop, ComputeEnd);
-    } else {
-      llvm_unreachable("Atomic Optimzer is disabled for None strategy");
-    }
-  } else {
+  // if (ValDivergent) {
+  //   if (ScanImpl == ScanOptions::DPP) {
+  //     // First we need to set all inactive invocations to the identity value, so
+  //     // that they can correctly contribute to the final result.
+  //     NewV =
+  //         B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+  //     if (!NeedResult && ST->hasPermLaneX16()) {
+  //       // On GFX10 the permlanex16 instruction helps us build a reduction
+  //       // without too many readlanes and writelanes, which are generally bad
+  //       // for performance.
+  //       NewV = buildReduction(B, ScanOp, NewV, Identity);
+  //     } else {
+  //       NewV = buildScan(B, ScanOp, NewV, Identity);
+  //       if (NeedResult)
+  //         ExclScan = buildShiftRight(B, NewV, Identity);
+  //       // Read the value from the last lane, which has accumulated the values
+  //       // of each active lane in the wavefront. This will be our new value
+  //       // which we will provide to the atomic operation.
+  //       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+  //       NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
+  //                                {NewV, LastLaneIdx});
+  //     }
+  //     // Finally mark the readlanes in the WWM section.
+  //     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
+  //   } else if (ScanImpl == ScanOptions::Iterative) {
+  //     // Alternative implementation for scan
+  //     ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
+  //     ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
+  //     std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
+  //                                                     ComputeLoop, ComputeEnd);
+  //   } else {
+  //     llvm_unreachable("Atomic Optimzer is disabled for None strategy");
+  //   }
+  // } else {
   //   switch (Op) {
   //   default:
   //     llvm_unreachable("Unhandled atomic op");
@@ -800,17 +793,17 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   //   case AtomicRMWInst::Sub: {
   //     // The new value we will be contributing to the atomic operation is the
   //     // old value times the number of active lanes.
-  //     // Value *const Ctpop = B.CreateIntCast(
-  //     //     B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
-  //     // NewV = buildMul(B, V, Ctpop);
+  //     Value *const Ctpop = B.CreateIntCast(
+  //         B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+  //     NewV = buildMul(B, V, Ctpop);
   //     break;
   //   }
   //   case AtomicRMWInst::FAdd:
   //   case AtomicRMWInst::FSub: {
-  //     // Value *const Ctpop = B.CreateIntCast(
-  //     //     B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
-  //     // Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
-  //     // NewV = B.CreateFMul(V, CtpopFP);
+  //     Value *const Ctpop = B.CreateIntCast(
+  //         B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
+  //     Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+  //     NewV = B.CreateFMul(V, CtpopFP);
   //     break;
   //   }
   //   case AtomicRMWInst::And:
@@ -835,12 +828,13 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   //     break;
   //   }
   // }
-  }
 
+
+  NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
   // We only want a single lane to enter our new control flow, and we do this
   // by checking if there are any active lanes below us. Only one lane will
   // have 0 active lanes below us, so that will be the only one to progress.
-  // Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
+  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
 
   // Store I's original basic block before we split the block.
   BasicBlock *const OriginalBB = I.getParent();
@@ -850,8 +844,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // new block such that:
   // entry --> single_lane -\
   //       \------------------> exit
-  // Instruction *const SingleLaneTerminator =
-  //     SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
+  Instruction *const SingleLaneTerminator =
+      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
 
   // At this point, we have split the I's block to allow one lane in wavefront
   // to update the precomputed reduced value. Also, completed the codegen for
@@ -863,27 +857,27 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // single lane done updating the final reduced value.
   BasicBlock *Predecessor = nullptr;
   // if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
-    // Move terminator from I's block to ComputeEnd block.
-    //
-    // OriginalBB is known to have a branch as terminator because
-    // SplitBlockAndInsertIfThen will have inserted one.
-    // BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
-    // B.SetInsertPoint(ComputeEnd);
-    // Terminator->removeFromParent();
-    // B.Insert(Terminator);
-
-    // Branch to ComputeLoop Block unconditionally from the I's block for
-    // iterative approach.
-    // B.SetInsertPoint(OriginalBB);
-    // B.CreateBr(ComputeLoop);
-
-    // Update the dominator tree for new control flow.
-    // SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
-    //     {{DominatorTree::Insert, OriginalBB, ComputeLoop},
-    //      {DominatorTree::Insert, ComputeLoop, ComputeEnd}});
-
-    // We're moving the terminator from EntryBB to ComputeEnd, make sure we move
-    // the DT edges as well.
+  //   // Move terminator from I's block to ComputeEnd block.
+  //   //
+  //   // OriginalBB is known to have a branch as terminator because
+  //   // SplitBlockAndInsertIfThen will have inserted one.
+  //   BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
+  //   B.SetInsertPoint(ComputeEnd);
+  //   Terminator->removeFromParent();
+  //   B.Insert(Terminator);
+
+  //   // Branch to ComputeLoop Block unconditionally from the I's block for
+  //   // iterative approach.
+  //   B.SetInsertPoint(OriginalBB);
+  //   B.CreateBr(ComputeLoop);
+
+  //   // Update the dominator tree for new control flow.
+  //   SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
+  //       {{DominatorTree::Insert, OriginalBB, ComputeLoop},
+  //        {DominatorTree::Insert, ComputeLoop, ComputeEnd}});
+
+  //   // We're moving the terminator from EntryBB to ComputeEnd, make sure we move
+  //   // the DT edges as well.
   //   for (auto *Succ : Terminator->successors()) {
   //     DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
   //     DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
@@ -895,102 +889,103 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // } else {
   //   Predecessor = OriginalBB;
   // }
+  Predecessor = OriginalBB;
   // Move the IR builder into single_lane next.
-  // B.SetInsertPoint(SingleLaneTerminator);
+  B.SetInsertPoint(SingleLaneTerminator);
 
   // Clone the original atomic operation into single lane, replacing the
   // original value with our newly created one.
   Instruction *const NewI = I.clone();
   B.Insert(NewI);
-  NewI->setOperand(ValIdx, WaveRed);
+  NewI->setOperand(ValIdx, NewV);
 
   // Move the IR builder into exit next, and start inserting just before the
   // original instruction.
-  // B.SetInsertPoint(&I);
-
-  // if (NeedResult) {
-  //   // Create a PHI node to get our new atomic result into the exit block.
-  //   PHINode *const PHI = B.CreatePHI(Ty, 2);
-  //   PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
-  //   PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
-
-  //   // We need to broadcast the value who was the lowest active lane (the first
-  //   // lane) to all other lanes in the wavefront.
-  //   Value *BroadcastI = nullptr;
-  //   BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
-
-  //   // Now that we have the result of our single atomic operation, we need to
-  //   // get our individual lane's slice into the result. We use the lane offset
-  //   // we previously calculated combined with the atomic result value we got
-  //   // from the first lane, to get our lane's index into the atomic result.
-  //   Value *LaneOffset = nullptr;
-  //   if (ValDivergent) {
-  //     if (ScanImpl == ScanOptions::DPP) {
-  //       LaneOffset =
-  //           B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
-  //     } else if (ScanImpl == ScanOptions::Iterative) {
-  //       LaneOffset = ExclScan;
-  //     } else {
-  //       llvm_unreachable("Atomic Optimzer is disabled for None strategy");
-  //     }
-  //   } else {
-  //     Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
-  //                                     : B.CreateIntCast(Mbcnt, Ty, false);
-  //     switch (Op) {
-  //     default:
-  //       llvm_unreachable("Unhandled atomic op");
-  //     case AtomicRMWInst::Add:
-  //     case AtomicRMWInst::Sub:
-  //       LaneOffset = buildMul(B, V, Mbcnt);
-  //       break;
-  //     case AtomicRMWInst::And:
-  //     case AtomicRMWInst::Or:
-  //     case AtomicRMWInst::Max:
-  //     case AtomicRMWInst::Min:
-  //     case AtomicRMWInst::UMax:
-  //     case AtomicRMWInst::UMin:
-  //     case AtomicRMWInst::FMin:
-  //     case AtomicRMWInst::FMax:
-  //       LaneOffset = B.CreateSelect(Cond, Identity, V);
-  //       break;
-  //     case AtomicRMWInst::Xor:
-  //       LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
-  //       break;
-  //     case AtomicRMWInst::FAdd:
-  //     case AtomicRMWInst::FSub: {
-  //       LaneOffset = B.CreateFMul(V, Mbcnt);
-  //       break;
-  //     }
-  //     }
-  //   }
-  //   Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
-  //   if (isAtomicFloatingPointTy) {
-  //     // For fadd/fsub the first active lane of LaneOffset should be the
-  //     // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
-  //     // is V * +0.0 which might have the wrong sign or might be nan (if V is
-  //     // inf or nan).
-  //     //
-  //     // For all floating point ops if the in-memory value was a nan then the
-  //     // binop we just built might have quieted it or changed its payload.
-  //     //
-  //     // Correct all these problems by using BroadcastI as the result in the
-  //     // first active lane.
-  //     Result = B.CreateSelect(Cond, BroadcastI, Result);
-  //   }
+  B.SetInsertPoint(&I);
 
-  //   if (IsPixelShader) {
-  //     // Need a final PHI to reconverge to above the helper lane branch mask.
-  //     B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+  if (NeedResult) {
+    // Create a PHI node to get our new atomic result into the exit block.
+    PHINode *const PHI = B.CreatePHI(Ty, 2);
+    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
+    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
+
+    // We need to broadcast the value who was the lowest active lane (the first
+    // lane) to all other lanes in the wavefront.
+    Value *BroadcastI = nullptr;
+    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
+
+    // Now that we have the result of our single atomic operation, we need to
+    // get our individual lane's slice into the result. We use the lane offset
+    // we previously calculated combined with the atomic result value we got
+    // from the first lane, to get our lane's index into the atomic result.
+    Value *LaneOffset = nullptr;
+    if (ValDivergent) {
+      if (ScanImpl == ScanOptions::DPP) {
+        LaneOffset =
+            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+      } else if (ScanImpl == ScanOptions::Iterative) {
+        LaneOffset = ExclScan;
+      } else {
+        llvm_unreachable("Atomic Optimzer is disabled for None strategy");
+      }
+    } else {
+      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
+                                      : B.CreateIntCast(Mbcnt, Ty, false);
+      switch (Op) {
+      default:
+        llvm_unreachable("Unhandled atomic op");
+      case AtomicRMWInst::Add:
+      case AtomicRMWInst::Sub:
+        LaneOffset = buildMul(B, V, Mbcnt);
+        break;
+      case AtomicRMWInst::And:
+      case AtomicRMWInst::Or:
+      case AtomicRMWInst::Max:
+      case AtomicRMWInst::Min:
+      case AtomicRMWInst::UMax:
+      case AtomicRMWInst::UMin:
+      case AtomicRMWInst::FMin:
+      case AtomicRMWInst::FMax:
+        LaneOffset = B.CreateSelect(Cond, Identity, V);
+        break;
+      case AtomicRMWInst::Xor:
+        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
+        break;
+      case AtomicRMWInst::FAdd:
+      case AtomicRMWInst::FSub: {
+        LaneOffset = B.CreateFMul(V, Mbcnt);
+        break;
+      }
+      }
+    }
+    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
+    if (isAtomicFloatingPointTy) {
+      // For fadd/fsub the first active lane of LaneOffset should be the
+      // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
+      // is V * +0.0 which might have the wrong sign or might be nan (if V is
+      // inf or nan).
+      //
+      // For all floating point ops if the in-memory value was a nan then the
+      // binop we just built might have quieted it or changed its payload.
+      //
+      // Correct all these problems by using BroadcastI as the result in the
+      // first active lane.
+      Result = B.CreateSelect(Cond, BroadcastI, Result);
+    }
 
-  //     PHINode *const PHI = B.CreatePHI(Ty, 2);
-  //     PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
-  //     PHI->addIncoming(Result, I.getParent());
-  //     I.replaceAllUsesWith(PHI);
-  //   } else {
-  //     // Replace the original atomic instruction with the new one.
-  //     I.replaceAllUsesWith(Result);
-  //   }
-  // }
+    if (IsPixelShader) {
+      // Need a final PHI to reconverge to above the helper lane branch mask.
+      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+
+      PHINode *const PHI = B.CreatePHI(Ty, 2);
+      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
+      PHI->addIncoming(Result, I.getParent());
+      I.replaceAllUsesWith(PHI);
+    } else {
+      // Replace the original atomic instruction with the new one.
+      I.replaceAllUsesWith(Result);
+    }
+  }
 
   // And delete the original.
   I.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cd057702d6072d..7dfab57ebc6b0d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4873,8 +4873,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         // These operations with a uniform value i.e. SGPR are idempotent.
         // Reduced value will be same as given sgpr.
         // bool IsWave32 = ST.isWave32();
-        unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-        BuildMI(BB, MI, DL, TII->get(MovOpc), DstReg).addReg(SrcReg);
+        // unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
         RetBB = &BB;
         break;
       }

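A side note on the restored ballot/mbcnt sequence above: pulled out on its own, the lane-indexing logic looks like the sketch below. The helper name is made up; Ballot and Int32Ty are the values already computed in optimizeAtomic.

    // Rank of the current lane among the active lanes, i.e. the number of
    // active lanes strictly below it. The first active lane gets 0, which is
    // what the single-lane gate (icmp eq Mbcnt, 0) keys on.
    static Value *countActiveLanesBelow(IRBuilder<> &B, Value *Ballot,
                                        Type *Int32Ty, bool IsWave32) {
      if (IsWave32)
        return B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                                 {Ballot, B.getInt32(0)});
      // Wave64: split the 64-bit ballot and chain mbcnt_lo / mbcnt_hi.
      Value *Lo = B.CreateTrunc(Ballot, Int32Ty);
      Value *Hi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
      Value *Cnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                                     {Lo, B.getInt32(0)});
      return B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {Hi, Cnt});
    }
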
>From 91a569741796eca70f33ba3d132954f0fe564f16 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:33:46 +0530
Subject: [PATCH 07/13] Changes to IntrinsicsAMDGPU.td

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index fc6d13899a809d..b2b6e2039f1725 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2119,6 +2119,18 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
 
+//multiclass AMDGPUWaveReducee {
+//  foreach Opcode = ["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"] in 
+//   def int_amdgcn_wave_reduce_#Opcode : AMDGPUWaveReduce;
+//}
+
+//multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
+//  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
+//    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
+//}
+
+//WaveReduceDefs<["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"]>;
+//list<string> Operations
 
 def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_min : AMDGPUWaveReduce;
@@ -2127,17 +2139,17 @@ def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_max : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_fmax : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_fadd : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_fsub : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_uand : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fand : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_uand : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_fand : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_uor : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_for : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_uor : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_for : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
 
 def int_amdgcn_readfirstlane :

>From d447aa509957f68db0b893d863afd2fe5829e3e5 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:37:56 +0530
Subject: [PATCH 08/13] Changes to AMDGPUAtomicOptimizer.cpp

---
 .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp   | 96 ++++++++++---------
 1 file changed, 52 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index e4782cb5d9c131..76c1feb0d5fe08 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -785,52 +785,60 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   //     llvm_unreachable("Atomic Optimzer is disabled for None strategy");
   //   }
   // } else {
-  //   switch (Op) {
-  //   default:
-  //     llvm_unreachable("Unhandled atomic op");
-
-  //   case AtomicRMWInst::Add:
-  //   case AtomicRMWInst::Sub: {
-  //     // The new value we will be contributing to the atomic operation is the
-  //     // old value times the number of active lanes.
-  //     Value *const Ctpop = B.CreateIntCast(
-  //         B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
-  //     NewV = buildMul(B, V, Ctpop);
-  //     break;
-  //   }
-  //   case AtomicRMWInst::FAdd:
-  //   case AtomicRMWInst::FSub: {
-  //     Value *const Ctpop = B.CreateIntCast(
-  //         B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
-  //     Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
-  //     NewV = B.CreateFMul(V, CtpopFP);
-  //     break;
-  //   }
-  //   case AtomicRMWInst::And:
-  //   case AtomicRMWInst::Or:
-  //   case AtomicRMWInst::Max:
-  //   case AtomicRMWInst::Min:
-  //   case AtomicRMWInst::UMax:
-  //   case AtomicRMWInst::UMin:
-  //   case AtomicRMWInst::FMin:
-  //   case AtomicRMWInst::FMax:
-  //     // These operations with a uniform value are idempotent: doing the atomic
-  //     // operation multiple times has the same effect as doing it once.
-  //     NewV = V;
-  //     break;
-
-  //   case AtomicRMWInst::Xor:
-  //     // The new value we will be contributing to the atomic operation is the
-  //     // old value times the parity of the number of active lanes.
-  //     Value *const Ctpop = B.CreateIntCast(
-  //         B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
-  //     NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
-  //     break;
-  //   }
-  // }
+  // **************************************** Implement from here
+    switch (Op) {
+    // TODO: implement for floats
+    default:
+      llvm_unreachable("Unhandled atomic op");
+
+    case AtomicRMWInst::Add:
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_add, Int32Ty, {V, B.getInt32(0)});
+      break;
+    case AtomicRMWInst::Sub:
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_sub, Int32Ty, {V, B.getInt32(0)});
+      break;
+    case AtomicRMWInst::FAdd:
+    case AtomicRMWInst::FSub: {
+      Value *const Ctpop = B.CreateIntCast(
+          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
+      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+      NewV = B.CreateFMul(V, CtpopFP);
+      break;
+    }
+    case AtomicRMWInst::And:
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_and, Int32Ty, {V, B.getInt32(0)});
+      break;
+    case AtomicRMWInst::Or:
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_or, Int32Ty, {V, B.getInt32(0)});
+      break;
+    case AtomicRMWInst::Xor:
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_xor, Int32Ty, {V, B.getInt32(0)});
+      break;
+    case AtomicRMWInst::Max:
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_max, Int32Ty, {V, B.getInt32(0)});
+      break;
+    case AtomicRMWInst::Min:
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_min, Int32Ty, {V, B.getInt32(0)});
+      break;
+    case AtomicRMWInst::UMax:
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, Int32Ty, {V, B.getInt32(0)});
+      break;
+    case AtomicRMWInst::UMin:
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
+      break;
+    case AtomicRMWInst::FMin:
+    case AtomicRMWInst::FMax:
+      // These operations with a uniform value are idempotent: doing the atomic
+      // operation multiple times has the same effect as doing it once.
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
+      break;
+
+    }
+  
+  // **************************************** Implement to here
 
 
-  NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
+  // NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
   // We only want a single lane to enter our new control flow, and we do this
   // by checking if there are any active lanes below us. Only one lane will
   // have 0 active lanes below us, so that will be the only one to progress.

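The switch added above is a one-to-one map from the AtomicRMW opcode to the wave-reduce intrinsics introduced earlier in the series. A table-style sketch of that mapping is shown below; the helper name is hypothetical, and the float min/max cases, which the patch still routes through umin as a placeholder, are intentionally left out.

    static Intrinsic::ID waveReduceIntrinsicFor(AtomicRMWInst::BinOp Op) {
      switch (Op) {
      case AtomicRMWInst::Add:  return Intrinsic::amdgcn_wave_reduce_add;
      case AtomicRMWInst::Sub:  return Intrinsic::amdgcn_wave_reduce_sub;
      case AtomicRMWInst::And:  return Intrinsic::amdgcn_wave_reduce_and;
      case AtomicRMWInst::Or:   return Intrinsic::amdgcn_wave_reduce_or;
      case AtomicRMWInst::Xor:  return Intrinsic::amdgcn_wave_reduce_xor;
      case AtomicRMWInst::Max:  return Intrinsic::amdgcn_wave_reduce_max;
      case AtomicRMWInst::Min:  return Intrinsic::amdgcn_wave_reduce_min;
      case AtomicRMWInst::UMax: return Intrinsic::amdgcn_wave_reduce_umax;
      case AtomicRMWInst::UMin: return Intrinsic::amdgcn_wave_reduce_umin;
      default:
        llvm_unreachable("no wave reduce mapping for this atomic op");
      }
    }

With such a helper, each integer case in optimizeAtomic would collapse to a single B.CreateIntrinsic(waveReduceIntrinsicFor(Op), Int32Ty, {V, B.getInt32(0)}) call.
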
>From a33398dba7113486feedf3b163d0381a52cd7052 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:38:37 +0530
Subject: [PATCH 09/13] Changes to AMDGPURegisterBankInfo.cpp

---
 .../lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 68b0857fd21504..24c6dc0afbce57 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4846,15 +4846,19 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
       break;
     }
-    case Intrinsic::amdgcn_wave_reduce_umin:
+    case Intrinsic::amdgcn_wave_reduce_add:
+    case Intrinsic::amdgcn_wave_reduce_fadd: 
+    case Intrinsic::amdgcn_wave_reduce_sub: 
+    case Intrinsic::amdgcn_wave_reduce_fsub: 
+    case Intrinsic::amdgcn_wave_reduce_min: 
+    case Intrinsic::amdgcn_wave_reduce_umin: 
+    case Intrinsic::amdgcn_wave_reduce_fmin: 
+    case Intrinsic::amdgcn_wave_reduce_max: 
     case Intrinsic::amdgcn_wave_reduce_umax: 
+    case Intrinsic::amdgcn_wave_reduce_fmax: 
     case Intrinsic::amdgcn_wave_reduce_and: 
     case Intrinsic::amdgcn_wave_reduce_or: 
-    case Intrinsic::amdgcn_wave_reduce_xor: 
-    case Intrinsic::amdgcn_wave_reduce_usub: 
-    case Intrinsic::amdgcn_wave_reduce_sub: 
-    case Intrinsic::amdgcn_wave_reduce_uadd: 
-    case Intrinsic::amdgcn_wave_reduce_add: {
+    case Intrinsic::amdgcn_wave_reduce_xor: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

>From 9b6f7099d2c1b6a2f03ddd4724f2065c8dd7e6d8 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:39:49 +0530
Subject: [PATCH 10/13] Changes to SIISelLowering.cpp

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 67 +++++++++++++++--------
 1 file changed, 44 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7dfab57ebc6b0d..41431ee9ee2824 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5047,8 +5047,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
 
     // Create initail values of induction variable from Exec, Accumulator and
     // insert branch instr to newly created ComputeBlock
-    uint32_t InitalValue =
-        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+    uint32_t InitalValue;
+    switch(Opc){
+      case AMDGPU::S_MIN_U32:
+        InitalValue = std::numeric_limits<uint32_t>::max();
+        break;
+      case AMDGPU::S_MIN_I32:
+        InitalValue = std::numeric_limits<int32_t>::max();
+        break;
+      case AMDGPU::S_MAX_U32:
+        InitalValue = 0;
+        break;
+      case AMDGPU::S_MAX_I32:
+        InitalValue = std::numeric_limits<int32_t>::min();
+        break;
+      case AMDGPU::S_ADD_I32:
+      case AMDGPU::S_SUB_I32:
+      case AMDGPU::S_OR_B32:
+      case AMDGPU::S_XOR_B32:
+        InitalValue = 0x00000000;
+        break;
+      case AMDGPU::S_AND_B32:
+        InitalValue = 0xFFFFFFFF;
+    }
     auto TmpSReg =
         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5114,43 +5135,43 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 
   switch (MI.getOpcode()) {
-  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
-  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_I32:
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
-  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_F32:
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_F32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_F32);
-  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
-  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_I32:
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
-  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_F32:
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_F32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_F32);
-  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
-    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
+  // case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_F32);
-  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
-    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
+  // case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_F32);
-  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
-  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_I32:
-    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
-  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_F32:
-    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
-  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_U32:
-    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
-  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_I32:
-    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
-  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_F32:
+  // case AMDGPU::WAVE_REDUCE_AND_PSEUDO_I32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  // case AMDGPU::WAVE_REDUCE_AND_PSEUDO_F32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
-  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_U32:
+  // case AMDGPU::WAVE_REDUCE_OR_PSEUDO_I32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  // case AMDGPU::WAVE_REDUCE_OR_PSEUDO_F32:
+  //   return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {

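The switch that seeds the induction variable above is picking the identity element of each reduction, i.e. the value that leaves any x unchanged when combined with it. A self-contained sketch of the same choices, with a hypothetical helper name and, like the switch in the patch, without the float opcodes:

    // Identity element per 32-bit scalar reduction opcode: reducing it with
    // any value x gives back x, so it is a safe starting accumulator.
    static uint32_t reductionIdentity(unsigned Opc) {
      switch (Opc) {
      case AMDGPU::S_MIN_U32:
        return std::numeric_limits<uint32_t>::max();                    // min(x, UINT32_MAX) == x
      case AMDGPU::S_MIN_I32:
        return static_cast<uint32_t>(std::numeric_limits<int32_t>::max()); // min(x, INT32_MAX) == x
      case AMDGPU::S_MAX_U32:
        return 0;                                                        // max(x, 0) == x (unsigned)
      case AMDGPU::S_MAX_I32:
        return static_cast<uint32_t>(std::numeric_limits<int32_t>::min()); // max(x, INT32_MIN) == x
      case AMDGPU::S_ADD_I32:   // x + 0 == x
      case AMDGPU::S_SUB_I32:   // x - 0 == x
      case AMDGPU::S_OR_B32:    // x | 0 == x
      case AMDGPU::S_XOR_B32:   // x ^ 0 == x
        return 0;
      case AMDGPU::S_AND_B32:   // x & 0xFFFFFFFF == x
        return std::numeric_limits<uint32_t>::max();
      default:
        llvm_unreachable("unhandled wave reduce opcode");
      }
    }
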
>From 45a02a1d01c6b1629dee41b8bf429d40300abe75 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:40:16 +0530
Subject: [PATCH 11/13] Changes to SIInstructions.td

---
 llvm/lib/Target/AMDGPU/SIInstructions.td | 70 ++++++++++++------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 534b4d2c052482..c5883ff7839033 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -255,32 +255,32 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
     (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
 
 let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
-  def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  def WAVE_REDUCE_MIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
   }
 
-  def WAVE_REDUCE_UMIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  def WAVE_REDUCE_MIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> {
   }
 
-  def WAVE_REDUCE_UMIN_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  def WAVE_REDUCE_MIN_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set f32:$sdst, (int_amdgcn_wave_reduce_fmin f32:$src, i32:$strategy))]> {
   }
 
-  def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  def WAVE_REDUCE_MAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
   }
 
-  def WAVE_REDUCE_UMAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  def WAVE_REDUCE_MAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> {
   }
 
-  def WAVE_REDUCE_UMAX_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  def WAVE_REDUCE_MAX_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set f32:$sdst, (int_amdgcn_wave_reduce_fmax f32:$src, i32:$strategy))]> {
   }
@@ -295,57 +295,57 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
     [(set f32:$sdst, (int_amdgcn_wave_reduce_add f32:$src, i32:$strategy))]> {
   }
   
-  def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> {
-  }
+  //def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  //  (ins VSrc_b32: $src, VSrc_b32:$strategy),
+  //  [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> {
+  //}
   
   def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> {
   }
   
-  def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
-  }
+  //def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  //  (ins VSrc_b32: $src, VSrc_b32:$strategy),
+  // [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
+  //}
 
   def WAVE_REDUCE_SUB_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set f32:$sdst, (int_amdgcn_wave_reduce_fsub f32:$src, i32:$strategy))]> {
   }
   
-  def WAVE_REDUCE_AND_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_uand i32:$src, i32:$strategy))]> {
-  }
-
-  def WAVE_REDUCE_AND_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  def WAVE_REDUCE_AND_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
   }
 
-  def WAVE_REDUCE_AND_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set f32:$sdst, (int_amdgcn_wave_reduce_fand f32:$src, i32:$strategy))]> {
-  }
-  
-  def WAVE_REDUCE_OR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_uor i32:$src, i32:$strategy))]> {
-  }
+  //def WAVE_REDUCE_AND_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  //  (ins VSrc_b32: $src, VSrc_b32:$strategy),
+  //  [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
+  //}
 
-  def WAVE_REDUCE_OR_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  //def WAVE_REDUCE_AND_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  //  (ins VSrc_b32: $src, VSrc_b32:$strategy),
+  //  [(set f32:$sdst, (int_amdgcn_wave_reduce_fand f32:$src, i32:$strategy))]> {
+  //}
+  
+  def WAVE_REDUCE_OR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
   }
 
-  def WAVE_REDUCE_OR_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set f32:$sdst, (int_amdgcn_wave_reduce_for f32:$src, i32:$strategy))]> {
-  }
+  //def WAVE_REDUCE_OR_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  //  (ins VSrc_b32: $src, VSrc_b32:$strategy),
+  //  [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
+  //}
+
+  //def WAVE_REDUCE_OR_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  //  (ins VSrc_b32: $src, VSrc_b32:$strategy),
+  //  [(set f32:$sdst, (int_amdgcn_wave_reduce_for f32:$src, i32:$strategy))]> {
+  //}
   
-  def WAVE_REDUCE_XOR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+  def WAVE_REDUCE_XOR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
     (ins VSrc_b32: $src, VSrc_b32:$strategy),
     [(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> {
   }

>From b5db313b9eb1ff7630887eae82d0568105648281 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 18:29:20 +0530
Subject: [PATCH 12/13] Code cleanup in TableGen files.

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 39 ++++-------------------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  9 ------
 2 files changed, 7 insertions(+), 41 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b2b6e2039f1725..e4d6e4b2f54597 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2119,38 +2119,13 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
 
-//multiclass AMDGPUWaveReducee {
-//  foreach Opcode = ["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"] in 
-//   def int_amdgcn_wave_reduce_#Opcode : AMDGPUWaveReduce;
-//}
-
-//multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
-//  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
-//    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
-//}
-
-//WaveReduceDefs<["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"]>;
-//list<string> Operations
-
-def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_min : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fmin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_max : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fmax : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fadd : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fsub : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_uand : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_fand : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_uor : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_for : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
+multiclass AMDGPUWaveReduceGenerator<list<string> Operations>{
+  foreach Opcode = Operations in
+    def Opcode : AMDGPUWaveReduce; 
+}
+
+defvar Operations = ["umin", "min", "fmin", "umax", "max", "fmax", "add", "fadd", "sub", "fsub", "and", "or", "xor"];
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;
 
 def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 41431ee9ee2824..9816d0d354af80 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4872,8 +4872,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       case AMDGPU::S_OR_B32:{
         // These operations with a uniform value i.e. SGPR are idempotent.
         // Reduced value will be same as given sgpr.
-        // bool IsWave32 = ST.isWave32();
-        // unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
         RetBB = &BB;
         break;
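
The fast path above leans on the identity op(x, x, ..., x) == x for min, max, and, and or, so a uniform (SGPR) input can simply be copied to the destination register. A small illustrative check of that identity (the function name is hypothetical, not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>

// For an idempotent operator, reducing a uniform value over any number of
// active lanes leaves it unchanged: op(x, x, ..., x) == x.
void checkIdempotentReduction(uint32_t X, unsigned ActiveLanes) {
  uint32_t Min = X, Max = X, And = X, Or = X;
  for (unsigned I = 1; I < ActiveLanes; ++I) {
    Min = std::min(Min, X);
    Max = std::max(Max, X);
    And &= X;
    Or |= X;
  }
  assert(Min == X && Max == X && And == X && Or == X);
  (void)Min; (void)Max; (void)And; (void)Or; // keep -Wunused quiet under NDEBUG
}
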
@@ -4970,15 +4968,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             // of Active lanes then the XOR will result in the
             // same value as that in the SGPR. This comes from 
             // the fact that A^A = 0 and A^0 = A.
-
             Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
             auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1);
-// S_MUL_I32
-            // auto MulOp = 
-            // Can you have one float and one int op? I dont think you can, need to handle the float case seperately.  
             BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)  
                 .addReg(SrcReg)
                 .addReg(ParityReg->getOperand(0).getReg())  ;
@@ -4989,7 +4982,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
           case AMDGPU::S_SUB_F32:{
             // TODO --> use 2's complement or subtract from 0 to find the negation of the number.
             Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
-            
             // Take the negation of the source operand.
             auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
             BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
@@ -4997,7 +4989,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                 .addReg(NewAccumulator->getOperand(0).getReg());
             break;
           }
-          // Doubt --> is SSA form still have to be followed for MIR?
           case AMDGPU::S_ADD_U32:
           case AMDGPU::S_ADD_I32:
           case AMDGPU::S_ADD_F32:{
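
The remaining cases in the switch above reduce a uniform input with a closed form instead: add becomes src * n, sub becomes (-src) * n, and xor becomes src * (n & 1), where n is the active-lane count accumulated in NewAccumulator. A hedged host-side model of those identities (the function names are illustrative only):

#include <cstdint>

// N = number of active lanes in the wave, Src = the uniform source value.
uint32_t reduceAddUniform(uint32_t Src, uint32_t N) { return Src * N; }

// Subtraction accumulates -Src once per active lane, hence the negation
// (done via a multiply by -1 in the lowering) followed by the multiply.
uint32_t reduceSubUniform(uint32_t Src, uint32_t N) {
  uint32_t Negated = static_cast<uint32_t>(-1) * Src;
  return Negated * N;
}

// XOR of a value with itself cancels (A ^ A == 0, A ^ 0 == A), so only the
// parity of the active-lane count matters.
uint32_t reduceXorUniform(uint32_t Src, uint32_t N) { return Src * (N & 1); }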

>From 7e9cb1c4f1f4afe02d7c8e2f4270318d89092877 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 8 Oct 2024 10:56:00 +0530
Subject: [PATCH 13/13] Bitcount changes to be copied

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 156 +++++++++++-----------
 1 file changed, 75 insertions(+), 81 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9816d0d354af80..4ffcee15225cd8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4864,10 +4864,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     switch(Opc){
       case AMDGPU::S_MIN_U32:
       case AMDGPU::S_MIN_I32:
-      case AMDGPU::S_MIN_F32:
       case AMDGPU::S_MAX_U32:
       case AMDGPU::S_MAX_I32:
-      case AMDGPU::S_MAX_F32:
       case AMDGPU::S_AND_B32:
       case AMDGPU::S_OR_B32:{
         // These operations with a uniform value i.e. SGPR are idempotent.
@@ -4877,88 +4875,86 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         break;
       }
       case AMDGPU::S_XOR_B32:
-      case AMDGPU::S_ADD_U32:
       case AMDGPU::S_ADD_I32:
-      case AMDGPU::S_ADD_F32:
-      case AMDGPU::S_SUB_U32:
-      case AMDGPU::S_SUB_I32:
-      case AMDGPU::S_SUB_F32:{
-        MachineBasicBlock::iterator I = BB.end();
-        Register SrcReg = MI.getOperand(1).getReg();
+      case AMDGPU::S_SUB_I32:{
+        // MachineBasicBlock::iterator I = BB.end();
+        // Register SrcReg = MI.getOperand(1).getReg();
 
-        // Create Control flow for loop
-        // Split MI's Machine Basic block into For loop
-        auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+        // // Create Control flow for loop
+        // // Split MI's Machine Basic block into For loop
+        // auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
 
-        // Create virtual registers required for lowering.
+        // // Create virtual registers required for lowering.
         const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
         const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
-        Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-        Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+        Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+        // Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
 
-        Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
-        Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-        Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        // Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+        // Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        // Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
 
-        Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+        // Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
         Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
 
         bool IsWave32 = ST.isWave32();
         unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
         unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+        unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
 
         // Create initial values of induction variable from Exec, Accumulator and
         // insert branch instr to newly created ComputeBlock
-        uint32_t InitalValue = 0;
+        // uint32_t InitalValue = 0;
         
-        auto TmpSReg =
-            BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
-        BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-            .addImm(InitalValue);
-        BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
-
-        // Start constructing ComputeLoop
-        I = ComputeLoop->end();
-        auto Accumulator =
-            BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-                .addReg(InitalValReg)
-                .addMBB(&BB);
-        auto ActiveBits =
-            BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-                .addReg(TmpSReg->getOperand(0).getReg())
-                .addMBB(&BB);
-
-        // Perform the computations
-        unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-        auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                      .addReg(ActiveBits->getOperand(0).getReg());
-        auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), CountOfActiveLanesReg)
-                                  .addReg(Accumulator->getOperand(0).getReg())
-                                  .addImm(1);
-
-        // Manipulate the iterator to get the next active lane
-        unsigned BITSETOpc =
-            IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-        auto NewActiveBits =
-            BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-                .addReg(FF1->getOperand(0).getReg())
-                .addReg(ActiveBits->getOperand(0).getReg());
-
-        // Add phi nodes
-        Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
-            .addMBB(ComputeLoop);
-        ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
-            .addMBB(ComputeLoop);
-
-        // Creating branching
-        unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
-        BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-            .addReg(NewActiveBits->getOperand(0).getReg())
-            .addImm(0);
-        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
-            .addMBB(ComputeLoop);
-
-        I = ComputeEnd->begin();
+        auto Exec =
+            BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+        auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
+                                  .addReg(Exec->getOperand(0).getReg());
+
+        // BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
+        //     .addImm(InitalValue);
+        // BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+        // // Start constructing ComputeLoop
+        // I = ComputeLoop->end();
+        // auto Accumulator =
+        //     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+        //         .addReg(InitalValReg)
+        //         .addMBB(&BB);
+        // auto ActiveBits =
+        //     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+        //         .addReg(TmpSReg->getOperand(0).getReg())
+        //         .addMBB(&BB);
+
+        // // Perform the computations
+        // unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+        // auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+        //               .addReg(ActiveBits->getOperand(0).getReg());
+
+        // // Manipulate the iterator to get the next active lane
+        // unsigned BITSETOpc =
+        //     IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+        // auto NewActiveBits =
+        //     BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+        //         .addReg(FF1->getOperand(0).getReg())
+        //         .addReg(ActiveBits->getOperand(0).getReg());
+
+        // // Add phi nodes
+        // Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+        //     .addMBB(ComputeLoop);
+        // ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+        //     .addMBB(ComputeLoop);
+
+        // // Creating branching
+        // unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+        // BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+        //     .addReg(NewActiveBits->getOperand(0).getReg())
+        //     .addImm(0);
+        // BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+        //     .addMBB(ComputeLoop);
+
+        // I = ComputeEnd->begin();
         switch(Opc){
           case AMDGPU::S_XOR_B32:{
             // Performing an XOR operation on a uniform value
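
The change above swaps the per-lane ComputeLoop (S_FF1_I32 to pick a lane, S_BITSET0 to clear it, S_CMP/S_CBRANCH to iterate) for a single S_BCNT1 on a copy of EXEC, since a uniform input only needs the number of active lanes, not a walk over them. A rough equivalence sketch, with std::popcount standing in for S_BCNT1_I32_B32/B64 (the function names are assumptions of the sketch):

#include <bit>
#include <cstdint>

// Old shape: one loop trip per set bit of the exec mask.
uint32_t countLanesLoop(uint64_t Exec) {
  uint32_t Count = 0;
  while (Exec) {
    Exec &= Exec - 1; // clear the lowest set bit (the S_FF1 + S_BITSET0 step)
    ++Count;
  }
  return Count;
}

// New shape: a single bit-count of the exec mask (S_BCNT1).
uint32_t countLanesBcnt(uint64_t Exec) {
  return static_cast<uint32_t>(std::popcount(Exec));
}
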
@@ -4968,38 +4964,36 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             // of Active lanes then the XOR will result in the
             // same value as that in the SGPR. This comes from 
             // the fact that A^A = 0 and A^0 = A.
+
             Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-            auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+
+            auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
                 .addReg(NewAccumulator->getOperand(0).getReg())
                 .addImm(1);
-            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)  
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)  
                 .addReg(SrcReg)
                 .addReg(ParityReg->getOperand(0).getReg())  ;
             break;
           }
-          case AMDGPU::S_SUB_U32:
-          case AMDGPU::S_SUB_I32:
-          case AMDGPU::S_SUB_F32:{
+          case AMDGPU::S_SUB_I32:{
             // TODO --> use 2's complement or subtract from 0 to find the negation of the number.
             Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+            
             // Take the negation of the source operand.
-            auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
-            BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
                 .addReg(InvertedValReg->getOperand(0).getReg())
                 .addReg(NewAccumulator->getOperand(0).getReg());
             break;
           }
-          case AMDGPU::S_ADD_U32:
-          case AMDGPU::S_ADD_I32:
-          case AMDGPU::S_ADD_F32:{
-            auto Opcode = Opc == AMDGPU::S_ADD_U32 || Opc == AMDGPU::S_ADD_I32 ? AMDGPU::S_MUL_I32 : AMDGPU::S_MUL_F32;  
-            BuildMI(*ComputeEnd, I, DL, TII->get(Opcode), DstReg)
+          case AMDGPU::S_ADD_I32:{
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
                 .addReg(SrcReg)
                 .addReg(NewAccumulator->getOperand(0).getReg());
             break;
           }
         }
-        RetBB = ComputeEnd;
+        RetBB = &BB;
       }
     }
   } else {
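
As a concrete sanity check of the straight-line lowering above (the numbers are illustrative, not taken from a test in the patch): with a uniform source of 5 and all 64 lanes of a wave64 active, add reduces to 5 * 64 = 320, sub to -5 * 64 = -320, and xor to 5 * (64 & 1) = 0.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Src = 5;
  const uint32_t ActiveLanes = 64;                            // wave64, all lanes active
  assert(Src * ActiveLanes == 320u);                          // add reduction
  assert(static_cast<int32_t>((0u - Src) * ActiveLanes) == -320); // sub reduction
  assert(Src * (ActiveLanes & 1) == 0u);                      // xor reduction
  return 0;
}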


