[llvm] fde2aef - [AMDGPU] Use SDWA for 16 bit subreg copy

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 17 11:45:54 PDT 2020


Author: Stanislav Mekhanoshin
Date: 2020-04-17T11:45:44-07:00
New Revision: fde2aefa22b9be803628888a21067288c8e2636d

URL: https://github.com/llvm/llvm-project/commit/fde2aefa22b9be803628888a21067288c8e2636d
DIFF: https://github.com/llvm/llvm-project/commit/fde2aefa22b9be803628888a21067288c8e2636d.diff

LOG: [AMDGPU] Use SDWA for 16 bit subreg copy

This simplifies the logic and allows it to be used on GFX8.

Differential Revision: https://reviews.llvm.org/D78150

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6e5907b12510..92d0440a5806 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -683,16 +683,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
            AMDGPU::VGPR_HI16RegClass.contains(SrcReg));
 
-    //          d          s
-    // l -> l : hhhhxxxx : xxxxllll -> v_alignbyte_b32 d, s, d, 2
-    //          llllhhhh : xxxxllll -> v_alignbyte_b32 d, d, d, 2
-    // l -> h : xxxxllll : xxxxhhhh -> v_lshlrev_b32 d, 16, d
-    //          llll0000 : xxxxhhhh -> v_alignbyte_b32 d, s, d, 2
-    // h -> l : hhhhxxxx : llllxxxx -> v_lshrrev_b32 d, 16, d
-    //          0000hhhh : llllxxxx -> v_alignbyte_b32 d, d, s, 2
-    // h -> h : xxxxllll : hhhhxxxx -> v_alignbyte_b32 d, d, s, 2
-    //          llllhhhh : hhhhxxxx -> v_alignbyte_b32 d, d, d, 2
-
     bool DstLow = RC == &AMDGPU::VGPR_LO16RegClass;
     bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg);
     DestReg = RI.getMatchingSuperReg(DestReg,
@@ -702,49 +692,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                     SrcLow ? AMDGPU::lo16 : AMDGPU::hi16,
                                     &AMDGPU::VGPR_32RegClass);
 
-    if (DestReg == SrcReg) {
-      // l -> h : v_pk_add_u16 v1, v1, 0 op_sel_hi:[0,0]
-      // h -> l : v_pk_add_u16 v1, v1, 0 op_sel:[1,0] op_sel_hi:[1,0]
-      if (DstLow == SrcLow)
-        return;
-      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_ADD_U16), DestReg)
-        .addImm(DstLow ? SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1 : 0)
-        .addReg(DestReg, RegState::Undef)
-        .addImm(0) // src1_mod
-        .addImm(0) // src1
-        .addImm(0)
-        .addImm(0)
-        .addImm(0)
-        .addImm(0)
-        .addImm(0);
-
-      return;
-    }
-
-    // Last instruction first:
-    auto Last = BuildMI(MBB, MI, DL, get(AMDGPU::V_ALIGNBYTE_B32), DestReg)
-      .addReg((SrcLow && !DstLow) ? SrcReg : DestReg,
-              (SrcLow && !DstLow) ? getKillRegState(KillSrc) : 0)
-      .addReg((!SrcLow && DstLow) ? SrcReg : DestReg,
-              (!SrcLow && DstLow) ? getKillRegState(KillSrc) : 0)
-      .addImm(2);
-
-    unsigned OpcFirst = (DstLow == SrcLow) ? AMDGPU::V_ALIGNBYTE_B32
-                                           : SrcLow ? AMDGPU::V_LSHRREV_B32_e32
-                                                    : AMDGPU::V_LSHLREV_B32_e32;
-    auto First = BuildMI(MBB, &*Last, DL, get(OpcFirst), DestReg);
-    if (DstLow == SrcLow) { // alignbyte
-      First
-          .addReg(SrcLow ? SrcReg : DestReg,
-                  SrcLow ? getKillRegState(KillSrc) : unsigned(RegState::Undef))
-          .addReg(SrcLow ? DestReg : SrcReg,
-                  SrcLow ? unsigned(RegState::Undef) : getKillRegState(KillSrc))
-          .addImm(2);
-    } else {
-      First.addImm(16)
-           .addReg(DestReg, RegState::Undef);
-    }
-
+    auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), DestReg)
+      .addImm(0) // src0_modifiers
+      .addReg(SrcReg)
+      .addImm(0) // clamp
+      .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
+                     : AMDGPU::SDWA::SdwaSel::WORD_1)
+      .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
+      .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
+                     : AMDGPU::SDWA::SdwaSel::WORD_1)
+      .addReg(DestReg, RegState::Implicit | RegState::Undef);
+    // First implicit operand is $exec.
+    MIB->tieOperands(0, MIB->getNumOperands() - 1);
     return;
   }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir b/llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir
index 79d2f48421fd..f5c507be361d 100644
--- a/llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir
@@ -1,8 +1,9 @@
+# RUN: llc -march=amdgcn -mcpu=gfx802 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
 # RUN: llc -march=amdgcn -mcpu=gfx900 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s
 
 # GCN-LABEL: {{^}}lo_to_lo:
-# GCN:      v_alignbyte_b32 v1, v0, v1, 2
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 name: lo_to_lo
 tracksRegLiveness: true
 body:             |
@@ -13,8 +14,7 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}lo_to_hi:
-# GCN:      v_lshrrev_b32_e32 v1, 16, v1
-# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
+# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 name: lo_to_hi
 tracksRegLiveness: true
 body:             |
@@ -25,8 +25,7 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}hi_to_lo:
-# GCN:      v_lshlrev_b32_e32 v1, 16, v1
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 name: hi_to_lo
 tracksRegLiveness: true
 body:             |
@@ -37,8 +36,7 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}hi_to_hi:
-# GCN:      v_alignbyte_b32 v1, v1, v0, 2
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 name: hi_to_hi
 tracksRegLiveness: true
 body:             |
@@ -49,8 +47,9 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}lo_to_lo_samereg:
-# GCN:      s_waitcnt
-# GCN-NEXT: s_endpgm
+# GCN:        s_waitcnt
+# GFX10-NEXT: s_waitcnt_vscnt
+# GCN-NEXT:   s_endpgm
 name: lo_to_lo_samereg
 tracksRegLiveness: true
 body:             |
@@ -61,7 +60,7 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}lo_to_hi_samereg:
-# GCN: v_pk_add_u16 v0, v0, 0 op_sel_hi:[0,0]
+# GCN: v_mov_b32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 name: lo_to_hi_samereg
 tracksRegLiveness: true
 body:             |
@@ -72,7 +71,7 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}hi_to_lo_samereg:
-# GCN: v_pk_add_u16 v0, v0, 0 op_sel:[1,0] op_sel_hi:[1,0]
+# GCN: v_mov_b32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 name: hi_to_lo_samereg
 tracksRegLiveness: true
 body:             |
@@ -84,6 +83,7 @@ body:             |
 
 # GCN-LABEL: {{^}}hi_to_hi_samereg:
 # GCN:      s_waitcnt
+# GFX10-NEXT: s_waitcnt_vscnt
 # GCN-NEXT: s_endpgm
 name: hi_to_hi_samereg
 tracksRegLiveness: true
@@ -95,8 +95,7 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}lo_to_lo_def_livein:
-# GCN:      v_alignbyte_b32 v1, v0, v1, 2
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 name: lo_to_lo_def_livein
 tracksRegLiveness: true
 body:             |
@@ -109,8 +108,7 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}lo_to_hi_def_livein:
-# GCN:      v_lshrrev_b32_e32 v1, 16, v1
-# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
+# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 name: lo_to_hi_def_livein
 tracksRegLiveness: true
 body:             |
@@ -123,8 +121,7 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}hi_to_lo_def_livein:
-# GCN:      v_lshlrev_b32_e32 v1, 16, v1
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 name: hi_to_lo_def_livein
 tracksRegLiveness: true
 body:             |
@@ -137,8 +134,7 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}hi_to_hi_def_livein:
-# GCN:      v_alignbyte_b32 v1, v1, v0, 2
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 name: hi_to_hi_def_livein
 tracksRegLiveness: true
 body:             |
@@ -152,10 +148,8 @@ body:             |
 
 # TODO: This can be coalesced into a VGPR_32 copy
 # GCN-LABEL: {{^}}lo_to_lo_hi_to_hi:
-# GCN:      v_alignbyte_b32 v1, v0, v1, 2
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+# GCN:      v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+# GCN-NEXT: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 # GCN-NEXT: v_mov_b32_e32 v2, v1
 # GCN-NEXT: s_endpgm
 name: lo_to_lo_hi_to_hi
@@ -170,10 +164,8 @@ body:             |
 ...
 
 # GCN-LABEL: {{^}}lo_to_hi_hi_to_lo:
-# GCN:      v_lshlrev_b32_e32 v1, 16, v1
-# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
-# GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
+# GCN:      v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+# GCN-NEXT: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 # GCN-NEXT: v_mov_b32_e32 v2, v1
 # GCN-NEXT: s_endpgm
 name: lo_to_hi_hi_to_lo
@@ -189,9 +181,10 @@ body:             |
 
 # NB: copy of undef just killed instead of expansion
 # GCN-LABEL: {{^}}lo_to_lo_undef:
-# GCN:      s_waitcnt
-# GCN-NEXT: v_mov_b32_e32 v2, v1
-# GCN-NEXT: s_endpgm
+# GCN:        s_waitcnt
+# GFX10-NEXT: s_waitcnt_vscnt
+# GCN-NEXT:   v_mov_b32_e32 v2, v1
+# GCN-NEXT:   s_endpgm
 name: lo_to_lo_undef
 tracksRegLiveness: true
 body:             |


        


More information about the llvm-commits mailing list