[llvm] 342642d - [AMDGPU][GISel] Smaller code for scalar 32 to 64-bit extensions
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 15 23:10:36 PST 2022
Author: Jay Foad
Date: 2022-11-16T06:57:21Z
New Revision: 342642dc75e499d865f37825de8918af37e57b65
URL: https://github.com/llvm/llvm-project/commit/342642dc75e499d865f37825de8918af37e57b65
DIFF: https://github.com/llvm/llvm-project/commit/342642dc75e499d865f37825de8918af37e57b65.diff
LOG: [AMDGPU][GISel] Smaller code for scalar 32 to 64-bit extensions
Differential Revision: https://reviews.llvm.org/D107639
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
llvm/test/CodeGen/AMDGPU/ctlz.ll
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
llvm/test/CodeGen/AMDGPU/cttz.ll
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2538d175dde2..02ef7f834e32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2267,6 +2267,29 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}
+ // Using a single 32-bit SALU to calculate the high half is smaller than
+ // S_BFE with a literal constant operand.
+ if (DstSize > 32 && SrcSize == 32) {
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
+ if (Signed) {
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
+ .addReg(SrcReg, 0, SubReg)
+ .addImm(31);
+ } else {
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
+ .addImm(0);
+ }
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(SrcReg, 0, SubReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
+ *MRI);
+ }
+
const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
@@ -2275,7 +2298,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
// We need a 64-bit register source, but the high bits don't matter.
Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
+ unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir
index 24faa2ce2500..0dcbab0c5395 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir
@@ -225,10 +225,9 @@ body: |
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[DEF]], %subreg.sub1
- ; GCN-NEXT: [[S_BFE_I64_:%[0-9]+]]:sreg_64 = S_BFE_I64 [[REG_SEQUENCE]], 2097152, implicit-def $scc
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_BFE_I64_]]
+ ; GCN-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[COPY]].sub0, 31, implicit-def $scc
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]]
%0:sgpr(s64) = COPY $sgpr0_sgpr1
%1:sgpr(s64) = G_SEXT_INREG %0, 32
$sgpr0_sgpr1 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
index 1056cc41cbb8..ec95662cc289 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
@@ -127,10 +127,9 @@ body: |
; GCN: liveins: $sgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1
- ; GCN-NEXT: [[S_BFE_I64_:%[0-9]+]]:sreg_64 = S_BFE_I64 [[REG_SEQUENCE]], 2097152, implicit-def $scc
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_BFE_I64_]]
+ ; GCN-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[COPY]], 31, implicit-def $scc
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s64) = G_SEXT %0
$sgpr0_sgpr1 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
index 86ac8f59d483..8f18f58ed957 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
@@ -127,10 +127,9 @@ body: |
; GCN: liveins: $sgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1
- ; GCN-NEXT: [[S_BFE_U64_:%[0-9]+]]:sreg_64 = S_BFE_U64 [[REG_SEQUENCE]], 2097152, implicit-def $scc
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_BFE_U64_]]
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s64) = G_ZEXT %0
$sgpr0_sgpr1 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 8f5e9b732864..7295442324b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -252,11 +252,11 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4096(i32 addrspace(1)* %ptr) {
define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(i32 addrspace(1)* inreg %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_sgpr_ptr_sgpr_offset:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_ashr_i32 s5, s4, 31
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -266,11 +266,11 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(i32 addrspace(1)* inreg
;
; GFX7-LABEL: mubuf_store_sgpr_ptr_sgpr_offset:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -285,8 +285,8 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(i32 addrspace(1)* inreg
define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -295,8 +295,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(i32 addrspace(1)* %ptr,
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -310,8 +310,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(i32 addrspace(1)* %ptr,
define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -320,8 +320,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -336,8 +336,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(
define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -346,8 +346,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -698,11 +698,11 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(float addrspace(1)* %ptr)
define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inreg %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_sgpr_ptr_sgpr_offset:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_ashr_i32 s5, s4, 31
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_mov_b32_e32 v1, s5
@@ -712,11 +712,11 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inre
;
; GFX7-LABEL: mubuf_load_sgpr_ptr_sgpr_offset:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: v_mov_b32_e32 v1, s5
@@ -731,8 +731,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inre
define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -741,8 +741,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr
;
; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -756,8 +756,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr
define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
@@ -766,8 +766,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
;
; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
@@ -782,8 +782,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
@@ -792,8 +792,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
;
; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 0bd33b99bbca..8f300f2baccc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -120,14 +120,14 @@ define amdgpu_ps i64 @s_shl_i64_zext_i32_overflow(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_zext_i32_overflow:
; GCN: ; %bb.0:
; GCN-NEXT: s_bitset0_b32 s0, 31
-; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
+; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_shl_i64_zext_i32_overflow:
; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b32 s1, 0
; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31
-; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and i32 %x, 2147483647
@@ -187,14 +187,14 @@ define amdgpu_ps i64 @s_shl_i64_sext_i32_overflow(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_sext_i32_overflow:
; GCN: ; %bb.0:
; GCN-NEXT: s_bitset0_b32 s0, 31
-; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000
+; GCN-NEXT: s_ashr_i32 s1, s0, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_shl_i64_sext_i32_overflow:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31
-; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000
+; GFX10PLUS-NEXT: s_ashr_i32 s1, s0, 31
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and i32 %x, 2147483647
@@ -434,9 +434,10 @@ define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) {
; GCN-NEXT: s_brev_b32 s2, -4
; GCN-NEXT: s_mov_b32 s3, s2
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: s_bfe_u64 s[4:5], s[0:1], 0x200000
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s4, s1
+; GCN-NEXT: s_mov_b32 s5, s3
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2
; GCN-NEXT: ; return to shader part epilog
@@ -446,11 +447,12 @@ define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) {
; GFX10PLUS-NEXT: s_brev_b32 s2, -4
; GFX10PLUS-NEXT: s_mov_b32 s3, s2
; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT: s_mov_b32 s2, s1
-; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
-; GFX10PLUS-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000
-; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_mov_b32 s2, s0
+; GFX10PLUS-NEXT: s_mov_b32 s4, s1
+; GFX10PLUS-NEXT: s_mov_b32 s5, s3
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 2
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
%ext = zext <2 x i32> %and to <2 x i64>
@@ -525,9 +527,10 @@ define amdgpu_ps <2 x i64> @s_shl_v2i64_sext_v2i32(<2 x i32> inreg %x) {
; GCN-NEXT: s_brev_b32 s2, -8
; GCN-NEXT: s_mov_b32 s3, s2
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x200000
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: s_bfe_i64 s[4:5], s[0:1], 0x200000
+; GCN-NEXT: s_ashr_i32 s3, s0, 31
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_ashr_i32 s5, s1, 31
+; GCN-NEXT: s_mov_b32 s4, s1
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2
; GCN-NEXT: ; return to shader part epilog
@@ -537,11 +540,12 @@ define amdgpu_ps <2 x i64> @s_shl_v2i64_sext_v2i32(<2 x i32> inreg %x) {
; GFX10PLUS-NEXT: s_brev_b32 s2, -8
; GFX10PLUS-NEXT: s_mov_b32 s3, s2
; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT: s_mov_b32 s2, s1
-; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x200000
-; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX10PLUS-NEXT: s_ashr_i32 s3, s0, 31
+; GFX10PLUS-NEXT: s_mov_b32 s2, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s5, s1, 31
+; GFX10PLUS-NEXT: s_mov_b32 s4, s1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 2
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and <2 x i32> %x, <i32 536870911, i32 536870911>
%ext = sext <2 x i32> %and to <2 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index acf24a66bb15..ddc6734335dc 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -664,11 +664,11 @@ define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32],
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 9c4d7aed5eed..0d02bf8c01a2 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -448,10 +448,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out,
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_mov_b32 s1, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3]
-; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index dd26102ea224..fd8c73926650 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -572,11 +572,11 @@ define amdgpu_kernel void @s_cttz_i64(i64 addrspace(1)* noalias %out, [8 x i32],
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 198c1805778a..ba3ed974a34a 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -549,12 +549,12 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* n
; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_mov_b32 s5, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3]
-; GFX9-GISEL-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
More information about the llvm-commits mailing list