[llvm] d84e5fd - [AMDGPU][GlobalISel] Fix v2s16 right shifts
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 4 09:04:45 PST 2021
Author: Jay Foad
Date: 2021-02-04T17:04:32Z
New Revision: d84e5fdac1a65732556b9e56984c998aad915139
URL: https://github.com/llvm/llvm-project/commit/d84e5fdac1a65732556b9e56984c998aad915139
DIFF: https://github.com/llvm/llvm-project/commit/d84e5fdac1a65732556b9e56984c998aad915139.diff
LOG: [AMDGPU][GlobalISel] Fix v2s16 right shifts
When widening, each half of the v2s16 operands needs to be sign-extended
for G_ASHR or zero-extended for G_LSHR.
Differential Revision: https://reviews.llvm.org/D96048
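
To see why the extension choice matters, here is a minimal standalone C++
sketch (illustrative only, not part of the patch) that mimics widening one
16-bit half of the shift into a 32-bit register. It assumes '>>' on a
signed type is an arithmetic shift, as on all mainstream compilers (and as
C++20 guarantees):

#include <cassert>
#include <cstdint>

int main() {
  uint16_t Val = 0x8000; // -32768 when reinterpreted as i16
  unsigned Amt = 1;

  // G_ASHR: sign extend first, so bit 15 keeps acting as the sign bit.
  int32_t SExt = int16_t(Val);             // 0xFFFF8000
  assert(uint16_t(SExt >> Amt) == 0xC000); // correct i16 ashr

  // G_LSHR: zero extend first, so zeros (not sign copies) shift in.
  uint32_t ZExt = uint32_t(Val);           // 0x00008000
  assert(uint16_t(ZExt >> Amt) == 0x4000); // correct i16 lshr

  // Wrong pairing: zero extending before an arithmetic shift gives
  // 0x4000 where the i16 ashr result should be 0xC000.
  assert(uint16_t(int32_t(uint32_t(Val)) >> Amt) != 0xC000);
  return 0;
}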
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 408c8d96439e..49fb3fa8d9d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1564,9 +1564,11 @@ bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
// Return a suitable opcode for extending the operands of Opc when widening.
static unsigned getExtendOp(unsigned Opc) {
switch (Opc) {
+ case TargetOpcode::G_ASHR:
case TargetOpcode::G_SMIN:
case TargetOpcode::G_SMAX:
return TargetOpcode::G_SEXT;
+ case TargetOpcode::G_LSHR:
case TargetOpcode::G_UMIN:
case TargetOpcode::G_UMAX:
return TargetOpcode::G_ZEXT;
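
(The hunk ends mid-switch. For context, here is a sketch of the whole
helper after this change; the default case is an assumption from the
surrounding code, not shown in the diff. Before this patch G_ASHR and
G_LSHR presumably fell through to that default, leaving the high bits of
the widened operands undefined:

static unsigned getExtendOp(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    return TargetOpcode::G_SEXT;
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
    return TargetOpcode::G_ZEXT;
  default:
    // Assumed fallback: any-extend leaves the widened high bits
    // undefined, which right shifts then pull down into the low
    // 16 bits of the result.
    return TargetOpcode::G_ANYEXT;
  }
}
)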
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index a0032a5940db..be25cc78a18c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -560,11 +560,13 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
;
; GFX9-LABEL: s_ashr_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
+; GFX9-NEXT: s_sext_i32_i16 s2, s0
+; GFX9-NEXT: s_sext_i32_i16 s3, s1
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_ashr_i32 s1, s1, 16
+; GFX9-NEXT: s_ashr_i32 s2, s2, s3
; GFX9-NEXT: s_ashr_i32 s0, s0, s1
-; GFX9-NEXT: s_ashr_i32 s1, s2, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0
; GFX9-NEXT: ; return to shader part epilog
%result = ashr <2 x i16> %value, %amount
%cast = bitcast <2 x i16> %result to i32
@@ -754,16 +756,20 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
;
; GFX9-LABEL: s_ashr_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s4, s0, 16
-; GFX9-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-NEXT: s_sext_i32_i16 s4, s0
+; GFX9-NEXT: s_sext_i32_i16 s5, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-NEXT: s_ashr_i32 s0, s0, s2
-; GFX9-NEXT: s_ashr_i32 s2, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX9-NEXT: s_lshr_b32 s2, s1, 16
-; GFX9-NEXT: s_lshr_b32 s4, s3, 16
-; GFX9-NEXT: s_ashr_i32 s1, s1, s3
+; GFX9-NEXT: s_ashr_i32 s4, s4, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s0
+; GFX9-NEXT: s_sext_i32_i16 s2, s1
+; GFX9-NEXT: s_sext_i32_i16 s4, s3
+; GFX9-NEXT: s_ashr_i32 s1, s1, 16
+; GFX9-NEXT: s_ashr_i32 s3, s3, 16
; GFX9-NEXT: s_ashr_i32 s2, s2, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT: s_ashr_i32 s1, s1, s3
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1
; GFX9-NEXT: ; return to shader part epilog
%result = ashr <4 x i16> %value, %amount
%cast = bitcast <4 x i16> %result to <2 x i32>
@@ -968,26 +974,34 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
;
; GFX9-LABEL: s_ashr_v8i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s8, s0, 16
-; GFX9-NEXT: s_lshr_b32 s9, s4, 16
+; GFX9-NEXT: s_sext_i32_i16 s8, s0
+; GFX9-NEXT: s_sext_i32_i16 s9, s4
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_ashr_i32 s4, s4, 16
; GFX9-NEXT: s_ashr_i32 s0, s0, s4
-; GFX9-NEXT: s_ashr_i32 s4, s8, s9
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX9-NEXT: s_lshr_b32 s4, s1, 16
-; GFX9-NEXT: s_lshr_b32 s8, s5, 16
+; GFX9-NEXT: s_ashr_i32 s8, s8, s9
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s0
+; GFX9-NEXT: s_sext_i32_i16 s4, s1
+; GFX9-NEXT: s_sext_i32_i16 s8, s5
+; GFX9-NEXT: s_ashr_i32 s1, s1, 16
+; GFX9-NEXT: s_ashr_i32 s5, s5, 16
; GFX9-NEXT: s_ashr_i32 s1, s1, s5
; GFX9-NEXT: s_ashr_i32 s4, s4, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NEXT: s_lshr_b32 s5, s6, 16
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
+; GFX9-NEXT: s_sext_i32_i16 s5, s6
+; GFX9-NEXT: s_ashr_i32 s2, s2, 16
+; GFX9-NEXT: s_ashr_i32 s6, s6, 16
; GFX9-NEXT: s_ashr_i32 s4, s4, s5
; GFX9-NEXT: s_ashr_i32 s2, s2, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX9-NEXT: s_lshr_b32 s4, s3, 16
-; GFX9-NEXT: s_lshr_b32 s5, s7, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, s7
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2
+; GFX9-NEXT: s_sext_i32_i16 s4, s3
+; GFX9-NEXT: s_sext_i32_i16 s5, s7
+; GFX9-NEXT: s_ashr_i32 s3, s3, 16
+; GFX9-NEXT: s_ashr_i32 s6, s7, 16
; GFX9-NEXT: s_ashr_i32 s4, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
+; GFX9-NEXT: s_ashr_i32 s3, s3, s6
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3
; GFX9-NEXT: ; return to shader part epilog
%result = ashr <8 x i16> %value, %amount
%cast = bitcast <8 x i16> %result to <4 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 7fca2017bd11..9ce0d28b7623 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -569,10 +569,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
;
; GFX9-LABEL: s_lshr_v2i16:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s3, 0xffff
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
+; GFX9-NEXT: s_lshr_b32 s4, s1, 16
+; GFX9-NEXT: s_and_b32 s0, s0, s3
+; GFX9-NEXT: s_and_b32 s1, s1, s3
; GFX9-NEXT: s_lshr_b32 s0, s0, s1
-; GFX9-NEXT: s_lshr_b32 s1, s2, s3
+; GFX9-NEXT: s_lshr_b32 s1, s2, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
%result = lshr <2 x i16> %value, %amount
@@ -747,13 +750,18 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
;
; GFX9-LABEL: s_lshr_v4i16:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
-; GFX9-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-NEXT: s_lshr_b32 s6, s2, 16
+; GFX9-NEXT: s_and_b32 s0, s0, s5
+; GFX9-NEXT: s_and_b32 s2, s2, s5
; GFX9-NEXT: s_lshr_b32 s0, s0, s2
-; GFX9-NEXT: s_lshr_b32 s2, s4, s5
+; GFX9-NEXT: s_lshr_b32 s2, s4, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
+; GFX9-NEXT: s_and_b32 s1, s1, s5
+; GFX9-NEXT: s_and_b32 s3, s3, s5
; GFX9-NEXT: s_lshr_b32 s1, s1, s3
; GFX9-NEXT: s_lshr_b32 s2, s2, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
@@ -937,24 +945,33 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
;
; GFX9-LABEL: s_lshr_v8i16:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s9, 0xffff
; GFX9-NEXT: s_lshr_b32 s8, s0, 16
-; GFX9-NEXT: s_lshr_b32 s9, s4, 16
+; GFX9-NEXT: s_lshr_b32 s10, s4, 16
+; GFX9-NEXT: s_and_b32 s0, s0, s9
+; GFX9-NEXT: s_and_b32 s4, s4, s9
; GFX9-NEXT: s_lshr_b32 s0, s0, s4
-; GFX9-NEXT: s_lshr_b32 s4, s8, s9
+; GFX9-NEXT: s_lshr_b32 s4, s8, s10
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
; GFX9-NEXT: s_lshr_b32 s8, s5, 16
+; GFX9-NEXT: s_and_b32 s1, s1, s9
+; GFX9-NEXT: s_and_b32 s5, s5, s9
; GFX9-NEXT: s_lshr_b32 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s4, s4, s8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NEXT: s_lshr_b32 s5, s6, 16
-; GFX9-NEXT: s_lshr_b32 s4, s4, s5
+; GFX9-NEXT: s_and_b32 s2, s2, s9
+; GFX9-NEXT: s_and_b32 s6, s6, s9
; GFX9-NEXT: s_lshr_b32 s2, s2, s6
+; GFX9-NEXT: s_lshr_b32 s4, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_lshr_b32 s5, s7, 16
-; GFX9-NEXT: s_lshr_b32 s3, s3, s7
+; GFX9-NEXT: s_and_b32 s3, s3, s9
+; GFX9-NEXT: s_and_b32 s6, s7, s9
+; GFX9-NEXT: s_lshr_b32 s3, s3, s6
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir
index bdfc6a20c940..f21d685ff57b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir
@@ -183,14 +183,16 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
; CHECK: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; CHECK: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST]], 16
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; CHECK: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32)
; CHECK: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; CHECK: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; CHECK: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[BITCAST1]](s32)
- ; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[LSHR]], [[LSHR1]](s32)
- ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR]](s32), [[ASHR1]](s32)
+ ; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
+ ; CHECK: [[ASHR2:%[0-9]+]]:sgpr(s32) = G_ASHR [[SEXT_INREG]], [[SEXT_INREG1]](s32)
+ ; CHECK: [[ASHR3:%[0-9]+]]:sgpr(s32) = G_ASHR [[ASHR]], [[ASHR1]](s32)
+ ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR2]](s32), [[ASHR3]](s32)
; CHECK: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>)
%0:_(<2 x s16>) = COPY $sgpr0
%1:_(<2 x s16>) = COPY $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir
index e5a2f561772f..cd35fa468de8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir
@@ -185,10 +185,14 @@ body: |
; CHECK: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
; CHECK: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]]
; CHECK: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; CHECK: [[LSHR2:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[BITCAST1]](s32)
+ ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
+ ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
+ ; CHECK: [[LSHR2:%[0-9]+]]:sgpr(s32) = G_LSHR [[AND]], [[AND1]](s32)
; CHECK: [[LSHR3:%[0-9]+]]:sgpr(s32) = G_LSHR [[LSHR]], [[LSHR1]](s32)
; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
; CHECK: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>)