[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Combine (sext (trunc (sext_in_reg x))) (PR #131312)

Fri Mar 14 04:19:30 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-globalisel

Author: Pierre van Houtryve (Pierre-vh)

<details>
<summary>Changes</summary>

This is a bit of an awkward pattern that can come up as a result
of legalization and then widening of i16 operations to i32 in RegBankSelect
on AMDGPU.

This quick combine avoids redundant patterns like
```
s_sext_i32_i8 s0, s0
s_sext_i32_i16 s0, s0
s_ashr_i32 s0, s0, s1
```

With this combine, the second sext (`s_sext_i32_i16`) is removed because it is redundant.

---
Full diff: https://github.com/llvm/llvm-project/pull/131312.diff


3 Files Affected:

- (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+11-1) 
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir (+86) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll (+16-62) 


``````````diff

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 3590ab221ad44..9727b86b4be8b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -258,6 +258,14 @@ def sext_trunc_sextload : GICombineRule<
          [{ return Helper.matchSextTruncSextLoad(*${d}); }]),
   (apply [{ Helper.applySextTruncSextLoad(*${d}); }])>;
 
+def sext_trunc_sextinreg : GICombineRule<
+  (defs root:$dst),
+  (match (G_SEXT_INREG $sir, $src, $width),
+         (G_TRUNC $trunc, $sir),
+         (G_SEXT $dst, $trunc),
+         [{ return (MRI.getType(${trunc}.getReg()).getScalarSizeInBits() >= ${width}.getImm()); }]),
+  (apply (GIReplaceReg $dst, $sir))>;
+
 def sext_inreg_of_load_matchdata : GIDefMatchData<"std::tuple<Register, unsigned>">;
 def sext_inreg_of_load : GICombineRule<
   (defs root:$root, sext_inreg_of_load_matchdata:$matchinfo),
@@ -1896,7 +1904,9 @@ def cast_of_cast_combines: GICombineGroup<[
   sext_of_anyext,
   anyext_of_anyext,
   anyext_of_zext,
-  anyext_of_sext
+  anyext_of_sext,
+
+  sext_trunc_sextinreg
 ]>;
 
 def cast_combines: GICombineGroup<[
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir
new file mode 100644
index 0000000000000..d41e5b172efc2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-trunc-sextinreg.mir
@@ -0,0 +1,86 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: trunc_s16_inreg_8
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: trunc_s16_inreg_8
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+    ; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+    %copy:_(s32) = COPY $vgpr0
+    %inreg:_(s32) = G_SEXT_INREG %copy, 8
+    %trunc:_(s16) = G_TRUNC %inreg
+    %sext:_(s32) = G_SEXT %trunc
+    $vgpr0 = COPY %sext
+...
+
+---
+name: trunc_s16_inreg_16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: trunc_s16_inreg_16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
+    ; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+    %copy:_(s32) = COPY $vgpr0
+    %inreg:_(s32) = G_SEXT_INREG %copy, 16
+    %trunc:_(s16) = G_TRUNC %inreg
+    %sext:_(s32) = G_SEXT %trunc
+    $vgpr0 = COPY %sext
+...
+
+---
+name: trunc_s8_inreg_16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: trunc_s8_inreg_16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 16
+    ; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
+    ; CHECK-NEXT: %sext:_(s32) = G_SEXT %trunc(s8)
+    ; CHECK-NEXT: $vgpr0 = COPY %sext(s32)
+    %copy:_(s32) = COPY $vgpr0
+    %inreg:_(s32) = G_SEXT_INREG %copy, 16
+    %trunc:_(s8) = G_TRUNC %inreg
+    %sext:_(s32) = G_SEXT %trunc
+    $vgpr0 = COPY %sext
+...
+
+# TODO?: We could handle this by inserting a trunc, but I'm not sure how useful that'd be.
+---
+name: mismatching_types
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: mismatching_types
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %copy:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %copy, 8
+    ; CHECK-NEXT: %trunc:_(s8) = G_TRUNC %inreg(s32)
+    ; CHECK-NEXT: %sext:_(s16) = G_SEXT %trunc(s8)
+    ; CHECK-NEXT: %anyext:_(s32) = G_ANYEXT %sext(s16)
+    ; CHECK-NEXT: $vgpr0 = COPY %anyext(s32)
+    %copy:_(s32) = COPY $vgpr0
+    %inreg:_(s32) = G_SEXT_INREG %copy, 8
+    %trunc:_(s8) = G_TRUNC %inreg
+    %sext:_(s16) = G_SEXT %trunc
+    %anyext:_(s32) = G_ANYEXT %sext
+    $vgpr0 = COPY %anyext
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 8c687d85ac24b..7ec27f47578c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -197,33 +197,13 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
 }
 
 define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
-; GFX6-LABEL: abs_sgpr_v2i8:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sext_i32_i8 s0, s0
-; GFX6-NEXT:    s_sext_i32_i8 s1, s1
-; GFX6-NEXT:    s_abs_i32 s0, s0
-; GFX6-NEXT:    s_abs_i32 s1, s1
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: abs_sgpr_v2i8:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i8 s0, s0
-; GFX8-NEXT:    s_sext_i32_i8 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_abs_i32 s0, s0
-; GFX8-NEXT:    s_abs_i32 s1, s1
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: abs_sgpr_v2i8:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10-NEXT:    s_sext_i32_i8 s1, s1
-; GFX10-NEXT:    s_sext_i32_i16 s0, s0
-; GFX10-NEXT:    s_sext_i32_i16 s1, s1
-; GFX10-NEXT:    s_abs_i32 s0, s0
-; GFX10-NEXT:    s_abs_i32 s1, s1
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX-LABEL: abs_sgpr_v2i8:
+; GFX:       ; %bb.0:
+; GFX-NEXT:    s_sext_i32_i8 s0, s0
+; GFX-NEXT:    s_sext_i32_i8 s1, s1
+; GFX-NEXT:    s_abs_i32 s0, s0
+; GFX-NEXT:    s_abs_i32 s1, s1
+; GFX-NEXT:    ; return to shader part epilog
   %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
   ret <2 x i8> %res
 }
@@ -268,41 +248,15 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
 }
 
 define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
-; GFX6-LABEL: abs_sgpr_v3i8:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sext_i32_i8 s0, s0
-; GFX6-NEXT:    s_sext_i32_i8 s1, s1
-; GFX6-NEXT:    s_sext_i32_i8 s2, s2
-; GFX6-NEXT:    s_abs_i32 s0, s0
-; GFX6-NEXT:    s_abs_i32 s1, s1
-; GFX6-NEXT:    s_abs_i32 s2, s2
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: abs_sgpr_v3i8:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i8 s0, s0
-; GFX8-NEXT:    s_sext_i32_i8 s1, s1
-; GFX8-NEXT:    s_sext_i32_i8 s2, s2
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_abs_i32 s0, s0
-; GFX8-NEXT:    s_abs_i32 s1, s1
-; GFX8-NEXT:    s_abs_i32 s2, s2
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: abs_sgpr_v3i8:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10-NEXT:    s_sext_i32_i8 s1, s1
-; GFX10-NEXT:    s_sext_i32_i8 s2, s2
-; GFX10-NEXT:    s_sext_i32_i16 s0, s0
-; GFX10-NEXT:    s_sext_i32_i16 s1, s1
-; GFX10-NEXT:    s_sext_i32_i16 s2, s2
-; GFX10-NEXT:    s_abs_i32 s0, s0
-; GFX10-NEXT:    s_abs_i32 s1, s1
-; GFX10-NEXT:    s_abs_i32 s2, s2
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX-LABEL: abs_sgpr_v3i8:
+; GFX:       ; %bb.0:
+; GFX-NEXT:    s_sext_i32_i8 s0, s0
+; GFX-NEXT:    s_sext_i32_i8 s1, s1
+; GFX-NEXT:    s_sext_i32_i8 s2, s2
+; GFX-NEXT:    s_abs_i32 s0, s0
+; GFX-NEXT:    s_abs_i32 s1, s1
+; GFX-NEXT:    s_abs_i32 s2, s2
+; GFX-NEXT:    ; return to shader part epilog
   %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
   ret <3 x i8> %res
 }

``````````

</details>


https://github.com/llvm/llvm-project/pull/131312