[PATCH] D109671: [AMDGPU] Add BFE pattern matches for constrained shifts.

Sun Sep 12 21:57:54 PDT 2021

abinavpp created this revision.
abinavpp added reviewers: arsenm, foad.
Herald added subscribers: kerbowa, hiraditya, t-tye, Anastasia, tpr, dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl.
abinavpp requested review of this revision.
Herald added subscribers: llvm-commits, wdng.
Herald added a project: LLVM.

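Commit 042e0883cbcd35641d60fd2d22105ac5c6a402f8 made clang conform to the
OpenCL C shift-operator rules, under which only the low log2(N) bits of the
shift amount are used when shifting an N-bit value:
https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_C.html#operators-shift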

For example, `clang -x c ...` can emit:

  %sub = sub i32 32, %bits
  %shl = shl i32 %a, %sub

The equivalent `clang -x cl ...` after optimization can emit:

  %sub = sub i32 0, %bits
  %shl.mask = and i32 %sub, 31
  %shl = shl i32 %a, %shl.mask

This change adds BFE pattern matches for the masked shift amount that
`-x cl` produces, so that these shift pairs still select V_BFE_U32/V_BFE_I32.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D109671

Files:
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/test/CodeGen/AMDGPU/bfe-patterns.ll


Index: llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -19,6 +19,16 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}shl_mask:
+; GCN: v_bfe_i32 v0, v0, 0, v1
+define i32 @shl_mask(i32 %a, i32 %bits) {
+  %sub = sub i32 0, %bits
+  %shl.mask = and i32 %sub, 31
+  %shl = shl i32 %a, %shl.mask
+  %shr = ashr i32 %shl, %shl.mask
+  ret i32 %shr
+}
+
 ; GCN-LABEL: {{^}}v_ubfe_sub_multi_use_shl_i32:
 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2577,6 +2577,23 @@
   (V_BFE_I32_e64 $src, (i32 0), $width)
 >;
 
+// An OpenCL front-end that follows the shift rules in
+// https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_C.html#operators-shift
+// can emit (and (sub 32, i32:$x), 31) instead of (sub 32, i32:$x) as the
+// second operand of a 32-bit shift expression. The optimizer can then
+// transform this operand to (and (sub 0, i32:$x), 31).
+def : AMDGPUPat <
+  (DivergentBinFrag<srl> (shl_oneuse i32:$src, (and (sub 0, i32:$width), 31)),
+                         (and (sub 0, i32:$width), 31)),
+  (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+def : AMDGPUPat <
+  (DivergentBinFrag<sra> (shl_oneuse i32:$src, (and (sub 0, i32:$width), 31)),
+                         (and (sub 0, i32:$width), 31)),
+  (V_BFE_I32_e64 $src, (i32 0), $width)
+>;
+
 // SHA-256 Ma patterns
 
 // ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D109671.372161.patch
Type: text/x-patch
Size: 1747 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210913/cbcd703c/attachment.bin>