[llvm] [LoopVectorize] Peek through bitcasts when performing CSE (PR #146856)

Fri Jul 18 08:36:46 PDT 2025

pedroclobo wrote:

This patch fixes a regression in the IR generated by the [prototype introducing the byte type](https://github.com/pedroclobo/llvm-project/tree/byte-type): in the byte type version, the following `shuffle + bitcast` pairs were blocked from getting CSE'd. The prototype was developed as part of [this GSoC project](https://summerofcode.withgoogle.com/programs/2025/projects/F3yLFcWR).

```llvm
; base version of clang
vector.ph926: 
  %n.vec928 = and i64 %53, -16
  %broadcast.splatinsert929 = insertelement <8 x i8> poison, i8 %50, i64 0   
  %broadcast.splatinsert931 = insertelement <8 x i8> poison, i8 %49, i64 0   
  %invariant.gep982 = getelementptr i8, ptr %dest.6.1660, i64 16
  %interleaved.vec937 = shufflevector <8 x i8> %broadcast.splatinsert931, <8 x i8> %broadcast.splatinsert929, ...
  br label %vector.body933

; clang with byte type
vector.ph942:
  %n.vec944 = and i64 %53, -16 
  %broadcast.splatinsert945 = insertelement <8 x b8> poison, b8 %50, i64 0
  %broadcast.splatinsert947 = insertelement <8 x i8> poison, i8 %49, i64 0
  %broadcast.splat948 = shufflevector <8 x i8> %broadcast.splatinsert947, <8 x i8> poison, <8 x i32> zeroinitializer 
  %invariant.gep998 = getelementptr i8, ptr %dest.6.1676, i64 16
  %54 = bitcast <8 x i8> %broadcast.splat948 to <8 x b8>
  %interleaved.vec953 = shufflevector <8 x b8> %54, <8 x b8> %broadcast.splatinsert945, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, …>
  %55 = bitcast <8 x i8> %broadcast.splat948 to <8 x b8> 
  %interleaved.vec954 = shufflevector <8 x b8> %55, <8 x b8> %broadcast.splatinsert945, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, …>
  %next.gep935 = getelementptr i8, ptr %dest.6.1668, i64 %54
  br label %vector.body949
```

The following original reduced test case was adapted to not include the byte type.

```llvm
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define fastcc i32 @Decompress_Sequences(i8 %0) {
entry:
  br label %do.body93.i705

do.body93.i705:                                   ; preds = %do.body93.i705, %entry
  %dest.10.i706 = phi ptr [ null, %entry ], [ %add.ptr97.i709, %do.body93.i705 ]
  %len.addr.8.i707 = phi i64 [ 0, %entry ], [ %sub99.i710, %do.body93.i705 ]
  store i8 %0, ptr %dest.10.i706, align 1
  %arrayidx96.i708 = getelementptr i8, ptr %dest.10.i706, i64 1
  store i8 0, ptr %arrayidx96.i708, align 1
  %add.ptr97.i709 = getelementptr i8, ptr %dest.10.i706, i64 2
  %sub99.i710 = add i64 %len.addr.8.i707, -2
  %tobool100.not.i711 = icmp eq i64 %sub99.i710, 0
  br i1 %tobool100.not.i711, label %for.end.loopexit909, label %do.body93.i705

for.end.loopexit909:                              ; preds = %do.body93.i705
  ret i32 0
}
```

The patch does not seem to have any significant impact on compile time, [as reported by the compile time tracker](https://llvm-compile-time-tracker.com/compare.php?from=82c0a53763bc39f978d39c79b17e20ae1b57748d&to=98fdac6ebfd5a247753b0f2d3125c6a8d91c266f&stat=instructions:u).

https://github.com/llvm/llvm-project/pull/146856