[llvm-dev] memcmp code fragment

Fri May 19 12:46:17 PDT 2017

Hi,

Look at the following code:

Look at the following C code  seqence:

unsigned char mainGtU ( unsigned int i1,
               unsigned int i2,
               unsigned char* block)
{
   unsigned char c1, c2;
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;

   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;

..
..
<repeat 12 times>

In LLVM IR it will be following:

define i8 @mainGtU(i32 %i1, i32 %i2, i8* readonly %block, i16* nocapture
readnone %quadrant, i32 %nblock, i32* nocapture readnone %budget)
local_unnamed_addr #0 {
entry:
  %idxprom = zext i32 %i1 to i64
  %arrayidx = getelementptr inbounds i8, i8* %block, i64 %idxprom
  %0 = load i8, i8* %arrayidx, align 1
  %idxprom1 = zext i32 %i2 to i64
  %arrayidx2 = getelementptr inbounds i8, i8* %block, i64 %idxprom1
  %1 = load i8, i8* %arrayidx2, align 1
  %cmp = icmp eq i8 %0, %1
  br i1 %cmp, label %if.end, label %if.then

if.then:                                          ; preds = %entry
  %cmp7 = icmp ugt i8 %0, %1
  br label %return

if.end:                                           ; preds = %entry
  %inc = add i32 %i1, 1
  %inc10 = add i32 %i2, 1
  %idxprom11 = zext i32 %inc to i64
  %arrayidx12 = getelementptr inbounds i8, i8* %block, i64 %idxprom11
  %2 = load i8, i8* %arrayidx12, align 1
  %idxprom13 = zext i32 %inc10 to i64
  %arrayidx14 = getelementptr inbounds i8, i8* %block, i64 %idxprom13
  %3 = load i8, i8* %arrayidx14, align 1
  %cmp17 = icmp eq i8 %2, %3
  br i1 %cmp17, label %if.end25, label %if.then19

if.then19:                                        ; preds = %if.end
  %cmp22 = icmp ugt i8 %2, %3
  br label %return

..
..
<repeats 12 times>

This code sequence can be collapsed into call to  memcmp and we can get rid
of basic blocks. I have written a small peephole optimization for squenece
of instructions that identifies
branch termiantor, compare, load, gep etc and converts them to a call to
memcmp. This small pass gave me improvement of 67% on SPEC2000 bzip2 on X86.

Is there a better idea, other than small peephole pass on IR to optimize
this code?

Sirish
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170519/0880884b/attachment.html>