<table border="1" cellspacing="0" cellpadding="8">

    <tr>

        <th>Issue</th>

        <td>

            <a href="https://github.com/llvm/llvm-project/issues/101456">101456</a>

        </td>

    </tr>

    <tr>

        <th>Summary</th>

        <td>

            LoopVectorizePass is not working for riscv

        </td>

    </tr>

    <tr>

      <th>Labels</th>

      <td>

            new issue

      </td>

    </tr>

    <tr>

      <th>Assignees</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Reporter</th>

      <td>

          4465464

      </td>

    </tr>

</table>

<pre>

    In the following case, the assembly is generated for both X86 and RISC-V using clang. By using -mllvm -print-after-all to print the IR after all passes, it is observed that X86 generates vectorized IR instructions, but RISC-V does not generate vectorized instructions, and the LoopVectorizePass does not take effect.

The case:

#include &lt;stdlib.h&gt;

#define N 1024

int a[N*2] ;

int b[N*2] ;

int c[N*2] ;

int d[N*2] ;

void example1 () {

  int i;

  for (i=0; i<256; i++){

    a[i] = b[i] + c[i];

  }

}

The compilation commands are:

clang  test_loop.c -O3  --target=x86_64-unknown-linux-gnu  -mllvm -print-after-all 

clang test_loop.c -O3   --target=riscv64-unknown-linux-gnu   -mllvm -print-after-all

For X86, after the LoopVectorizePass, the LLVM IR generates vectorized instructions. However, for RISC-V, after the LoopVectorizePass, the LLVM IR remains unchanged.

The LLVM IR for X86 before and after the LoopVectorizePass is:

; *** IR Dump After InjectTLIMappings on example1 ***

; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable

define dso_local void @example1() local_unnamed_addr #0 {

entry:

  br label %for.body

for.body:                                         ; preds = %entry, %for.body

  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]

  %arrayidx = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %indvars.iv

  %0 = load i32, ptr %arrayidx, align 4, !tbaa !5

  %arrayidx2 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %indvars.iv

  %1 = load i32, ptr %arrayidx2, align 4, !tbaa !5

  %add = add nsw i32 %1, %0

  %arrayidx4 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv

  store i32 %add, ptr %arrayidx4, align 4, !tbaa !5

  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1

  %exitcond.not = icmp eq i64 %indvars.iv.next, 256

  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !9

for.end:                                          ; preds = %for.body

  ret void

}

; *** IR Dump After LoopVectorizePass on example1 ***

; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable

define dso_local void @example1() local_unnamed_addr #0 {

entry:

  br i1 false, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %entry

  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph

  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]

  %0 = add i64 %index, 0

  %1 = add i64 %index, 4

  %2 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %0

  %3 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %1

  %4 = getelementptr inbounds i32, ptr %2, i32 0

  %5 = getelementptr inbounds i32, ptr %2, i32 4

  %wide.load = load <4 x i32>, ptr %4, align 4, !tbaa !5

  %wide.load11 = load <4 x i32>, ptr %5, align 4, !tbaa !5

  %6 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %0

  %7 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %1

  %8 = getelementptr inbounds i32, ptr %6, i32 0

  %9 = getelementptr inbounds i32, ptr %6, i32 4

  %wide.load12 = load <4 x i32>, ptr %8, align 4, !tbaa !5

  %wide.load13 = load <4 x i32>, ptr %9, align 4, !tbaa !5

  %10 = add nsw <4 x i32> %wide.load12, %wide.load

  %11 = add nsw <4 x i32> %wide.load13, %wide.load11

  %12 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %0

  %13 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %1

  %14 = getelementptr inbounds i32, ptr %12, i32 0

  %15 = getelementptr inbounds i32, ptr %12, i32 4

  store <4 x i32> %10, ptr %14, align 4, !tbaa !5

  store <4 x i32> %11, ptr %15, align 4, !tbaa !5

  %index.next = add nuw i64 %index, 8

  %16 = icmp eq i64 %index.next, 256

  br i1 %16, label %middle.block, label %vector.body, !llvm.loop !9

middle.block:                                     ; preds = %vector.body

  br i1 true, label %for.end, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %entry

  %bc.resume.val = phi i64 [ 256, %middle.block ], [ 0, %entry ]

  br label %for.body

for.body:                                         ; preds = %scalar.ph, %for.body

  %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]

  %arrayidx = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %indvars.iv

  %17 = load i32, ptr %arrayidx, align 4, !tbaa !5

  %arrayidx2 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %indvars.iv

  %18 = load i32, ptr %arrayidx2, align 4, !tbaa !5

  %add = add nsw i32 %18, %17

  %arrayidx4 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv

  store i32 %add, ptr %arrayidx4, align 4, !tbaa !5

  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1

  %exitcond.not = icmp eq i64 %indvars.iv.next, 256

  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !13

for.end:                                          ; preds = %middle.block, %for.body

  ret void

}

The LLVM IR for RISC-V before and after the LoopVectorizePass is:

; *** IR Dump After InjectTLIMappings on example1 ***

; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable

define dso_local void @example1() local_unnamed_addr #0 {

  br label %1

1:                                                ; preds = %0, %1

  %2 = phi i64 [ 0, %0 ], [ %9, %1 ]

  %3 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %2

  %4 = load i32, ptr %3, align 4, !tbaa !9

  %5 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %2

  %6 = load i32, ptr %5, align 4, !tbaa !9

  %7 = add nsw i32 %6, %4

  %8 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %2

  store i32 %7, ptr %8, align 4, !tbaa !9

  %9 = add nuw nsw i64 %2, 1

  %10 = icmp eq i64 %9, 256

  br i1 %10, label %11, label %1, !llvm.loop !13

11:                                               ; preds = %1

  ret void

}

; *** IR Dump After LoopVectorizePass on example1 ***

; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable

define dso_local void @example1() local_unnamed_addr #0 {

  br label %1

1:                                                ; preds = %0, %1

  %2 = phi i64 [ 0, %0 ], [ %9, %1 ]

  %3 = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 %2

  %4 = load i32, ptr %3, align 4, !tbaa !9

  %5 = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 %2

  %6 = load i32, ptr %5, align 4, !tbaa !9

  %7 = add nsw i32 %6, %4

  %8 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %2

  store i32 %7, ptr %8, align 4, !tbaa !9

  %9 = add nuw nsw i64 %2, 1

  %10 = icmp eq i64 %9, 256

  br i1 %10, label %11, label %1, !llvm.loop !13

11:                                               ; preds = %1

  ret void

}

It can be observed that the LoopVectorizePass vectorizes the LLVM IR for X86 but leaves it unchanged for RISC-V. Why is the LLVM IR not vectorized for RISC-V?

</pre>

<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsWt1u47oRfhr6ZmBDon5sX_giTjZogD2nxfZg0buAEimbXYl0Scp2-vQFKdmmJXmjpGnR05MgsCWS883wZ76ZMUi05hvB2Aola5Q8TEhttlKt4jhN4jSeZJK-rJ4EmC2DQpalPHCxgZxohvC9ayVasyorX4Br2DDBFDGMQiEVZNJs4W-LFIig8O3pr_fT71BrJ18SsZnB-qV9n1Zlua9gulNcmCkpDFNTUpZgJLgmp-jpG7gesD07q1ZbG7ixmmWmmdozCmZLjFN6skXDnuVGKv5PRi0GF9qoOjdcCief1eZkHJVMg5DmLOuLduXspKxZX6XcfT8N-wvR-gJjyA8GrChYbmYoeEDBXfP525Y1S_jlEa3v0bJtRjjiIi9rygBF99rQkmezLYq-nLspK7hg8CuEAY6bVrs6BCXrXxG-wyh5ABStLz3ZzZ78Zg8d7Gk-95JTYEdS7UoWAsILhJeA5u0YACvPzzLgjgHCC46ihwBFa9t3j5O0ecRr97_05MHNhTeaH5z5zQteO4vtiweP5g-tdeeHywLLasdLYvfLPld2v7hGUTvEHUEAw7R5LqXczXKY_jkCmE4NURtmUPRwXKTPaTytxQ8hD2JaclEfpxtRw83z6kP3kH1oxXW-H8a-Be7P71Eqe8bdMXQuMXgQTx769ev3X-zJH_QI_1jP4E_ywPZMWUm7dY1fvFGNYhXhQkMt8i0RG0Z7Z_80UhZwtPRwG9vfsfYzWgPCd82_BXmoqx3cOYgn8XeWm9--Pv1CdjsuNhqk8E9rK3UBeqyFmzncGaOsJhCyUIyBkIrltdL2Sb-IHISsxYELChWrpHpBeKEYoQfFjSNCojYVqxoA4Vq4IHnOtOZZya66llAfDMlK1pjRujTV8rmUOSnBuRiKg5PdrZO5zudaCFIx-kwotY4VBRfnY8Kol_NiAWQKSpKxEhBOCqlmlsn9hTy3Rc7vRv3ZNdspRrXzToSTRim-7ysB28QF3ROlZ3zvBHZbDjyNASVrCFophwDWre17sr4Wmwl2NB18N_iigyhFXjg9Og0bZphdb2F2RgEXmayFtTZZ4yBewBF4hFtddgCKg8xtVho3BjnrfAM8RYHTUEpCHcoJ4mKBOwgl3wiIG5NDkxFiv5MBe_E7Dc5HGxy-ZjAeaTGlDsl-C32wYA6-3ZdgYHLxOydHxk1OG6nYyQ5C6cDc4nFz6xy1yzzrQzPXngn4HkIPgB25yaWgMxvwrTTPqx2wf_Qlz2fZBsCLk_KwC2PHXLkuE7TX5jytmZcNFzMbaezLsuvjVvgNLt738a5fK2YcSXUj70-Iuc_rf2xi5iEUpGxy6POu6pyURM1226vWJlTbVj8NOzeO39kb3D0QLVr0bsDwm8eq7Sn1QRr66Ezw5JXseDNknCX6YYMd_YjhKesEjeDs6Bc3ZY7Agx6BDo2KvVHvJfLByOOrjz4S2Oesn7Lzdahwz5ZofcOSd8j7K3bglM1cXDoHKBTdx-1coi-e_EgaPyOG4euYyTjM9CMDtL98848E9vd1MX5f0qF9Xb5DfnBfQ_z6LizeurPR65jLcZhhcJXPXKF15tEyybnFRwnHokRdlNDftfC9BDKYLF3x13sZZBD5yuY3UEg4yCHhG0gk7LNIkwH21jwMfLEx5HELKPSBRjLGJQBd5ZHd4LHwlyG9kTR6kWwoXwzTqyyh4pSWbJaVMv8xlD6MyhavQEZG-J8G-CujjarZq5ntJQfy7Lo0_hvZTneFugkQwkmWzxTTdcVme1L2so-W8zpQfgrSL2n_m6X4Vfr45nK8O_0W5Az6u6jQw_nvrkRf_Idr9EW7NeH8s0r_H6rSw-jDy_QBghtbufc-Oz_SfuM633_-TPshvwZcR4PQX7PwTQfgxjE4BaGwV6EOldJBh9eXJ-kOhX9oKYp7pegA_0W3mWE5
thR9K0XjXvU3YNhPcsFlr8jr0PEpg4jHFm1vJWE8yL3zcWXXslcKDnAs7lJrW0x1CHV5M3ENrkiyybM9f3iVLMN3uEnPS8LPHzM_6euTvj7p6_-UvnqfTwZyIiBjnZsyx0UKlaS84EyfUj7U3oo5bN2lHrNlyjLU-dZAc6dCFucU0bsqED1O6Cqiy2hJJmwVzjGOl3GapJPtihY0WOJlMc_zJJhjUmTZfD6PIpbEBQ2KbMJXOMBxsAjCII3jMJ0RFkdhlOEgLHDK8NzyVUV4OXPrK9VmwrWu2SoMwjhJJ24btLvIhLFgB3C9CNuTN1ErKzTN6o1GcVBybfQFxnBTstVQWuvu8Ryk-sHFxs3T3dyY1KpcbY3ZubQXPyL8uOFmW2ezXFYIP1rg9mu6U9Kmuwg_OnM0wo-tvfsV_lcAAAD__3xCd4E">