<div dir="ltr">Hello, I would like some help understanding the Intel KNL assembly generated for a matrix multiplication code. Some of the things are not clear.<div><br></div><div>Here is the .c file:</div><div><br></div><div><div>#include &lt;stdio.h&gt;</div><div>#define N 1000</div><div> </div><div>// This function multiplies A[][] and B[][], and stores</div><div>// the result in C[][]</div><div>void multiply(int A[][N], int B[][N], int C[][N])</div><div>{</div><div>    int i, j, k, r;</div><div>    for (i = 0; i &lt; N; i++)</div><div>    {</div><div>        for (j = 0; j &lt; N; j++)</div><div>        {</div><div>            r = 0;</div><div>            for (k = 0; k &lt; N; k++) {</div><div>                r += A[i][k]*B[k][j];}</div><div>                C[i][j] = r;</div><div><br></div><div>        }</div><div>            </div><div>    }</div><div>}</div><div> </div></div><div>Here is the .s file: <font color="#ff0000"><b> the code that I want to ask about is in red.</b></font></div><div><br></div><div><div><span style="white-space:pre">      </span>.text</div><div><span style="white-space:pre"> </span>.intel_syntax noprefix</div><div><span style="white-space:pre">        </span>.file<span style="white-space:pre">        </span>"matn_o3.ll"</div><div><span style="white-space:pre">        </span>.section<span style="white-space:pre">     </span>.rodata,"a",@progbits</div><div><span style="white-space:pre">       </span>.p2align<span style="white-space:pre">     </span>6</div><div>.LCPI0_0:</div><div><span style="white-space:pre">     </span>.quad<span style="white-space:pre">        </span>8                       # 0x8</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>9                       # 0x9</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>10                      # 0xa</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>11                      # 
0xb</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>12                      # 0xc</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>13                      # 0xd</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>14                      # 0xe</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>15                      # 0xf</div><div>.LCPI0_1:</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>0                       # 0x0</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>1                       # 0x1</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>2                       # 0x2</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>3                       # 0x3</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>4                       # 0x4</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>5                       # 0x5</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>6                       # 0x6</div><div><span style="white-space:pre">      </span>.quad<span style="white-space:pre">        </span>7                       # 0x7</div><div><span style="white-space:pre">      </span>.section<span style="white-space:pre">     </span>.rodata.cst8,"aM",@progbits,8</div><div><span style="white-space:pre">       </span>.p2align<span style="white-space:pre">     </span>3</div><div>.LCPI0_2:</div><div><span style="white-space:pre">     </span>.quad<span style="white-space:pre">        </span>4000               
     # 0xfa0</div><div>.LCPI0_3:</div><div><span style="white-space:pre">     </span>.quad<span style="white-space:pre">        </span>64000                   # 0xfa00</div><div>.LCPI0_4:</div><div><span style="white-space:pre">     </span>.quad<span style="white-space:pre">        </span>128000                  # 0x1f400</div><div>.LCPI0_5:</div><div><span style="white-space:pre">    </span>.quad<span style="white-space:pre">        </span>192000                  # 0x2ee00</div><div>.LCPI0_6:</div><div><span style="white-space:pre">    </span>.quad<span style="white-space:pre">        </span>64                      # 0x40</div><div><span style="white-space:pre">     </span>.text</div><div><span style="white-space:pre"> </span>.globl<span style="white-space:pre">       </span>multiply</div><div><span style="white-space:pre">      </span>.p2align<span style="white-space:pre">     </span>4, 0x90</div><div><span style="white-space:pre">       </span>.type<span style="white-space:pre">        </span>multiply,@function</div><div>multiply:                               # @multiply</div><div><span style="white-space:pre">   </span>.cfi_startproc</div><div># BB#0:</div><div><span style="white-space:pre">  </span>push<span style="white-space:pre"> </span>rbp</div><div>.Lcfi0:</div><div><span style="white-space:pre">     </span>.cfi_def_cfa_offset 16</div><div><span style="white-space:pre">        </span>push<span style="white-space:pre"> </span>r15</div><div>.Lcfi1:</div><div><span style="white-space:pre">     </span>.cfi_def_cfa_offset 24</div><div><span style="white-space:pre">        </span>push<span style="white-space:pre"> </span>r14</div><div>.Lcfi2:</div><div><span style="white-space:pre">     </span>.cfi_def_cfa_offset 32</div><div><span style="white-space:pre">        </span>push<span style="white-space:pre"> </span>r12</div><div>.Lcfi3:</div><div><span style="white-space:pre">     </span>.cfi_def_cfa_offset 40</div><div><span style="white-space:pre">        
</span>push<span style="white-space:pre"> </span>rbx</div><div>.Lcfi4:</div><div><span style="white-space:pre">     </span>.cfi_def_cfa_offset 48</div><div>.Lcfi5:</div><div><span style="white-space:pre">  </span>.cfi_offset rbx, -48</div><div>.Lcfi6:</div><div><span style="white-space:pre">    </span>.cfi_offset r12, -40</div><div>.Lcfi7:</div><div><span style="white-space:pre">    </span>.cfi_offset r14, -32</div><div>.Lcfi8:</div><div><span style="white-space:pre">    </span>.cfi_offset r15, -24</div><div>.Lcfi9:</div><div><span style="white-space:pre">    </span>.cfi_offset rbp, -16</div><div><span style="white-space:pre">  </span>lea<span style="white-space:pre">  </span>r8, [rdi + 3856]</div><div><span style="white-space:pre">      </span>xor<span style="white-space:pre">  </span>r9d, r9d</div><div><span style="white-space:pre">      </span>vmovdqa64<span style="white-space:pre">    </span>zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 = [8,9,10,11,12,13,14,15]</div><div><span style="white-space:pre"> </span>vmovdqa64<span style="white-space:pre">    </span>zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 = [0,1,2,3,4,5,6,7]</div><div><span style="white-space:pre">       </span>vpbroadcastq<span style="white-space:pre"> </span>zmm2, qword ptr [rip + .LCPI0_2]</div><div><span style="white-space:pre">      </span>vpbroadcastq<span style="white-space:pre"> </span>zmm3, rsi</div><div><span style="white-space:pre">     </span>add<span style="white-space:pre">  </span>rsi, 3856000</div><div><span style="white-space:pre">  </span>vpbroadcastq<span style="white-space:pre"> </span>zmm4, qword ptr [rip + .LCPI0_3]</div><div><span style="white-space:pre">      </span>vpbroadcastq<span style="white-space:pre"> </span>zmm5, qword ptr [rip + .LCPI0_4]</div><div><span style="white-space:pre">      </span>vpbroadcastq<span style="white-space:pre"> </span>zmm6, qword ptr [rip + .LCPI0_5]</div><div><span style="white-space:pre">      </span>kxnorw<span style="white-space:pre">       
</span>k1, k0, k0</div><div><span style="white-space:pre">    </span>kshiftrw<span style="white-space:pre">     </span>k1, k1, 8</div><div><span style="white-space:pre">     </span>vpbroadcastq<span style="white-space:pre"> </span>zmm7, qword ptr [rip + .LCPI0_6]</div><div><span style="white-space:pre">      </span>.p2align<span style="white-space:pre">     </span>4, 0x90</div><div>.LBB0_1:                                # %.preheader26</div><div>                                        # =>This Loop Header: Depth=1</div><div>                                        #     Child Loop BB0_2 Depth 2</div><div>                                        #       Child Loop BB0_3 Depth 3</div><div>                                        #       Child Loop BB0_5 Depth 3</div><div><span style="white-space:pre">       </span>xor<span style="white-space:pre">  </span>r11d, r11d</div><div><span style="white-space:pre">    </span>.p2align<span style="white-space:pre">     </span>4, 0x90</div><div>.LBB0_2:                                # %.preheader</div><div>                                        #   Parent Loop BB0_1 Depth=1</div><div>                                        # =>  This Loop Header: Depth=2</div><div>                                        #       Child Loop BB0_3 Depth 3</div><div>                                        #       Child Loop BB0_5 Depth 3</div><div><span style="white-space:pre">        </span>vpxord<span style="white-space:pre">       </span>zmm8, zmm8, zmm8</div><div><span style="white-space:pre">      </span>mov<span style="white-space:pre">  </span>ecx, 960</div><div><span style="white-space:pre">      </span>vmovdqa64<span style="white-space:pre">    </span>zmm9, zmm23</div><div><span style="white-space:pre">   </span>vmovdqa64<span style="white-space:pre">    </span>zmm10, zmm22</div><div><span style="white-space:pre">  </span>vpxord<span style="white-space:pre">       </span>zmm11, zmm11, zmm11</div><div><span style="white-space:pre">   
</span>vpxord<span style="white-space:pre">       </span>zmm12, zmm12, zmm12</div><div><span style="white-space:pre">   </span>vpxord<span style="white-space:pre">       </span>zmm13, zmm13, zmm13</div><div><span style="white-space:pre">   </span>.p2align<span style="white-space:pre">     </span>4, 0x90</div><div>.LBB0_3:                                # %vector.body</div><div>                                        #   Parent Loop BB0_1 Depth=1</div><div>                                        #     Parent Loop BB0_2 Depth=2</div><div>                                        # =>    This Inner Loop Header: Depth=3</div><div>                                        # this bb will run 15 times</div><div><span style="white-space:pre">        </span>vmovq<span style="white-space:pre">        </span>rax, xmm9</div><div><span style="white-space:pre">     </span>imul<span style="white-space:pre"> </span>r10, r9, 4000</div><div><span style="white-space:pre"> </span>lea<span style="white-space:pre">  </span>rbx, [rdi + r10]</div><div><span style="white-space:pre">      </span><b><font color="#ff0000">vpmuludq</font><span style="color:rgb(255,0,0);white-space:pre">      </span><font color="#ff0000">zmm14, zmm10, zmm2       ; </font><font color="#0000ff">this is BB for vector here we have to do gather for B due to arbitrary addresses so here zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present in these indexes? and zmm2=[4000, 4000,.....4000]. these are the indexes for B we need to multiple indexes with stride=4000. i know here these indexes are 64 bit but the values stored in these locations are 32 bits then  the load using zmm10 index will give 8 elements of 32 bits present in these locations, so do the registers contain 8 elements of 32 bits present at specified indexes?? 
so after multiplication we get indexes for higher 8 elements of B i.e. [32000,36000,40000,.......60000].</font></b></div><div><b><font color="#0000ff"><br></font></b></div><div><b><span style="color:rgb(255,0,0);white-space:pre">        </span><font color="#ff0000">vpsrlq</font><span style="color:rgb(255,0,0);white-space:pre"> </span><font color="#ff0000">zmm15, zmm10, 32              ; </font><font color="#0000ff">I don't understand the need for this step, please explain the purpose of all these steps. here vpsrlq will shift right zmm10 values by 256 bits (32*8)....zmm10 initially=</font></b><b><font color="#0000ff">[8,9,10,11,12,13,14,15]. it will now become [0,0,0,0,8,9,10,11]...Am I correct? Please explain to me the purpose of this step.</font></b></div><div><b><span style="color:rgb(255,0,0);white-space:pre">        </span><font color="#ff0000">vpmuludq</font><span style="color:rgb(255,0,0);white-space:pre">       </span><font color="#ff0000">zmm15, zmm15, zmm2  ;    </font><font color="#0000ff">similarly </font></b><b><font color="#0000ff">don't understand the need for this step.</font></b><b><font color="#0000ff"> </font><font color="#ff0000">      </font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpsllq<span style="white-space:pre">       </span>zmm15, zmm15, 32    ; </b></font><b><font color="#0000ff">don't understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">   </span>vpaddq<span style="white-space:pre">       </span>zmm14, zmm14, zmm3  ; </b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">        </span>vpaddq<span style="white-space:pre">       </span>zmm14, zmm15, zmm14 ; </b></font><b><font color="#0000ff">don't understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">     </span>vpbroadcastq<span style="white-space:pre"> </span>zmm15, r11 ; </b></font><b><font color="#0000ff">r11 changes 
when loop variable j changes whats the need of this step?</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">       </span>vpsllq<span style="white-space:pre">       </span>zmm15, zmm15, 2   ; </b></font><b><font color="#0000ff">dont understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpaddq<span style="white-space:pre">       </span>zmm14, zmm14, zmm15 ; </b></font><b><font color="#0000ff">dont understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">     </span>vpmuludq<span style="white-space:pre">     </span>zmm16, zmm9, zmm2 ; </b></font><b><font color="#0000ff">here same as before the lower 8 elements of B indexes are computed as Zmm16=[0,4000,8000,.......28000]</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">       </span>vpsrlq<span style="white-space:pre">       </span>zmm17, zmm9, 32   </b></font><font color="#ff0000"><b>; </b></font><b><font color="#0000ff">dont understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">       </span>vpmuludq<span style="white-space:pre">     </span>zmm17, zmm17, zmm2  </b></font><font color="#ff0000"><b>; </b></font><b><font color="#0000ff">dont understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpsllq<span style="white-space:pre">       </span>zmm17, zmm17, 32  </b></font><font color="#ff0000"><b>; </b></font><b><font color="#0000ff">dont understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">        </span>vpaddq<span style="white-space:pre">       </span>zmm16, zmm16, zmm3  </b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">   </span>vpaddq<span style="white-space:pre">       </span>zmm16, zmm17, zmm16  </b></font><font color="#ff0000"><b>; 
</b></font><b><font color="#0000ff">dont understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">     </span>vpaddq<span style="white-space:pre">       </span>zmm15, zmm16, zmm15  </b></font><font color="#ff0000"><b>; </b></font><b><font color="#0000ff">dont understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">     </span>vpaddq<span style="white-space:pre">       </span>zmm16, zmm15, zmm4</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpaddq<span style="white-space:pre">       </span>zmm17, zmm14, zmm4</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpaddq<span style="white-space:pre">       </span>zmm18, zmm15, zmm5</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpaddq<span style="white-space:pre">       </span>zmm19, zmm14, zmm5</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpaddq<span style="white-space:pre">       </span>zmm20, zmm15, zmm6</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpaddq<span style="white-space:pre">       </span>zmm21, zmm14, zmm6</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>kmovw<span style="white-space:pre">        </span>k2, k1  </b></font><font color="#ff0000"><b>; </b></font><b><font color="#0000ff">dont understand the need for this step</font></b></div><div><b><span style="color:rgb(255,0,0);white-space:pre">     </span><font color="#ff0000">vpgatherqd</font><span style="color:rgb(255,0,0);white-space:pre">     </span><font color="#ff0000">ymm0 {k2}, zmmword ptr [zmm14] ; </font><font color="#0000ff">since zmm14 contains 8 indexes ( or values at these 8 indexes???) so it will load 8 elements not 16. 
here it should be zmm14</font></b><b><font color="#0000ff">=[32000,36000,40000,.......60000]. but by the above computation these indexes are changed??</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">   </span>kxnorw<span style="white-space:pre">       </span>k2, k0, k0  </b></font><font color="#ff0000"><b>; </b></font><b><font color="#0000ff">don't understand the need for this step</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpgatherqd<span style="white-space:pre">   </span>ymm14 {k2}, zmmword ptr [zmm15]   </b></font><font color="#ff0000"><b>; </b></font><b><font color="#0000ff">here again issues with index zmm15. it should be </font></b><b><font color="#0000ff">[0,4000,8000,.......28000] but it's different due to above computation.</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">       </span>vinserti64x4<span style="white-space:pre"> </span>zmm0, zmm14, ymm0, 1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">    </span>kmovw<span style="white-space:pre">        </span>k2, k1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">  </span>vpgatherqd<span style="white-space:pre">   </span>ymm14 {k2}, zmmword ptr [zmm17]</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre"> </span>kxnorw<span style="white-space:pre">       </span>k2, k0, k0</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpgatherqd<span style="white-space:pre">   </span>ymm15 {k2}, zmmword ptr [zmm16]</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre"> </span>vinserti64x4<span style="white-space:pre"> </span>zmm14, zmm15, ymm14, 1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">  </span>kmovw<span style="white-space:pre">        </span>k2, k1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">  
</span>vpgatherqd<span style="white-space:pre">   </span>ymm15 {k2}, zmmword ptr [zmm19]</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre"> </span>kxnorw<span style="white-space:pre">       </span>k2, k0, k0</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpgatherqd<span style="white-space:pre">   </span>ymm16 {k2}, zmmword ptr [zmm18]</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre"> </span>vinserti64x4<span style="white-space:pre"> </span>zmm15, zmm16, ymm15, 1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">  </span>kmovw<span style="white-space:pre">        </span>k2, k1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">  </span>vpgatherqd<span style="white-space:pre">   </span>ymm1 {k2}, zmmword ptr [zmm21]</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">  </span>kxnorw<span style="white-space:pre">       </span>k2, k0, k0</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpgatherqd<span style="white-space:pre">   </span>ymm16 {k2}, zmmword ptr [zmm20]</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre"> </span>vinserti64x4<span style="white-space:pre"> </span>zmm1, zmm16, ymm1, 1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">    </span>vpmulld<span style="white-space:pre">      </span>zmm0, zmm0, zmmword ptr [rbx + 4*rax]</b></font></div><div><span style="white-space:pre">  </span>vpmulld<span style="white-space:pre">      </span>zmm14, zmm14, zmmword ptr [rbx + 4*rax + 64]</div><div><span style="white-space:pre">  </span>vpmulld<span style="white-space:pre">      </span>zmm15, zmm15, zmmword ptr [rbx + 4*rax + 128]</div><div><span style="white-space:pre"> </span>vpmulld<span style="white-space:pre">      </span>zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192]</div><div><span style="white-space:pre"> 
  </span>vpaddd<span style="white-space:pre">       </span>zmm8, zmm0, zmm8</div><div><span style="white-space:pre">      </span>vpaddd<span style="white-space:pre">       </span>zmm11, zmm14, zmm11</div><div><span style="white-space:pre">   </span>vpaddd<span style="white-space:pre">       </span>zmm12, zmm15, zmm12</div><div><span style="white-space:pre">   </span>vpaddd<span style="white-space:pre">       </span>zmm13, zmm1, zmm13</div><div><span style="white-space:pre">    </span>vpaddq<span style="white-space:pre">       </span>zmm9, zmm9, zmm7        #zmm7=64</div><div><span style="white-space:pre">  </span>vpaddq<span style="white-space:pre">       </span>zmm10, zmm10, zmm7</div><div><span style="white-space:pre">    </span>add<span style="white-space:pre">  </span>rcx, -64     #decrement counter by 64</div><div><span style="white-space:pre">       </span>jne<span style="white-space:pre">  </span>.LBB0_3       # if rcx not equal to zero goto .lbbo_3</div><div># BB#4:                                 # %middle.block</div><div>                                        #   in Loop: Header=BB0_2 Depth=2</div><div><span style="white-space:pre">   </span>vpaddd<span style="white-space:pre">       </span>zmm0, zmm11, zmm8</div><div><span style="white-space:pre">     </span>vpaddd<span style="white-space:pre">       </span>zmm0, zmm12, zmm0</div><div><span style="white-space:pre">     </span>vpaddd<span style="white-space:pre">       </span>zmm0, zmm13, zmm0</div><div><span style="white-space:pre">     </span><b><font color="#ff0000">vshufi64x2</font><span style="color:rgb(255,0,0);white-space:pre">    </span><font color="#ff0000">zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1]   </font><font color="#0000ff">; please explain how shuffle instructions work here. 
i know of llvm ir shuffle, but these assembly ones are difficult for me to understand</font></b></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpaddd<span style="white-space:pre">       </span>zmm0, zmm0, zmm1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">        </span>vshufi64x2<span style="white-space:pre">   </span>zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpaddd<span style="white-space:pre">       </span>zmm0, zmm0, zmm1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">        </span>vpshufd<span style="white-space:pre">      </span>zmm1, zmm0, 238         # zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">      </span>vpaddd<span style="white-space:pre">       </span>zmm0, zmm0, zmm1</b></font></div><div><font color="#ff0000"><b><span style="white-space:pre">        </span>vpshufd<span style="white-space:pre">      </span>zmm1, zmm0, 229         # zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]</b></font></div><div><span style="white-space:pre">       </span>vpaddd<span style="white-space:pre">       </span>zmm0, zmm0, zmm1</div><div><span style="white-space:pre">      </span>vmovd<span style="white-space:pre">        </span>ebx, xmm0</div><div><span style="white-space:pre">     </span>mov<span style="white-space:pre">  </span>rax, r8</div><div><span style="white-space:pre">       </span>xor<span style="white-space:pre">  </span>r14d, r14d</div><div><span style="white-space:pre">    </span>.p2align<span style="white-space:pre">     </span>4, 0x90</div><div>.LBB0_5:                                #   Parent Loop BB0_1 Depth=1</div><div>                                        #     Parent Loop BB0_2 Depth=2</div><div>                                        # =>    This Inner Loop Header: 
Depth=3</div><div><span style="white-space:pre">       </span>lea<span style="white-space:pre">  </span>r15, [rsi + r14]</div><div><span style="white-space:pre">      </span>mov<span style="white-space:pre">  </span>r12d, dword ptr [r15 + 4*r11 - 16000]</div><div><span style="white-space:pre"> </span>imul<span style="white-space:pre"> </span>r12d, dword ptr [rax - 16]</div><div><span style="white-space:pre">    </span>mov<span style="white-space:pre">  </span>ecx, dword ptr [r15 + 4*r11 - 12000]</div><div><span style="white-space:pre">  </span>imul<span style="white-space:pre"> </span>ecx, dword ptr [rax - 12]</div><div><span style="white-space:pre">     </span>mov<span style="white-space:pre">  </span>ebp, dword ptr [r15 + 4*r11 - 8000]</div><div><span style="white-space:pre">   </span>imul<span style="white-space:pre"> </span>ebp, dword ptr [rax - 8]</div><div><span style="white-space:pre">      </span>add<span style="white-space:pre">  </span>r12d, ebx</div><div><span style="white-space:pre">     </span>add<span style="white-space:pre">  </span>ecx, r12d</div><div><span style="white-space:pre">     </span>add<span style="white-space:pre">  </span>ebp, ecx</div><div><span style="white-space:pre">      </span>mov<span style="white-space:pre">  </span>ecx, dword ptr [r15 + 4*r11 - 4000]</div><div><span style="white-space:pre">   </span>imul<span style="white-space:pre"> </span>ecx, dword ptr [rax - 4]</div><div><span style="white-space:pre">      </span>add<span style="white-space:pre">  </span>ecx, ebp</div><div><span style="white-space:pre">      </span>mov<span style="white-space:pre">  </span>ebx, dword ptr [r15 + 4*r11]</div><div><span style="white-space:pre">  </span>imul<span style="white-space:pre"> </span>ebx, dword ptr [rax]</div><div><span style="white-space:pre">  </span>add<span style="white-space:pre">  </span>ebx, ecx</div><div><span style="white-space:pre">      </span>add<span style="white-space:pre">  </span>r14, 20000</div><div><span 
style="white-space:pre">    </span>add<span style="white-space:pre">  </span>rax, 20</div><div><span style="white-space:pre">       </span>cmp<span style="white-space:pre">  </span>r14, 160000</div><div><span style="white-space:pre">   </span>jne<span style="white-space:pre">  </span>.LBB0_5</div><div># BB#6:                                 # %.loopexit</div><div>                                        #   in Loop: Header=BB0_2 Depth=2</div><div><span style="white-space:pre">       </span>add<span style="white-space:pre">  </span>r10, rdx                #rdx is c[][]</div><div><span style="white-space:pre"> </span>mov<span style="white-space:pre">  </span>dword ptr [r10 + 4*r11], ebx</div><div><span style="white-space:pre">  </span>inc<span style="white-space:pre">  </span>r11</div><div><span style="white-space:pre">   </span>cmp<span style="white-space:pre">  </span>r11, 1000</div><div><span style="white-space:pre">     </span>jne<span style="white-space:pre">  </span>.LBB0_2</div><div># BB#7:                                 #   in Loop: Header=BB0_1 Depth=1</div><div><span style="white-space:pre">      </span>inc<span style="white-space:pre">  </span>r9</div><div><span style="white-space:pre">    </span>add<span style="white-space:pre">  </span>r8, 4000</div><div><span style="white-space:pre">      </span>cmp<span style="white-space:pre">  </span>r9, 1000</div><div><span style="white-space:pre">      </span>jne<span style="white-space:pre">  </span>.LBB0_1</div><div># BB#8:</div><div><span style="white-space:pre"> </span>pop<span style="white-space:pre">  </span>rbx</div><div><span style="white-space:pre">   </span>pop<span style="white-space:pre">  </span>r12</div><div><span style="white-space:pre">   </span>pop<span style="white-space:pre">  </span>r14</div><div><span style="white-space:pre">   </span>pop<span style="white-space:pre">  </span>r15</div><div><span style="white-space:pre">   </span>pop<span style="white-space:pre">  </span>rbp</div><div><span 
style="white-space:pre">   </span>ret</div><div><br></div><div><br></div><div>Looking forward to your reply</div><div><br></div><div>Thank You</div><div><br></div></div><div><br></div></div>