[llvm-dev] KNL Assembly Code for Matrix Multiplication

Fri Jun 30 08:59:08 PDT 2017

Some comments inline, I'll need to look more later.

~Craig

On Fri, Jun 30, 2017 at 5:28 AM, hameeza ahmed via llvm-dev <
llvm-dev at lists.llvm.org> wrote:

> Hello, I want some help in understanding knl intel assembly of matrix
> multiplication code. some of the things are not clear;
>
> here .c file:
>
> #include <stdio.h>
> #define N 1000
>
> // This function multiplies A[][] and B[][], and stores
> // the result in C[][]
> void multiply(int A[][N], int B[][N], int C[][N])
> {
>     int i, j, k, r;
>     for (i = 0; i < N; i++)
>     {
>         for (j = 0; j < N; j++)
>         {
>             r = 0;
>             for (k = 0; k < N; k++) {
>                 r += A[i][k]*B[k][j];}
>                 C[i][j] = r;
>
>         }
>
>     }
> }
>
> here .s file: * the code that i want to ask is in red color.*
>
> .text
> .intel_syntax noprefix
> .file "matn_o3.ll"
> .section .rodata,"a", at progbits
> .p2align 6
> .LCPI0_0:
> .quad 8                       # 0x8
> .quad 9                       # 0x9
> .quad 10                      # 0xa
> .quad 11                      # 0xb
> .quad 12                      # 0xc
> .quad 13                      # 0xd
> .quad 14                      # 0xe
> .quad 15                      # 0xf
> .LCPI0_1:
> .quad 0                       # 0x0
> .quad 1                       # 0x1
> .quad 2                       # 0x2
> .quad 3                       # 0x3
> .quad 4                       # 0x4
> .quad 5                       # 0x5
> .quad 6                       # 0x6
> .quad 7                       # 0x7
> .section .rodata.cst8,"aM", at progbits,8
> .p2align 3
> .LCPI0_2:
> .quad 4000                    # 0xfa0
> .LCPI0_3:
> .quad 64000                   # 0xfa00
> .LCPI0_4:
> .quad 128000                  # 0x1f400
> .LCPI0_5:
> .quad 192000                  # 0x2ee00
> .LCPI0_6:
> .quad 64                      # 0x40
> .text
> .globl multiply
> .p2align 4, 0x90
> .type multiply, at function
> multiply:                               # @multiply
> .cfi_startproc
> # BB#0:
> push rbp
> .Lcfi0:
> .cfi_def_cfa_offset 16
> push r15
> .Lcfi1:
> .cfi_def_cfa_offset 24
> push r14
> .Lcfi2:
> .cfi_def_cfa_offset 32
> push r12
> .Lcfi3:
> .cfi_def_cfa_offset 40
> push rbx
> .Lcfi4:
> .cfi_def_cfa_offset 48
> .Lcfi5:
> .cfi_offset rbx, -48
> .Lcfi6:
> .cfi_offset r12, -40
> .Lcfi7:
> .cfi_offset r14, -32
> .Lcfi8:
> .cfi_offset r15, -24
> .Lcfi9:
> .cfi_offset rbp, -16
> lea r8, [rdi + 3856]
> xor r9d, r9d
> vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 =
> [8,9,10,11,12,13,14,15]
> vmovdqa64 zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 = [0,1,2,3,4,5,6,7]
> vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2]
> vpbroadcastq zmm3, rsi
> add rsi, 3856000
> vpbroadcastq zmm4, qword ptr [rip + .LCPI0_3]
> vpbroadcastq zmm5, qword ptr [rip + .LCPI0_4]
> vpbroadcastq zmm6, qword ptr [rip + .LCPI0_5]
> kxnorw k1, k0, k0
> kshiftrw k1, k1, 8
> vpbroadcastq zmm7, qword ptr [rip + .LCPI0_6]
> .p2align 4, 0x90
> .LBB0_1:                                # %.preheader26
>                                         # =>This Loop Header: Depth=1
>                                         #     Child Loop BB0_2 Depth 2
>                                         #       Child Loop BB0_3 Depth 3
>                                         #       Child Loop BB0_5 Depth 3
> xor r11d, r11d
> .p2align 4, 0x90
> .LBB0_2:                                # %.preheader
>                                         #   Parent Loop BB0_1 Depth=1
>                                         # =>  This Loop Header: Depth=2
>                                         #       Child Loop BB0_3 Depth 3
>                                         #       Child Loop BB0_5 Depth 3
> vpxord zmm8, zmm8, zmm8
> mov ecx, 960
> vmovdqa64 zmm9, zmm23
> vmovdqa64 zmm10, zmm22
> vpxord zmm11, zmm11, zmm11
> vpxord zmm12, zmm12, zmm12
> vpxord zmm13, zmm13, zmm13
> .p2align 4, 0x90
> .LBB0_3:                                # %vector.body
>                                         #   Parent Loop BB0_1 Depth=1
>                                         #     Parent Loop BB0_2 Depth=2
>                                         # =>    This Inner Loop Header:
> Depth=3
>                                         # this bb will run 15 times
> vmovq rax, xmm9
> imul r10, r9, 4000
> lea rbx, [rdi + r10]
> *vpmuludq zmm14, zmm10, zmm2       ; this is BB for vector here we have to
> do gather for B due to arbitrary addresses so here
> zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present in
> these indexes? and zmm2=[4000, 4000,.....4000]. these are the indexes for B
> we need to multiple indexes with stride=4000. i know here these indexes are
> 64 bit but the values stored in these locations are 32 bits then  the load
> using zmm10 index will give 8 elements of 32 bits present in these
> locations, so do the registers contain 8 elements of 32 bits present at
> specified indexes?? so after multiplication we get indexes for higher 8
> elements of B i.e [3200,3600,40000,.......54000].*
>
> * vpsrlq zmm15, zmm10, 32              ; i dont understand the need for
> this step, please explain the purpose of all these steps. here vpsrlq will
> shift right zmm10 values by 256 bits (32*8)....zmmm10 initially=**[8,9,10,11,12,13,14,15].
> it will now become [0,0,0,0,8,9,10,11]...Am I correct? Please explain me
> the purpose of this step.*
> * vpmuludq zmm15, zmm15, zmm2  ;    similarly **dont understand the need
> for this step.*
> * vpsllq zmm15, zmm15, 32    ; **dont understand the need for this step*
> * vpaddq zmm14, zmm14, zmm3  ; *
> * vpaddq zmm14, zmm15, zmm14 ; **dont understand the need for this step*
>

vpsrlq zmm15, zmm10, 32 shifts every 64-bit element in zmm10 right by 32
bits. I believe this effectively taking every odd numbered 32-bit element
and moving them to the next lowest even numbered 32-bit element.

vmuludq multiplies all even numbered 32-bit elements and creates 64-bit
results.

The combination of the shifts, vpmuludq, and vpaddq is to multiply 64-bit
elements and create a 64-bit elements result. We don't have an instruction
for this so we have to multiply the low 32-bits of each element and the
high 32-bits of each element separately and add the results together. Looks
like we determined that the high 32-bits of one of the inputs is all zeros
so we skipped 1 of the multiplies and adds that would normally be required
for this operation.

> * vpbroadcastq zmm15, r11 ; **r11 changes when loop variable j changes
> whats the need of this step?*
> * vpsllq zmm15, zmm15, 2   ; **dont understand the need for this step*
> * vpaddq zmm14, zmm14, zmm15 ; **dont understand the need for this step*
> * vpmuludq zmm16, zmm9, zmm2 ; **here same as before the lower 8 elements
> of B indexes are computed as Zmm16=[0,4000,8000,.......28000]*
> * vpsrlq zmm17, zmm9, 32   **; **dont understand the need for this step*
> * vpmuludq zmm17, zmm17, zmm2  **; **dont understand the need for this
> step*
> * vpsllq zmm17, zmm17, 32  **; **dont understand the need for this step*
> * vpaddq zmm16, zmm16, zmm3  *
> * vpaddq zmm16, zmm17, zmm16  **; **dont understand the need for this
> step*
> * vpaddq zmm15, zmm16, zmm15  **; **dont understand the need for this
> step*
> * vpaddq zmm16, zmm15, zmm4*
> * vpaddq zmm17, zmm14, zmm4*
> * vpaddq zmm18, zmm15, zmm5*
> * vpaddq zmm19, zmm14, zmm5*
> * vpaddq zmm20, zmm15, zmm6*
> * vpaddq zmm21, zmm14, zmm6*
> * kmovw k2, k1  **; **dont understand the need for this step*
>

The gather instruction requires a mask of which elements to read. When the
gather completes, if there are no faults it will have written the mask
register to 0. So it needs to reloaded for each gather.

> * vpgatherqd ymm0 {k2}, zmmword ptr [zmm14] ; since zmm14 contains 8
> indexes ( or values at these 8 indexes???) so it will load 8 elements not
> 16. here it should be zmm14**=[3200,3600,40000,.......54000]. but by the
> above computation these indexes are changes??*
> * kxnorw k2, k0, k0  **; **dont understand the need for this step*
>
* vpgatherqd ymm14 {k2}, zmmword ptr [zmm15]   **; **here again issues with
> index zmm15. it should be **[0,4000,8000,.......28000] but its different
> due to above computation.*
> * vinserti64x4 zmm0, zmm14, ymm0, 1*
> * kmovw k2, k1*
> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm17]*
> * kxnorw k2, k0, k0*
> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm16]*
> * vinserti64x4 zmm14, zmm15, ymm14, 1*
> * kmovw k2, k1*
> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm19]*
> * kxnorw k2, k0, k0*
> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm18]*
> * vinserti64x4 zmm15, zmm16, ymm15, 1*
> * kmovw k2, k1*
> * vpgatherqd ymm1 {k2}, zmmword ptr [zmm21]*
> * kxnorw k2, k0, k0*
> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm20]*
> * vinserti64x4 zmm1, zmm16, ymm1, 1*
> * vpmulld zmm0, zmm0, zmmword ptr [rbx + 4*rax]*
> vpmulld zmm14, zmm14, zmmword ptr [rbx + 4*rax + 64]
> vpmulld zmm15, zmm15, zmmword ptr [rbx + 4*rax + 128]
> vpmulld zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192]
> vpaddd zmm8, zmm0, zmm8
> vpaddd zmm11, zmm14, zmm11
> vpaddd zmm12, zmm15, zmm12
> vpaddd zmm13, zmm1, zmm13
> vpaddq zmm9, zmm9, zmm7        #zmm7=64
> vpaddq zmm10, zmm10, zmm7
> add rcx, -64     #decrement counter by 64
> jne .LBB0_3       # if rcx not equal to zero goto .lbbo_3
> # BB#4:                                 # %middle.block
>                                         #   in Loop: Header=BB0_2 Depth=2
> vpaddd zmm0, zmm11, zmm8
> vpaddd zmm0, zmm12, zmm0
> vpaddd zmm0, zmm13, zmm0
> *vshufi64x2 zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1]   ; please
> explain how shuffle instructions work here. i know of llvm ir shuffle, but
> these assembly ones are difficult for me to understand*
>

You have to look at the size of the register being mentioned and the number
of elements in brackets. In this case the regsiter is 512-bits and the
number of elements is 8. 512/8 is 64. So its a shuffle of a v8i64 vector.
Then we read the element numbers from left to write just like the shuffle
IR instruction.

So element 0 of zmm1 gets the value of element 4 of zmm0. Element 1 of zmm1
gets the value of element 5 of zmm5, etc.

> * vpaddd zmm0, zmm0, zmm1*
> * vshufi64x2 zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]*
> * vpaddd zmm0, zmm0, zmm1*
> * vpshufd zmm1, zmm0, 238         # zmm1 =
> zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]*
> * vpaddd zmm0, zmm0, zmm1*
> * vpshufd zmm1, zmm0, 229         # zmm1 =
> zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]*
> vpaddd zmm0, zmm0, zmm1
> vmovd ebx, xmm0
> mov rax, r8
> xor r14d, r14d
> .p2align 4, 0x90
> .LBB0_5:                                #   Parent Loop BB0_1 Depth=1
>                                         #     Parent Loop BB0_2 Depth=2
>                                         # =>    This Inner Loop Header:
> Depth=3
> lea r15, [rsi + r14]
> mov r12d, dword ptr [r15 + 4*r11 - 16000]
> imul r12d, dword ptr [rax - 16]
> mov ecx, dword ptr [r15 + 4*r11 - 12000]
> imul ecx, dword ptr [rax - 12]
> mov ebp, dword ptr [r15 + 4*r11 - 8000]
> imul ebp, dword ptr [rax - 8]
> add r12d, ebx
> add ecx, r12d
> add ebp, ecx
> mov ecx, dword ptr [r15 + 4*r11 - 4000]
> imul ecx, dword ptr [rax - 4]
> add ecx, ebp
> mov ebx, dword ptr [r15 + 4*r11]
> imul ebx, dword ptr [rax]
> add ebx, ecx
> add r14, 20000
> add rax, 20
> cmp r14, 160000
> jne .LBB0_5
> # BB#6:                                 # %.loopexit
>                                         #   in Loop: Header=BB0_2 Depth=2
> add r10, rdx                #rdx is c[][]
> mov dword ptr [r10 + 4*r11], ebx
> inc r11
> cmp r11, 1000
> jne .LBB0_2
> # BB#7:                                 #   in Loop: Header=BB0_1 Depth=1
> inc r9
> add r8, 4000
> cmp r9, 1000
> jne .LBB0_1
> # BB#8:
> pop rbx
> pop r12
> pop r14
> pop r15
> pop rbp
> ret
>
>
> Looking forward to your reply
>
> Thank You
>
>
>
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170630/26dc30e7/attachment-0001.html>