easyconfigs-it4i/b/BLIS/BLIS-0.8.1_fix_dgemm-fpe-signalling-on-broadwell.patch
easybuild 536c92481f new file: a/aria2/aria2-1.35.0-GCCcore-10.3.0.eb
new file:   b/BLIS/BLIS-0.8.1_fix_dgemm-fpe-signalling-on-broadwell.patch
	new file:   b/BLIS/BLIS-2.2-GCCcore-10.2.0.eb
	new file:   b/BLIS/BLIS-2.2-amd_fix-undefined-reference-blist-abort.patch
	new file:   b/BLIS/BLIS-3.0.1-GCCcore-10.2.0.eb
	new file:   b/Biopython/Biopython-1.72-foss-2020b-Python-2.7.18.eb
	new file:   c/Cordax/Cordax-1.0-Python-2.7.18.eb
	new file:   d/DFTB+/DFTB+-21.2-intel-2020b-Python-3.8.6.eb
	new file:   d/DFTB+/DFTB+-21.2-intel-2020b-TB.eb
	new file:   d/Dakota/Dakota-6.15.0-fix_lapack_detection.patch
	new file:   d/Dakota/Dakota-6.15.0-foss-2021b.eb
	new file:   d/Dakota/Dakota-6.15.0-intel-2021b.eb
	new file:   e/ELPA/ELPA-2020.11.001-fosscuda-2020b.eb
	new file:   f/FoldX/FoldX-5.0.eb
	new file:   f/Forge/Forge-21.1.3.eb
	new file:   g/GROMACS/GROMACS-2018.8-fosscuda-2020b-PLUMED-2.5.6-switch.eb
	new file:   g/GROMACS/GROMACS-2021.4-foss-2020b-PLUMED-2.7.3.eb
	new file:   h/HDF5/HDF5-1.12.1-NVHPC-21.11.eb
	new file:   h/HDF5/HDF5-1.12.1-foss-2021b-parallel.eb
	new file:   h/HDF5/HDF5-1.12.1-iimpi-2021b.eb
	new file:   h/HDF5/HDF5-1.12.1-intel-2021b-parallel.eb
	new file:   h/HyperQueue/HyperQueue-0.8.0.eb
	new file:   h/Hypre/Hypre-2.23.0-intel-2020b.eb
	new file:   h/h5py/h5py-3.6.0-intel-2021b.eb
	new file:   l/LAPACK/LAPACK-3.10.0-GCC-11.2.0.eb
	new file:   l/libFLAME/libFLAME-5.2.0-GCCcore-10.2.0.eb
	new file:   m/MaSuRCA/MaSuRCA-4.0.7-foss-2020a-Perl-5.30.2.eb
	new file:   m/Molpro/Molpro-mpp-2022.1.1.linux_x86_64_mpipr.eb
	new file:   m/Molpro/Molpro-mpp-2022.1.1.linux_x86_64_sockets.eb
	new file:   n/nompi/nompi-2022a.eb
	new file:   o/ORCA/ORCA-5.0.3-OpenMPI-4.1.1.eb
	modified:   o/Octopus/Octopus-11.3-intel-2020b-mpi.eb
	new file:   o/OpenCV/OpenCV-4.5.3-foss-2021a-CUDA-11.3.1-contrib.eb
	new file:   o/OpenCV/OpenCV-4.5.5-foss-2021a-CUDA-11.4.1-contrib.eb
	new file:   o/OpenMPI/OpenMPI-4.1.1-GCC-11.2.0.eb
	new file:   o/OpenMPI/OpenMPI-4.1.2-GCC-11.2.0-Java-1.8.0_221.eb
	new file:   o/OpenMPI/OpenMPI-4.1.2-GCC-11.2.0.eb
	modified:   p/PETSc/PETSc-3.14.4-intel-2020b.eb
	modified:   p/PLUMED/PLUMED-2.5.6-fosscuda-2020b-patch.eb
	new file:   p/PLUMED/PLUMED-2.5.6-fosscuda-2020b-switch.eb
	new file:   p/PLUMED/PLUMED-2.7.3-foss-2020b.eb
	modified:   p/phonopy/phonopy-2.12.0-conda.eb
	modified:   q/QMCPACK/QMCPACK-3.11.0-intel-2020b-Python-3.8.6.eb
	new file:   q/QMCPACK/QMCPACK-3.12.0-intel-2020b-Python-3.8.6.eb
	new file:   q/QMCPACK/QMCPACK-3.12.0-intel-2021b-Python-3.9.6-lowopt.eb
	new file:   q/QMCPACK/QMCPACK-3.13.0-intel-2020b-Python-3.8.6.eb
	new file:   q/QuantumESPRESSO/QuantumESPRESSO-6.7-intel-2021a.eb
	new file:   q/QuantumESPRESSO/QuantumESPRESSO-7.0-NVHPC-21.9.eb
	new file:   r/rocm-cuda2hip/rocm-cuda2hip-4.3.1-gcccuda-2020b.eb
	new file:   s/ScaLAPACK/ScaLAPACK-2.2-NVHPC-21.11.eb
	new file:   t/Tango/Tango.eb
	new file:   t/Tensorflow/TensorFlow-2.5.0-fosscuda-2020b.eb
	new file:   v/VASP/VASP-5.4.1-24Jun15-intel-2020b.eb
	new file:   w/Waltz/Waltz.eb
	new file:   y/Yambo/Yambo-5.0.4-intel-2020a.eb
2022-03-04 13:14:37 +01:00

2220 lines
46 KiB
Diff

Taken from https://github.com/flame/blis/pull/544
Fixes a problem with DGEMM causing FPE (floating-point exception) signalling on Broadwell
See https://github.com/flame/blis/issues/486
Åke Sandgren, 20210916
commit 5191c43faccf45975f577c60b9089abee25722c9
Author: Devin Matthews <damatthews@smu.edu>
Date: Thu Sep 16 10:16:17 2021 -0500
Fix more copy-paste errors in the haswell gemmsup code.
Fixes #486.
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
index 4c6094b1..21dd3b89 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
@@ -101,7 +101,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
begin_asm()
//vzeroall() // zero all xmm/ymm registers.
-
+
mov(var(a), r14) // load address of a.
mov(var(rs_a), r8) // load rs_a
//mov(var(cs_a), r9) // load cs_a
@@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
-
+
mov(var(c), r12) // load address of c
mov(var(rs_c), rdi) // load rs_c
@@ -172,19 +172,19 @@ void bli_dgemmsup_rd_haswell_asm_6x4
prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
#endif
lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a
-
-
-
+
+
+
mov(var(k_iter16), rsi) // i = k_iter16;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKITER4) // if i == 0, jump to code that
// contains the k_iter4 loop.
-
-
+
+
label(.DLOOPKITER16) // MAIN LOOP
-
-
+
+
// ---------------------------------- iteration 0
#if 0
@@ -219,7 +219,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
vfmadd231pd(ymm1, ymm3, ymm14)
vfmadd231pd(ymm2, ymm3, ymm15)
-
+
// ---------------------------------- iteration 1
vmovupd(mem(rax ), ymm0)
@@ -250,7 +250,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
// ---------------------------------- iteration 2
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -312,27 +312,27 @@ void bli_dgemmsup_rd_haswell_asm_6x4
vfmadd231pd(ymm1, ymm3, ymm14)
vfmadd231pd(ymm2, ymm3, ymm15)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER16) // iterate again if i != 0.
-
-
-
-
-
-
+
+
+
+
+
+
label(.DCONSIDKITER4)
-
+
mov(var(k_iter4), rsi) // i = k_iter4;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
// considers k_left1 loop.
// else, we prepare to enter k_iter4 loop.
-
-
+
+
label(.DLOOPKITER4) // EDGE LOOP (ymm)
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -343,7 +343,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
vmovupd(mem(rax, r8, 1), ymm1)
vmovupd(mem(rax, r8, 2), ymm2)
add(imm(4*8), rax) // a += 4*cs_b = 4*8;
-
+
vmovupd(mem(rbx ), ymm3)
vfmadd231pd(ymm0, ymm3, ymm4)
vfmadd231pd(ymm1, ymm3, ymm5)
@@ -365,21 +365,21 @@ void bli_dgemmsup_rd_haswell_asm_6x4
vfmadd231pd(ymm1, ymm3, ymm14)
vfmadd231pd(ymm2, ymm3, ymm15)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER4) // iterate again if i != 0.
-
-
-
+
+
+
label(.DCONSIDKLEFT1)
-
+
mov(var(k_left1), rsi) // i = k_left1;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left1 loop.
-
-
+
+
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
@@ -387,12 +387,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4
// using the xmm registers would zero out the
// high bits of the destination registers,
// which would destory intermediate results.
-
+
vmovsd(mem(rax ), xmm0)
vmovsd(mem(rax, r8, 1), xmm1)
vmovsd(mem(rax, r8, 2), xmm2)
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
-
+
vmovsd(mem(rbx ), xmm3)
vfmadd231pd(ymm0, ymm3, ymm4)
vfmadd231pd(ymm1, ymm3, ymm5)
@@ -414,12 +414,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4
vfmadd231pd(ymm1, ymm3, ymm14)
vfmadd231pd(ymm2, ymm3, ymm15)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKLEFT1) // iterate again if i != 0.
-
-
-
+
+
+
@@ -427,11 +427,11 @@ void bli_dgemmsup_rd_haswell_asm_6x4
label(.DPOSTACCUM)
-
- // ymm4 ymm7 ymm10 ymm13
+
+ // ymm4 ymm7 ymm10 ymm13
// ymm5 ymm8 ymm11 ymm14
// ymm6 ymm9 ymm12 ymm15
-
+
vhaddpd( ymm7, ymm4, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm0 )
@@ -469,7 +469,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
// xmm6[0:3] = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
+
//mov(var(rs_c), rdi) // load rs_c
//lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
@@ -477,73 +477,73 @@ void bli_dgemmsup_rd_haswell_asm_6x4
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
-
+
vmulpd(ymm0, ymm4, ymm4) // scale by alpha
vmulpd(ymm0, ymm5, ymm5)
vmulpd(ymm0, ymm6, ymm6)
-
-
-
-
-
-
+
+
+
+
+
+
//mov(var(cs_c), rsi) // load cs_c
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
-
-
-
+
+
+
// now avoid loading C if beta == 0
-
+
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-
+
+
label(.DROWSTORED)
-
-
+
+
vfmadd231pd(mem(rcx), ymm3, ymm4)
vmovupd(ymm4, mem(rcx))
add(rdi, rcx)
-
+
vfmadd231pd(mem(rcx), ymm3, ymm5)
vmovupd(ymm5, mem(rcx))
add(rdi, rcx)
-
+
vfmadd231pd(mem(rcx), ymm3, ymm6)
vmovupd(ymm6, mem(rcx))
//add(rdi, rcx)
-
-
-
+
+
+
jmp(.DDONE) // jump to end.
-
-
-
-
+
+
+
+
label(.DBETAZERO)
-
-
+
+
label(.DROWSTORBZ)
-
-
+
+
vmovupd(ymm4, mem(rcx))
add(rdi, rcx)
-
+
vmovupd(ymm5, mem(rcx))
add(rdi, rcx)
-
+
vmovupd(ymm6, mem(rcx))
//add(rdi, rcx)
-
-
-
-
+
+
+
+
label(.DDONE)
-
-
+
+
lea(mem(r12, rdi, 2), r12) //
@@ -560,7 +560,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4
label(.DRETURN)
-
+
end_asm(
: // output operands (none)
@@ -629,7 +629,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
// -------------------------------------------------------------------------
begin_asm()
-
+
//vzeroall() // zero all xmm/ymm registers.
mov(var(a), rax) // load address of a.
@@ -649,7 +649,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
-
+
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
@@ -682,7 +682,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
//lea(mem(r14), rax) // rax = a;
//lea(mem(rdx), rbx) // rbx = b;
-
+
#if 1
//mov(var(rs_c), rdi) // load rs_c
//lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
@@ -690,18 +690,18 @@ void bli_dgemmsup_rd_haswell_asm_2x4
prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
#endif
-
-
-
+
+
+
mov(var(k_iter16), rsi) // i = k_iter16;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKITER4) // if i == 0, jump to code that
// contains the k_iter4 loop.
-
-
+
+
label(.DLOOPKITER16) // MAIN LOOP
-
-
+
+
// ---------------------------------- iteration 0
#if 0
@@ -730,7 +730,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
vfmadd231pd(ymm0, ymm3, ymm13)
vfmadd231pd(ymm1, ymm3, ymm14)
-
+
// ---------------------------------- iteration 1
vmovupd(mem(rax ), ymm0)
@@ -756,7 +756,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
// ---------------------------------- iteration 2
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -807,27 +807,27 @@ void bli_dgemmsup_rd_haswell_asm_2x4
vfmadd231pd(ymm0, ymm3, ymm13)
vfmadd231pd(ymm1, ymm3, ymm14)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER16) // iterate again if i != 0.
-
-
-
-
-
-
+
+
+
+
+
+
label(.DCONSIDKITER4)
-
+
mov(var(k_iter4), rsi) // i = k_iter4;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
// considers k_left1 loop.
// else, we prepare to enter k_iter4 loop.
-
-
+
+
label(.DLOOPKITER4) // EDGE LOOP (ymm)
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -836,7 +836,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4
vmovupd(mem(rax ), ymm0)
vmovupd(mem(rax, r8, 1), ymm1)
add(imm(4*8), rax) // a += 4*cs_b = 4*8;
-
+
vmovupd(mem(rbx ), ymm3)
vfmadd231pd(ymm0, ymm3, ymm4)
vfmadd231pd(ymm1, ymm3, ymm5)
@@ -854,21 +854,21 @@ void bli_dgemmsup_rd_haswell_asm_2x4
vfmadd231pd(ymm0, ymm3, ymm13)
vfmadd231pd(ymm1, ymm3, ymm14)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER4) // iterate again if i != 0.
-
-
-
+
+
+
label(.DCONSIDKLEFT1)
-
+
mov(var(k_left1), rsi) // i = k_left1;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left1 loop.
-
-
+
+
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
@@ -876,11 +876,11 @@ void bli_dgemmsup_rd_haswell_asm_2x4
// using the xmm registers would zero out the
// high bits of the destination registers,
// which would destory intermediate results.
-
+
vmovsd(mem(rax ), xmm0)
vmovsd(mem(rax, r8, 1), xmm1)
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
-
+
vmovsd(mem(rbx ), xmm3)
vfmadd231pd(ymm0, ymm3, ymm4)
vfmadd231pd(ymm1, ymm3, ymm5)
@@ -898,12 +898,12 @@ void bli_dgemmsup_rd_haswell_asm_2x4
vfmadd231pd(ymm0, ymm3, ymm13)
vfmadd231pd(ymm1, ymm3, ymm14)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKLEFT1) // iterate again if i != 0.
-
-
-
+
+
+
@@ -911,10 +911,10 @@ void bli_dgemmsup_rd_haswell_asm_2x4
label(.DPOSTACCUM)
-
- // ymm4 ymm7 ymm10 ymm13
+
+ // ymm4 ymm7 ymm10 ymm13
// ymm5 ymm8 ymm11 ymm14
-
+
vhaddpd( ymm7, ymm4, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm0 )
@@ -943,75 +943,75 @@ void bli_dgemmsup_rd_haswell_asm_2x4
//mov(var(rs_c), rdi) // load rs_c
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
-
+
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
-
+
vmulpd(ymm0, ymm4, ymm4) // scale by alpha
vmulpd(ymm0, ymm5, ymm5)
-
-
-
-
-
-
+
+
+
+
+
+
//mov(var(cs_c), rsi) // load cs_c
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
-
-
-
+
+
+
// now avoid loading C if beta == 0
-
+
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-
+
+
label(.DROWSTORED)
-
-
+
+
vfmadd231pd(mem(rcx), ymm3, ymm4)
vmovupd(ymm4, mem(rcx))
add(rdi, rcx)
-
+
vfmadd231pd(mem(rcx), ymm3, ymm5)
vmovupd(ymm5, mem(rcx))
//add(rdi, rcx)
-
-
-
+
+
+
jmp(.DDONE) // jump to end.
-
-
-
-
+
+
+
+
label(.DBETAZERO)
-
-
+
+
label(.DROWSTORBZ)
-
-
+
+
vmovupd(ymm4, mem(rcx))
add(rdi, rcx)
-
+
vmovupd(ymm5, mem(rcx))
//add(rdi, rcx)
-
-
-
-
+
+
+
+
label(.DDONE)
label(.DRETURN)
-
-
+
+
end_asm(
: // output operands (none)
@@ -1079,7 +1079,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
// -------------------------------------------------------------------------
begin_asm()
-
+
//vzeroall() // zero all xmm/ymm registers.
mov(var(a), rax) // load address of a.
@@ -1099,7 +1099,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
-
+
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
@@ -1128,26 +1128,26 @@ void bli_dgemmsup_rd_haswell_asm_1x4
//lea(mem(r14), rax) // rax = a;
//lea(mem(rdx), rbx) // rbx = b;
-
+
#if 1
//mov(var(rs_c), rdi) // load rs_c
//lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
- prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+ //prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
#endif
-
-
-
+
+
+
mov(var(k_iter16), rsi) // i = k_iter16;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKITER4) // if i == 0, jump to code that
// contains the k_iter4 loop.
-
-
+
+
label(.DLOOPKITER16) // MAIN LOOP
-
-
+
+
// ---------------------------------- iteration 0
#if 0
@@ -1170,7 +1170,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
vfmadd231pd(ymm0, ymm3, ymm13)
-
+
// ---------------------------------- iteration 1
vmovupd(mem(rax ), ymm0)
@@ -1191,7 +1191,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
// ---------------------------------- iteration 2
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
#endif
@@ -1231,27 +1231,27 @@ void bli_dgemmsup_rd_haswell_asm_1x4
add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
vfmadd231pd(ymm0, ymm3, ymm13)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER16) // iterate again if i != 0.
-
-
-
-
-
-
+
+
+
+
+
+
label(.DCONSIDKITER4)
-
+
mov(var(k_iter4), rsi) // i = k_iter4;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
// considers k_left1 loop.
// else, we prepare to enter k_iter4 loop.
-
-
+
+
label(.DLOOPKITER4) // EDGE LOOP (ymm)
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -1259,7 +1259,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
vmovupd(mem(rax ), ymm0)
add(imm(4*8), rax) // a += 4*cs_b = 4*8;
-
+
vmovupd(mem(rbx ), ymm3)
vfmadd231pd(ymm0, ymm3, ymm4)
@@ -1273,21 +1273,21 @@ void bli_dgemmsup_rd_haswell_asm_1x4
add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
vfmadd231pd(ymm0, ymm3, ymm13)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER4) // iterate again if i != 0.
-
-
-
+
+
+
label(.DCONSIDKLEFT1)
-
+
mov(var(k_left1), rsi) // i = k_left1;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left1 loop.
-
-
+
+
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
@@ -1295,10 +1295,10 @@ void bli_dgemmsup_rd_haswell_asm_1x4
// using the xmm registers would zero out the
// high bits of the destination registers,
// which would destory intermediate results.
-
+
vmovsd(mem(rax ), xmm0)
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
-
+
vmovsd(mem(rbx ), xmm3)
vfmadd231pd(ymm0, ymm3, ymm4)
@@ -1312,12 +1312,12 @@ void bli_dgemmsup_rd_haswell_asm_1x4
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
vfmadd231pd(ymm0, ymm3, ymm13)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKLEFT1) // iterate again if i != 0.
-
-
-
+
+
+
@@ -1325,9 +1325,9 @@ void bli_dgemmsup_rd_haswell_asm_1x4
label(.DPOSTACCUM)
-
- // ymm4 ymm7 ymm10 ymm13
-
+
+ // ymm4 ymm7 ymm10 ymm13
+
vhaddpd( ymm7, ymm4, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm0 )
@@ -1339,15 +1339,15 @@ void bli_dgemmsup_rd_haswell_asm_1x4
vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
- vhaddpd( ymm8, ymm5, ymm0 )
- vextractf128(imm(1), ymm0, xmm1 )
- vaddpd( xmm0, xmm1, xmm0 )
+ //vhaddpd( ymm8, ymm5, ymm0 )
+ //vextractf128(imm(1), ymm0, xmm1 )
+ //vaddpd( xmm0, xmm1, xmm0 )
- vhaddpd( ymm14, ymm11, ymm2 )
- vextractf128(imm(1), ymm2, xmm1 )
- vaddpd( xmm2, xmm1, xmm2 )
+ //vhaddpd( ymm14, ymm11, ymm2 )
+ //vextractf128(imm(1), ymm2, xmm1 )
+ //vaddpd( xmm2, xmm1, xmm2 )
- vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
+ //vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
// xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
@@ -1355,67 +1355,67 @@ void bli_dgemmsup_rd_haswell_asm_1x4
//mov(var(rs_c), rdi) // load rs_c
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
-
+
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
-
+
vmulpd(ymm0, ymm4, ymm4) // scale by alpha
-
-
-
-
-
-
+
+
+
+
+
+
//mov(var(cs_c), rsi) // load cs_c
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
-
-
-
+
+
+
// now avoid loading C if beta == 0
-
+
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-
+
+
label(.DROWSTORED)
-
-
+
+
vfmadd231pd(mem(rcx), ymm3, ymm4)
vmovupd(ymm4, mem(rcx))
//add(rdi, rcx)
-
-
-
+
+
+
jmp(.DDONE) // jump to end.
-
-
-
-
+
+
+
+
label(.DBETAZERO)
-
-
+
+
label(.DROWSTORBZ)
-
-
+
+
vmovupd(ymm4, mem(rcx))
//add(rdi, rcx)
-
-
-
-
+
+
+
+
label(.DDONE)
label(.DRETURN)
-
-
+
+
end_asm(
: // output operands (none)
commit e3dc1954ffb5eee2a8b41fce85ba589f75770eea
Author: Devin Matthews <damatthews@smu.edu>
Date: Thu Sep 16 10:59:37 2021 -0500
Fix problem where uninitialized registers are included in vhaddpd in the Mx1 gemmsup kernels for haswell.
The fix is to use the same (valid) source register twice in the horizontal addition.
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
index 6e3c1a0e..457ef9f2 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
@@ -99,9 +99,9 @@ void bli_dgemmsup_rd_haswell_asm_6x1
// -------------------------------------------------------------------------
begin_asm()
-
+
//vzeroall() // zero all xmm/ymm registers.
-
+
mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a
//mov(var(cs_a), r9) // load cs_a
@@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
//lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
-
+
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
@@ -163,19 +163,19 @@ void bli_dgemmsup_rd_haswell_asm_6x1
prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c
prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c
#endif
-
-
-
+
+
+
mov(var(k_iter16), rsi) // i = k_iter16;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKITER4) // if i == 0, jump to code that
// contains the k_iter4 loop.
-
-
+
+
label(.DLOOPKITER16) // MAIN LOOP
-
-
+
+
// ---------------------------------- iteration 0
#if 0
@@ -206,7 +206,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm14)
-
+
// ---------------------------------- iteration 1
vmovupd(mem(rbx ), ymm0)
@@ -233,7 +233,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
// ---------------------------------- iteration 2
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -287,27 +287,27 @@ void bli_dgemmsup_rd_haswell_asm_6x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm14)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER16) // iterate again if i != 0.
-
-
-
-
-
-
+
+
+
+
+
+
label(.DCONSIDKITER4)
-
+
mov(var(k_iter4), rsi) // i = k_iter4;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
// considers k_left1 loop.
// else, we prepare to enter k_iter4 loop.
-
-
+
+
label(.DLOOPKITER4) // EDGE LOOP (ymm)
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -336,21 +336,21 @@ void bli_dgemmsup_rd_haswell_asm_6x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm14)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER4) // iterate again if i != 0.
-
-
-
+
+
+
label(.DCONSIDKLEFT1)
-
+
mov(var(k_left1), rsi) // i = k_left1;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left1 loop.
-
-
+
+
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
@@ -358,7 +358,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1
// using the xmm registers would zero out the
// high bits of the destination registers,
// which would destory intermediate results.
-
+
vmovsd(mem(rbx ), xmm0)
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
@@ -381,12 +381,12 @@ void bli_dgemmsup_rd_haswell_asm_6x1
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
vfmadd231pd(ymm0, ymm3, ymm14)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKLEFT1) // iterate again if i != 0.
-
-
-
+
+
+
@@ -399,28 +399,28 @@ void bli_dgemmsup_rd_haswell_asm_6x1
// ymm10
// ymm12
// ymm14
-
- vhaddpd( ymm5, ymm4, ymm0 )
+
+ vhaddpd( ymm4, ymm4, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm4 )
- vhaddpd( ymm7, ymm6, ymm0 )
+ vhaddpd( ymm6, ymm6, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm6 )
- vhaddpd( ymm9, ymm8, ymm0 )
+ vhaddpd( ymm8, ymm8, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm8 )
- vhaddpd( ymm11, ymm10, ymm0 )
+ vhaddpd( ymm10, ymm10, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm10 )
- vhaddpd( ymm13, ymm12, ymm0 )
+ vhaddpd( ymm12, ymm12, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm12 )
- vhaddpd( ymm15, ymm14, ymm0 )
+ vhaddpd( ymm14, ymm14, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm14 )
@@ -435,114 +435,114 @@ void bli_dgemmsup_rd_haswell_asm_6x1
//mov(var(rs_c), rdi) // load rs_c
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double)
-
+
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
-
+
vmulpd(xmm0, xmm4, xmm4) // scale by alpha
vmulpd(xmm0, xmm6, xmm6)
vmulpd(xmm0, xmm8, xmm8)
vmulpd(xmm0, xmm10, xmm10)
vmulpd(xmm0, xmm12, xmm12)
vmulpd(xmm0, xmm14, xmm14)
-
-
-
-
-
-
+
+
+
+
+
+
//mov(var(cs_c), rsi) // load cs_c
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
-
-
-
+
+
+
// now avoid loading C if beta == 0
-
+
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-
+
+
label(.DROWSTORED)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm4)
vmovsd(xmm4, mem(rcx))
add(rdi, rcx)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm6)
vmovsd(xmm6, mem(rcx))
add(rdi, rcx)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm8)
vmovsd(xmm8, mem(rcx))
add(rdi, rcx)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm10)
vmovsd(xmm10, mem(rcx))
add(rdi, rcx)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm12)
vmovsd(xmm12, mem(rcx))
add(rdi, rcx)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm14)
vmovsd(xmm14, mem(rcx))
//add(rdi, rcx)
-
-
-
+
+
+
jmp(.DDONE) // jump to end.
-
-
-
-
+
+
+
+
label(.DBETAZERO)
-
-
+
+
label(.DROWSTORBZ)
-
-
+
+
vmovsd(xmm4, mem(rcx))
add(rdi, rcx)
-
+
vmovsd(xmm6, mem(rcx))
add(rdi, rcx)
-
+
vmovsd(xmm8, mem(rcx))
add(rdi, rcx)
-
+
vmovsd(xmm10, mem(rcx))
add(rdi, rcx)
-
+
vmovsd(xmm12, mem(rcx))
add(rdi, rcx)
-
+
vmovsd(xmm14, mem(rcx))
//add(rdi, rcx)
-
-
-
-
+
+
+
+
label(.DDONE)
-
+
label(.DRETURN)
-
+
end_asm(
: // output operands (none)
@@ -613,9 +613,9 @@ void bli_dgemmsup_rd_haswell_asm_3x1
// -------------------------------------------------------------------------
begin_asm()
-
+
//vzeroall() // zero all xmm/ymm registers.
-
+
mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a
//mov(var(cs_a), r9) // load cs_a
@@ -633,7 +633,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
//lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
-
+
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
@@ -671,19 +671,19 @@ void bli_dgemmsup_rd_haswell_asm_3x1
prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
#endif
-
-
-
+
+
+
mov(var(k_iter16), rsi) // i = k_iter16;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKITER4) // if i == 0, jump to code that
// contains the k_iter4 loop.
-
-
+
+
label(.DLOOPKITER16) // MAIN LOOP
-
-
+
+
// ---------------------------------- iteration 0
#if 0
@@ -705,7 +705,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm8)
-
+
// ---------------------------------- iteration 1
vmovupd(mem(rbx ), ymm0)
@@ -723,7 +723,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
// ---------------------------------- iteration 2
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -759,27 +759,27 @@ void bli_dgemmsup_rd_haswell_asm_3x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm8)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER16) // iterate again if i != 0.
-
-
-
-
-
-
+
+
+
+
+
+
label(.DCONSIDKITER4)
-
+
mov(var(k_iter4), rsi) // i = k_iter4;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
// considers k_left1 loop.
// else, we prepare to enter k_iter4 loop.
-
-
+
+
label(.DLOOPKITER4) // EDGE LOOP (ymm)
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -799,21 +799,21 @@ void bli_dgemmsup_rd_haswell_asm_3x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm8)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER4) // iterate again if i != 0.
-
-
-
+
+
+
label(.DCONSIDKLEFT1)
-
+
mov(var(k_left1), rsi) // i = k_left1;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left1 loop.
-
-
+
+
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
@@ -821,7 +821,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1
// using the xmm registers would zero out the
// high bits of the destination registers,
// which would destory intermediate results.
-
+
vmovsd(mem(rbx ), xmm0)
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
@@ -835,12 +835,12 @@ void bli_dgemmsup_rd_haswell_asm_3x1
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
vfmadd231pd(ymm0, ymm3, ymm8)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKLEFT1) // iterate again if i != 0.
-
-
-
+
+
+
@@ -850,16 +850,16 @@ void bli_dgemmsup_rd_haswell_asm_3x1
// ymm4
// ymm6
// ymm8
-
- vhaddpd( ymm5, ymm4, ymm0 )
+
+ vhaddpd( ymm4, ymm4, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm4 )
- vhaddpd( ymm7, ymm6, ymm0 )
+ vhaddpd( ymm6, ymm6, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm6 )
- vhaddpd( ymm9, ymm8, ymm0 )
+ vhaddpd( ymm8, ymm8, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm8 )
@@ -871,87 +871,87 @@ void bli_dgemmsup_rd_haswell_asm_3x1
//mov(var(rs_c), rdi) // load rs_c
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double)
-
+
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
-
+
vmulpd(xmm0, xmm4, xmm4) // scale by alpha
vmulpd(xmm0, xmm6, xmm6)
vmulpd(xmm0, xmm8, xmm8)
-
-
-
-
-
-
+
+
+
+
+
+
//mov(var(cs_c), rsi) // load cs_c
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
-
-
-
+
+
+
// now avoid loading C if beta == 0
-
+
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-
+
+
label(.DROWSTORED)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm4)
vmovsd(xmm4, mem(rcx))
add(rdi, rcx)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm6)
vmovsd(xmm6, mem(rcx))
add(rdi, rcx)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm8)
vmovsd(xmm8, mem(rcx))
//add(rdi, rcx)
-
-
-
+
+
+
jmp(.DDONE) // jump to end.
-
-
-
-
+
+
+
+
label(.DBETAZERO)
-
-
+
+
label(.DROWSTORBZ)
-
-
+
+
vmovsd(xmm4, mem(rcx))
add(rdi, rcx)
-
+
vmovsd(xmm6, mem(rcx))
add(rdi, rcx)
-
+
vmovsd(xmm8, mem(rcx))
//add(rdi, rcx)
-
-
-
-
+
+
+
+
label(.DDONE)
-
+
label(.DRETURN)
-
+
end_asm(
: // output operands (none)
@@ -1022,9 +1022,9 @@ void bli_dgemmsup_rd_haswell_asm_2x1
// -------------------------------------------------------------------------
begin_asm()
-
+
//vzeroall() // zero all xmm/ymm registers.
-
+
mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a
//mov(var(cs_a), r9) // load cs_a
@@ -1042,7 +1042,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
//lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
-
+
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
@@ -1078,19 +1078,19 @@ void bli_dgemmsup_rd_haswell_asm_2x1
prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c
prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
#endif
-
-
-
+
+
+
mov(var(k_iter16), rsi) // i = k_iter16;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKITER4) // if i == 0, jump to code that
// contains the k_iter4 loop.
-
-
+
+
label(.DLOOPKITER16) // MAIN LOOP
-
-
+
+
// ---------------------------------- iteration 0
#if 0
@@ -1109,7 +1109,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm6)
-
+
// ---------------------------------- iteration 1
vmovupd(mem(rbx ), ymm0)
@@ -1124,7 +1124,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
// ---------------------------------- iteration 2
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -1154,27 +1154,27 @@ void bli_dgemmsup_rd_haswell_asm_2x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm6)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER16) // iterate again if i != 0.
-
-
-
-
-
-
+
+
+
+
+
+
label(.DCONSIDKITER4)
-
+
mov(var(k_iter4), rsi) // i = k_iter4;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
// considers k_left1 loop.
// else, we prepare to enter k_iter4 loop.
-
-
+
+
label(.DLOOPKITER4) // EDGE LOOP (ymm)
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -1191,21 +1191,21 @@ void bli_dgemmsup_rd_haswell_asm_2x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm6)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER4) // iterate again if i != 0.
-
-
-
+
+
+
label(.DCONSIDKLEFT1)
-
+
mov(var(k_left1), rsi) // i = k_left1;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left1 loop.
-
-
+
+
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
@@ -1213,7 +1213,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1
// using the xmm registers would zero out the
// high bits of the destination registers,
// which would destory intermediate results.
-
+
vmovsd(mem(rbx ), xmm0)
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
@@ -1224,12 +1224,12 @@ void bli_dgemmsup_rd_haswell_asm_2x1
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
vfmadd231pd(ymm0, ymm3, ymm6)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKLEFT1) // iterate again if i != 0.
-
-
-
+
+
+
@@ -1238,12 +1238,12 @@ void bli_dgemmsup_rd_haswell_asm_2x1
// ymm4
// ymm6
-
- vhaddpd( ymm5, ymm4, ymm0 )
+
+ vhaddpd( ymm4, ymm4, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm4 )
- vhaddpd( ymm7, ymm6, ymm0 )
+ vhaddpd( ymm6, ymm6, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm6 )
@@ -1254,78 +1254,78 @@ void bli_dgemmsup_rd_haswell_asm_2x1
//mov(var(rs_c), rdi) // load rs_c
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double)
-
+
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
-
+
vmulpd(xmm0, xmm4, xmm4) // scale by alpha
vmulpd(xmm0, xmm6, xmm6)
-
-
-
-
-
-
+
+
+
+
+
+
//mov(var(cs_c), rsi) // load cs_c
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
-
-
-
+
+
+
// now avoid loading C if beta == 0
-
+
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-
+
+
label(.DROWSTORED)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm4)
vmovsd(xmm4, mem(rcx))
add(rdi, rcx)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm6)
vmovsd(xmm6, mem(rcx))
//add(rdi, rcx)
-
-
-
+
+
+
jmp(.DDONE) // jump to end.
-
-
-
-
+
+
+
+
label(.DBETAZERO)
-
-
+
+
label(.DROWSTORBZ)
-
-
+
+
vmovsd(xmm4, mem(rcx))
add(rdi, rcx)
-
+
vmovsd(xmm6, mem(rcx))
//add(rdi, rcx)
-
-
-
-
+
+
+
+
label(.DDONE)
-
+
label(.DRETURN)
-
+
end_asm(
: // output operands (none)
@@ -1396,9 +1396,9 @@ void bli_dgemmsup_rd_haswell_asm_1x1
// -------------------------------------------------------------------------
begin_asm()
-
+
//vzeroall() // zero all xmm/ymm registers.
-
+
mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a
//mov(var(cs_a), r9) // load cs_a
@@ -1416,7 +1416,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
//lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a
-
+
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
@@ -1450,19 +1450,19 @@ void bli_dgemmsup_rd_haswell_asm_1x1
//lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c;
prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c
#endif
-
-
-
+
+
+
mov(var(k_iter16), rsi) // i = k_iter16;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKITER4) // if i == 0, jump to code that
// contains the k_iter4 loop.
-
-
+
+
label(.DLOOPKITER16) // MAIN LOOP
-
-
+
+
// ---------------------------------- iteration 0
#if 0
@@ -1478,7 +1478,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm4)
-
+
// ---------------------------------- iteration 1
vmovupd(mem(rbx ), ymm0)
@@ -1490,7 +1490,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
// ---------------------------------- iteration 2
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -1514,27 +1514,27 @@ void bli_dgemmsup_rd_haswell_asm_1x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm4)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER16) // iterate again if i != 0.
-
-
-
-
-
-
+
+
+
+
+
+
label(.DCONSIDKITER4)
-
+
mov(var(k_iter4), rsi) // i = k_iter4;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT1) // if i == 0, jump to code that
// considers k_left1 loop.
// else, we prepare to enter k_iter4 loop.
-
-
+
+
label(.DLOOPKITER4) // EDGE LOOP (ymm)
-
+
#if 0
prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
@@ -1548,21 +1548,21 @@ void bli_dgemmsup_rd_haswell_asm_1x1
add(imm(4*8), rax) // a += 4*cs_a = 4*8;
vfmadd231pd(ymm0, ymm3, ymm4)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKITER4) // iterate again if i != 0.
-
-
-
+
+
+
label(.DCONSIDKLEFT1)
-
+
mov(var(k_left1), rsi) // i = k_left1;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left1 loop.
-
-
+
+
label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
@@ -1570,7 +1570,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1
// using the xmm registers would zero out the
// high bits of the destination registers,
// which would destory intermediate results.
-
+
vmovsd(mem(rbx ), xmm0)
add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
@@ -1578,12 +1578,12 @@ void bli_dgemmsup_rd_haswell_asm_1x1
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
vfmadd231pd(ymm0, ymm3, ymm4)
-
+
dec(rsi) // i -= 1;
jne(.DLOOPKLEFT1) // iterate again if i != 0.
-
-
-
+
+
+
@@ -1591,8 +1591,8 @@ void bli_dgemmsup_rd_haswell_asm_1x1
label(.DPOSTACCUM)
// ymm4
-
- vhaddpd( ymm5, ymm4, ymm0 )
+
+ vhaddpd( ymm4, ymm4, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddpd( xmm0, xmm1, xmm4 )
@@ -1602,69 +1602,69 @@ void bli_dgemmsup_rd_haswell_asm_1x1
//mov(var(rs_c), rdi) // load rs_c
//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double)
-
+
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
-
+
vmulpd(xmm0, xmm4, xmm4) // scale by alpha
-
-
-
-
-
-
+
+
+
+
+
+
//mov(var(cs_c), rsi) // load cs_c
//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
-
-
-
+
+
+
// now avoid loading C if beta == 0
-
+
vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-
+
+
label(.DROWSTORED)
-
- vmovsd(mem(rcx), xmm0)
+
+ vmovsd(mem(rcx), xmm0)
vfmadd231pd(xmm0, xmm3, xmm4)
vmovsd(xmm4, mem(rcx))
//add(rdi, rcx)
-
-
-
+
+
+
jmp(.DDONE) // jump to end.
-
-
-
-
+
+
+
+
label(.DBETAZERO)
-
-
+
+
label(.DROWSTORBZ)
-
-
+
+
vmovsd(xmm4, mem(rcx))
//add(rdi, rcx)
-
-
-
-
+
+
+
+
label(.DDONE)
-
+
label(.DRETURN)
-
+
end_asm(
: // output operands (none)
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
index 21dd3b89..516bfced 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
@@ -1338,17 +1338,6 @@ void bli_dgemmsup_rd_haswell_asm_1x4
vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
-
- //vhaddpd( ymm8, ymm5, ymm0 )
- //vextractf128(imm(1), ymm0, xmm1 )
- //vaddpd( xmm0, xmm1, xmm0 )
-
- //vhaddpd( ymm14, ymm11, ymm2 )
- //vextractf128(imm(1), ymm2, xmm1 )
- //vaddpd( xmm2, xmm1, xmm2 )
-
- //vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
-
// xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)