diff --git a/Examples_async_target.tex b/Examples_async_target.tex index 04c79cd..e7ae503 100644 --- a/Examples_async_target.tex +++ b/Examples_async_target.tex @@ -24,7 +24,30 @@ accessed. \cexample{async_target}{2c} -The Fortran example uses allocatable arrays for dynamic memory on the device. +The Fortran example below is similar to the C version above. Instead of pointers, though, it uses +the convenience of Fortran allocatable arrays on the device. An allocatable array has the +same behavior in a \code{map} clause as a C pointer, in this case. + +If there is no shape specified for an allocatable array in a \code{map} clause, only the array descriptor +(also called a dope vector) is mapped. That is, device space is created for the descriptor, and it +is initially populated with host values. In this case, the \plc{v1} and \plc{v2} arrays will be in a +non-associated state on the device. When space for \plc{v1} and \plc{v2} is allocated on the device, +the addresses to the space will be included in their descriptors. + +At the end of the first \code{target} region, the descriptor (of an unshaped specification of an allocatable +array in a \code{map} clause) is returned with the raw device address of the allocated space. +The content of the array is not returned. In the example, the data in arrays \plc{v1} and \plc{v2} +are not returned. In the second \code{target} directive, the \plc{v1} and \plc{v2} descriptors are +re-created on the device with the descriptive information; references to the +vectors then point to the correct local storage, that is, the space that was not freed in the first \code{target} +directive. At the end of the second \code{target} region, the data in array \plc{p} is copied back +to the host since \plc{p} is not an allocatable array. 
+ +A \code{depend} clause is used in the \code{task} directive to provide a wait at the beginning of the second +\code{target} region, to ensure that there is no race condition with \plc{v1} and \plc{v2} in the two tasks. +It would be noncompliant to use \plc{v1} and/or \plc{v2} in lieu of \plc{N} in the \code{depend} clauses, +because the use of non-allocated allocatable arrays as list items in the first \code{depend} clause would +lead to unspecified behavior. \fexample{async_target}{2f} diff --git a/Title_Page.tex b/Title_Page.tex index a6d1f58..bcd08eb 100644 --- a/Title_Page.tex +++ b/Title_Page.tex @@ -17,7 +17,7 @@ \vspace{1.0in} - \textbf{Version 4.0.2 -- February, 2015} + \textbf{Version 4.0.2rv1 -- February, 2015} \end{center} \end{adjustwidth} @@ -38,5 +38,7 @@ permission of OpenMP Architecture Review Board.\end{adjustwidth} \thispagestyle{empty} \phantom{a} \emph{This page intentionally left blank} + +This working version incorporates tickets 299 and 342, plus a few other editorial changes. 
\vfill diff --git a/openmp.sty b/openmp.sty index 5379a76..f1de19b 100644 --- a/openmp.sty +++ b/openmp.sty @@ -439,9 +439,9 @@ }{ \def\cname{#1.#2} % Use following line for old numbering - \def\ename{\thechapter.#2} +% \def\ename{\thechapter.#2} % Use following for mneumonics -% \def\ename{\escstr{#1}.#2} + \def\ename{\escstr{#1}.#2} } \noindent \textit{Example \ename} diff --git a/sources/Example_async_target.2c.c b/sources/Example_async_target.2c.c index 5b65b7a..8564881 100644 --- a/sources/Example_async_target.2c.c +++ b/sources/Example_async_target.2c.c @@ -12,10 +12,11 @@ extern void init(float *, float *, int); #pragma omp end declare target extern void foo(); extern void output(float *, int); -void vec_mult(float *p, float *v1, float *v2, int N, int dev) +void vec_mult(float *p, int N, int dev) { + float *v1, *v2; int i; - #pragma omp task depend(out: v1, v2) + #pragma omp task shared(v1, v2) depend(out: v1, v2) #pragma omp target device(dev) map(v1, v2) { // check whether on device dev @@ -26,7 +27,7 @@ void vec_mult(float *p, float *v1, float *v2, int N, int dev) init(v1, v2, N); } foo(); // execute other work asychronously - #pragma omp task depend(in: v1, v2) + #pragma omp task shared(v1, v2, p) depend(in: v1, v2) #pragma omp target device(dev) map(to: v1, v2) map(from: p[0:N]) { // check whether on device dev @@ -38,5 +39,6 @@ void vec_mult(float *p, float *v1, float *v2, int N, int dev) free(v1); free(v2); } + #pragma omp taskwait output(p, N); } diff --git a/sources/Example_async_target.2f.f b/sources/Example_async_target.2f.f index 3953ece..acaaee9 100644 --- a/sources/Example_async_target.2f.f +++ b/sources/Example_async_target.2f.f @@ -10,7 +10,7 @@ integer :: i, idev !$omp declare target (init) - !$omp task depend(out: v1,v2) + !$omp task shared(v1,v2) depend(out: N) !$omp target device(idev) map(v1,v2) if( omp_is_initial_device() ) & stop "not executing on target device" @@ -21,7 +21,7 @@ call foo() ! 
execute other work asychronously - !$omp task depend(in: v1,v2) + !$omp task shared(v1,v2,p) depend(in: N) !$omp target device(idev) map(to: v1,v2) map(from: p) if( omp_is_initial_device() ) & stop "not executing on target device" @@ -34,6 +34,7 @@ !$omp end target !$omp end task + !$omp taskwait call output(p, N) end subroutine diff --git a/sources/Example_task_dep.5c.c b/sources/Example_task_dep.5c.c index d40657b..87417ad 100644 --- a/sources/Example_task_dep.5c.c +++ b/sources/Example_task_dep.5c.c @@ -13,7 +13,10 @@ C[N][N] ) for (i = 0; i < N; i+=BS) { for (j = 0; j < N; j+=BS) { for (k = 0; k < N; k+=BS) { -#pragma omp task depend ( in: A[i:BS][k:BS], B[k:BS][j:BS] ) \ +// Note 1: i, j, k, A, B, C are firstprivate by default +// Note 2: A, B and C are just pointers +#pragma omp task private(ii, jj, kk) \ + depend ( in: A[i:BS][k:BS], B[k:BS][j:BS] ) \ depend ( inout: C[i:BS][j:BS] ) for (ii = i; ii < i+BS; ii++ ) for (jj = j; jj < j+BS; jj++ ) diff --git a/sources/Example_task_dep.5f.f b/sources/Example_task_dep.5f.f index 466f38e..c41e9f1 100644 --- a/sources/Example_task_dep.5f.f +++ b/sources/Example_task_dep.5f.f @@ -3,19 +3,22 @@ ! @@compilable: yes ! @@linkable: no ! @@expect: success +! Assume BS divides N perfectly subroutine matmul_depend (N, BS, A, B, C) + implicit none integer :: N, BS, BM real, dimension(N, N) :: A, B, C integer :: i, j, k, ii, jj, kk - BM = BS -1 + BM = BS - 1 do i = 1, N, BS do j = 1, N, BS do k = 1, N, BS -!$omp task depend ( in: A(i:i+BM, k:k+BM), B(k:k+BM, j:j+BM) ) & +!$omp task shared(A,B,C) private(ii,jj,kk) & ! I,J,K are firstprivate by default +!$omp depend ( in: A(i:i+BM, k:k+BM), B(k:k+BM, j:j+BM) ) & !$omp depend ( inout: C(i:i+BM, j:j+BM) ) - do ii = i, i+BS - do jj = j, j+BS - do kk = k, k+BS + do ii = i, i+BM + do jj = j, j+BM + do kk = k, k+BM C(jj,ii) = C(jj,ii) + A(kk,ii) * B(jj,kk) end do end do