diff --git a/Changes.log b/Changes.log index b47860c..aad560f 100644 --- a/Changes.log +++ b/Changes.log @@ -1,3 +1,47 @@ +[2-Feb-2015] Version 4.0.2 +Changes from 4.0.1ltx + +1. Source code changes (Ticket #342) + +2. New addition (Ticket #299) + +Examples_SIMD.tex +Example_SIMD.1c.c +Example_SIMD.1f.f +Example_SIMD.2c.c +Example_SIMD.2f.f +Example_SIMD.3c.c +Example_SIMD.3f.f +Example_SIMD.4c.c +Example_SIMD.4f.f +Example_SIMD.5c.c +Example_SIMD.5f.f +Example_SIMD.6c.c +Example_SIMD.6f.f +Example_SIMD.7c.c +Example_SIMD.7f.f +Example_SIMD.8c.c +Example_SIMD.8f.f + +3. Other changes + +- Move task dependence examples from tasking to a separate chapter. + tasking.15-19 -> task_dep.1-5 + +- Fix broken links + -Chap-4 (icv), page 11: "According to $" + According to Section 2.3 of the OpenMP 4.0 specification + + -Chap-10 (fort_loopvar), page 31: "see $ and $" + see Section 2.7.1 and Section 2.14.1 of the OpenMP 4.0 specification + + -Chap-12 (collapse), page 39: "According to $" + According to Section 2.12.8 of the OpenMP 4.0 specification + + -Chap-16 (tasking), page 54, 57: "illustrated in $" + illustrated in Section 2.11.3 of the OpenMP 4.0 specification + + [6-Jan-2015] Version 4.0.1ltx Changes from 4.0.1ltx-21Nov-2014 diff --git a/Examples_SIMD.tex b/Examples_SIMD.tex new file mode 100644 index 0000000..24a24f5 --- /dev/null +++ b/Examples_SIMD.tex @@ -0,0 +1,109 @@ +\pagebreak +\chapter{SIMD Constructs} +\label{chap:SIMD} + +The following examples illustrate the use of SIMD constructs for vectorization. + +Compilers may not vectorize loops when they are complex or possibly have +dependencies, even though the programmer is certain the loop will execute +correctly as a vectorized loop. The \code{simd} construct assures the compiler +that the loop can be vectorized. + +\cexample{SIMD}{1c} + +\fexample{SIMD}{1f} + + +When a function can be inlined within a loop the compiler has an opportunity to +vectorize the loop.
By guaranteeing SIMD behavior of a function's operations, +characterizing the arguments of the function and privatizing temporary +variables of the loop, the compiler can often create faster, vector code for +the loop. In the examples below the \code{declare} \code{simd} construct is +used on the \plc{add1} and \plc{add2} functions to enable creation of their +corresponding SIMD function versions for execution within the associated SIMD +loop. The functions characterize two different approaches of accessing data +within the function: by a single variable and as an element in a data array, +respectively. The \plc{add3} C function uses dereferencing. + +The \code{declare} \code{simd} constructs also illustrate the use of +\code{uniform} and \code{linear} clauses. The \code{uniform(fact)} clause +indicates that the variable \plc{fact} is invariant across the SIMD lanes. In +the \plc{add2} function \plc{a} and \plc{b} are included in the \code{uniform} +list because the C pointer and the Fortran array references are constant. The +\plc{i} index used in the \plc{add2} function is included in a \code{linear} +clause with a constant-linear-step of 1, to guarantee a unity increment of the +associated loop. In the \code{declare} \code{simd} construct for the \plc{add3} +C function the \code{linear(a,b:1)} clause instructs the compiler to generate +unit-stride loads across the SIMD lanes; otherwise, costly \emph{gather} +instructions would be generated for the unknown sequence of access of the +pointer dereferences. + +In the \code{simd} constructs for the loops the \code{private(tmp)} clause is +necessary to assure that each vector operation has its own \plc{tmp} +variable. + +\cexample{SIMD}{2c} + +\fexample{SIMD}{2f} + + +A thread that encounters a SIMD construct executes a vectorized code of the +iterations.
Similar to the concerns of a worksharing loop a loop vectorized +with a SIMD construct must assure that temporary and reduction variables are +privatized and declared as reductions with clauses. The example below +illustrates the use of \code{private} and \code{reduction} clauses in a SIMD +construct. + +\cexample{SIMD}{3c} + +\fexample{SIMD}{3f} + + +A \code{safelen(N)} clause in a \code{simd} construct assures the compiler that +there are no loop-carried dependencies for vectors of size \plc{N} or below. If +the \code{safelen} clause is not specified, then the default safelen value is +the number of loop iterations. + +The \code{safelen(16)} clause in the example below guarantees that the vector +code is safe for vectors up to and including size 16. In the loop, \plc{m} can +be 16 or greater, for correct code execution. If the value of \plc{m} is less +than 16, the behavior is undefined. + +\cexample{SIMD}{4c} + +\fexample{SIMD}{4f} + + +The following SIMD construct instructs the compiler to collapse the \plc{i} and +\plc{j} loops into a single SIMD loop in which SIMD chunks are executed by +threads of the team. Within the workshared loop chunks of a thread, the SIMD +chunks are executed in the lanes of the vector units. + +\cexample{SIMD}{5c} + +\fexample{SIMD}{5f} + + +The following examples illustrate the use of the \code{declare} \code{simd} +construct with the \code{inbranch} and \code{notinbranch} clauses. The +\code{notinbranch} clause informs the compiler that the function \plc{foo} is +never called conditionally in the SIMD loop of the function \plc{myaddint}. On +the other hand, the \code{inbranch} clause for the function goo indicates that +the function is always called conditionally in the SIMD loop inside +the function \plc{myaddfloat}. + +\cexample{SIMD}{6c} + +\fexample{SIMD}{6f} + + +In the code below, the function \plc{fib()} is called in the main program and +also recursively called in the function \plc{fib()} within an \code{if} +condition. 
The compiler creates a masked vector version and a non-masked vector +version for the function \plc{fib()} while retaining the original scalar +version of the \plc{fib()} function. + +\cexample{SIMD}{7c} + +\fexample{SIMD}{7f} + diff --git a/Examples_collapse.tex b/Examples_collapse.tex index 6fd4dfd..fd88dad 100644 --- a/Examples_collapse.tex +++ b/Examples_collapse.tex @@ -47,7 +47,8 @@ that loop is divided among the threads in the current team. An \code{ordered} clause is added to the loop construct, because an ordered region binds to the loop region arising from the loop construct. -According to \$, a thread must not execute more than one ordered region that binds +According to Section 2.12.8 of the OpenMP 4.0 specification, +a thread must not execute more than one ordered region that binds to the same loop region. So the \code{collapse} clause is required for the example to be conforming. With the \code{collapse} clause, the iterations of the \code{k} and \code{j} loops are collapsed into one loop, and therefore only one ordered diff --git a/Examples_fort_loopvar.tex b/Examples_fort_loopvar.tex index e32d107..14d988a 100644 --- a/Examples_fort_loopvar.tex +++ b/Examples_fort_loopvar.tex @@ -5,7 +5,8 @@ In general loop iteration variables will be private, when used in the \plc{do-loop} of a \code{do} and \code{parallel do} construct or in sequential loops in a -\code{parallel} construct (see \$ and \$). In the following example of a sequential +\code{parallel} construct (see Section 2.7.1 and Section 2.14.1 of +the OpenMP 4.0 specification). In the following example of a sequential loop in a \code{parallel} construct the loop iteration variable \plc{I} will be private. 
diff --git a/Examples_icv.tex b/Examples_icv.tex index c0d858f..51b7b45 100644 --- a/Examples_icv.tex +++ b/Examples_icv.tex @@ -2,7 +2,7 @@ \chapter{Internal Control Variables (ICVs)} \label{chap:icv} -According to \$, an OpenMP implementation must act as if there are ICVs that control +According to Section 2.3 of the OpenMP 4.0 specification, an OpenMP implementation must act as if there are ICVs that control the behavior of the program. This example illustrates two ICVs, \plc{nthreads-var} and \plc{max-active-levels-var}. The \plc{nthreads-var} ICV controls the number of threads requested for encountered parallel regions; there is one copy diff --git a/Examples_task_dep.tex b/Examples_task_dep.tex new file mode 100644 index 0000000..98089af --- /dev/null +++ b/Examples_task_dep.tex @@ -0,0 +1,72 @@ +\pagebreak +\chapter{Task Dependences} +\label{chap:task_dep} + +\section{Flow Dependence} + +In this example we show a simple flow dependence expressed using the \code{depend} +clause on the \code{task} construct. + +\cexample{task_dep}{1c} + +\fexample{task_dep}{1f} + +The program will always print \texttt{"}x = 2\texttt{"}, because the \code{depend} +clauses enforce the ordering of the tasks. If the \code{depend} clauses had been +omitted, then the tasks could execute in any order and the program +would have a race condition. + +\section{Anti-dependence} + +In this example we show an anti-dependence expressed using the \code{depend} +clause on the \code{task} construct. + +\cexample{task_dep}{2c} + +\fexample{task_dep}{2f} + +The program will always print \texttt{"}x = 1\texttt{"}, because the \code{depend} +clauses enforce the ordering of the tasks. If the \code{depend} clauses had been +omitted, then the tasks could execute in any order and the program would have a +race condition. + +\section{Output Dependence} + +In this example we show an output dependence expressed using the \code{depend} +clause on the \code{task} construct.
+ +\cexample{task_dep}{3c} + +\fexample{task_dep}{3f} + +The program will always print \texttt{"}x = 2\texttt{"}, because the \code{depend} +clauses enforce the ordering of the tasks. If the \code{depend} clauses had been +omitted, then the tasks could execute in any order and the program would have a +race condition. + +\section{Concurrent Execution with Dependences} + +In this example we show potentially concurrent execution of tasks using multiple +flow dependences expressed using the \code{depend} clause on the \code{task} +construct. + +\cexample{task_dep}{4c} + +\fexample{task_dep}{4f} + +The last two tasks are dependent on the first task. However there is no dependence +between the last two tasks, which may execute in any order (or concurrently if +more than one thread is available). Thus, the possible outputs are \texttt{"}x ++ 1 = 3. x + 2 = 4. \texttt{"} and \texttt{"}x + 2 = 4. x + 1 = 3. \texttt{"}. +If the \code{depend} clauses had been omitted, then all of the tasks could execute +in any order and the program would have a race condition. + +\section{Matrix multiplication} + +This example shows a task-based blocked matrix multiplication. Matrices are of +NxN elements, and the multiplication is implemented using blocks of BSxBS elements. + +\cexample{task_dep}{5c} + +\fexample{task_dep}{5f} + diff --git a/Examples_tasking.tex b/Examples_tasking.tex index f112319..e2d6b76 100644 --- a/Examples_tasking.tex +++ b/Examples_tasking.tex @@ -74,7 +74,8 @@ the task generating loop was in a tied task. \fexample{tasking}{6f} The following two examples demonstrate how the scheduling rules illustrated in -\$ affect the usage of \code{threadprivate} variables in tasks. A \code{threadprivate} +Section 2.11.3 of the OpenMP 4.0 specification affect the usage of +\code{threadprivate} variables in tasks. A \code{threadprivate} variable can be modified by another task that is executed by the same thread. 
Thus, the value of a \code{threadprivate} variable cannot be assumed to be unchanged across a task scheduling point. In untied tasks, task scheduling points may be @@ -101,7 +102,8 @@ task scheduling point. \fexample{tasking}{8f} The following two examples demonstrate how the scheduling rules illustrated in -\$ affect the usage of locks and critical sections in tasks. If a lock is held +Section 2.11.3 of the OpenMP 4.0 specification affect the usage of locks +and critical sections in tasks. If a lock is held across a task scheduling point, no attempt should be made to acquire the same lock in any code that may be interleaved. Otherwise, a deadlock is possible. @@ -186,73 +188,3 @@ are usually the opposite. \fexample{tasking}{14f} -\section*{Task Dependences} - -\section{Flow Dependence} - -In this example we show a simple flow dependence expressed using the \code{depend} -clause on the \code{task} construct. - -\cexample{tasking}{15c} - -\fexample{tasking}{15f} - -The program will always print \texttt{"}x = 2\texttt{"}, because the \code{depend} -clauses enforce the ordering of the tasks. If the \code{depend} clauses had been -omitted, then the tasks could execute in any order and the program and the program -would have a race condition. - -\section{Anti-dependence} - -In this example we show an anti-dependence expressed using the \code{depend} -clause on the \code{task} construct. - -\cexample{tasking}{16c} - -\fexample{tasking}{16f} - -The program will always print \texttt{"}x = 1\texttt{"}, because the \code{depend} -clauses enforce the ordering of the tasks. If the \code{depend} clauses had been -omitted, then the tasks could execute in any order and the program would have a -race condition. - -\section{Output Dependence} - -In this example we show an output dependence expressed using the \code{depend} -clause on the \code{task} construct. 
- -\cexample{tasking}{17c} - -\fexample{tasking}{17f} - -The program will always print \texttt{"}x = 2\texttt{"}, because the \code{depend} -clauses enforce the ordering of the tasks. If the \code{depend} clauses had been -omitted, then the tasks could execute in any order and the program would have a -race condition. - -\section{Concurrent Execution with Dependences} - -In this example we show potentially concurrent execution of tasks using multiple -flow dependences expressed using the \code{depend} clause on the \code{task} -construct. - -\cexample{tasking}{18c} - -\fexample{tasking}{18f} - -The last two tasks are dependent on the first task. However there is no dependence -between the last two tasks, which may execute in any order (or concurrently if -more than one thread is available). Thus, the possible outputs are \texttt{"}x -+ 1 = 3. x + 2 = 4. \texttt{"} and \texttt{"}x + 2 = 4. x + 1 = 3. \texttt{"}. -If the \code{depend} clauses had been omitted, then all of the tasks could execute -in any order and the program would have a race condition. - -\section{Matrix multiplication} - -This example shows a task-based blocked matrix multiplication. Matrices are of -NxN elements, and the multiplication is implemented using blocks of BSxBS elements. - -\cexample{tasking}{19c} - -\fexample{tasking}{19f} - diff --git a/Makefile b/Makefile index 4ddb3af..f472daf 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile for the OpenMP Examples document in LaTex format. # For more information, see the master document, openmp-examples.tex. 
-version=4.0.1ltx +version=4.0.2 default: openmp-examples.pdf @@ -24,6 +24,7 @@ CHAPTERS=Title_Page.tex \ Examples_fpriv_sections.tex \ Examples_single.tex \ Examples_tasking.tex \ + Examples_task_dep.tex \ Examples_taskgroup.tex \ Examples_taskyield.tex \ Examples_workshare.tex \ @@ -57,6 +58,7 @@ CHAPTERS=Title_Page.tex \ Examples_lock_owner.tex \ Examples_simple_lock.tex \ Examples_nestable_lock.tex \ + Examples_SIMD.tex \ Examples_target.tex \ Examples_target_data.tex \ Examples_target_update.tex \ diff --git a/Title_Page.tex b/Title_Page.tex index a4e557a..a6d1f58 100644 --- a/Title_Page.tex +++ b/Title_Page.tex @@ -17,14 +17,14 @@ \vspace{1.0in} - \textbf{Version 4.0.1.ltx -- February, 2014} + \textbf{Version 4.0.2 -- February, 2015} \end{center} \end{adjustwidth} \vspace{3.0in} \begin{adjustwidth}{0pt}{1em}\setlength{\parskip}{0.25\baselineskip}% -Copyright © 1997-2014 OpenMP Architecture Review Board.\\ +Copyright © 1997-2015 OpenMP Architecture Review Board.\\ Permission to copy without fee all or part of this material is granted, provided the OpenMP Architecture Review Board copyright notice and the title of this document appear. 
Notice is given that copying is by diff --git a/openmp-examples.tex b/openmp-examples.tex index d001188..194fdfb 100644 --- a/openmp-examples.tex +++ b/openmp-examples.tex @@ -48,7 +48,7 @@ \documentclass[10pt,letterpaper,twoside,makeidx,hidelinks]{scrreprt} % Text to appear in the footer on even-numbered pages: -\newcommand{\footerText}{OpenMP Examples Version 4.0.1 - February 2014} +\newcommand{\footerText}{OpenMP Examples Version 4.0.2 - February 2015} % Unified style sheet for OpenMP documents: \input{openmp.sty} @@ -91,6 +91,7 @@ \input{Examples_fpriv_sections} \input{Examples_single} \input{Examples_tasking} + \input{Examples_task_dep} \input{Examples_taskgroup} \input{Examples_taskyield} \input{Examples_workshare} @@ -124,6 +125,7 @@ \input{Examples_lock_owner} \input{Examples_simple_lock} \input{Examples_nestable_lock} + \input{Examples_SIMD} \input{Examples_target} \input{Examples_target_data} \input{Examples_target_update} diff --git a/sources/Example_SIMD.1c.c b/sources/Example_SIMD.1c.c new file mode 100644 index 0000000..64a06f9 --- /dev/null +++ b/sources/Example_SIMD.1c.c @@ -0,0 +1,14 @@ +/* +* @@name: SIMD.1c +* @@type: C +* @@compilable: yes +* @@linkable: no +* @@expect: success +*/ +void star( double *a, double *b, double *c, int n, int *ioff ) +{ + int i; + #pragma omp simd + for ( i = 0; i < n; i++ ) + a[i] *= b[i] * c[i+ *ioff]; +} diff --git a/sources/Example_SIMD.1f.f b/sources/Example_SIMD.1f.f new file mode 100644 index 0000000..5d4073a --- /dev/null +++ b/sources/Example_SIMD.1f.f @@ -0,0 +1,17 @@ +! @@name: SIMD.1f +! @@type: F-free +! @@compilable: yes +! @@linkable: no +! 
@@expect: success +subroutine star(a,b,c,n,ioff_ptr) + implicit none + double precision :: a(*),b(*),c(*) + integer :: n, i + integer, pointer :: ioff_ptr + + !$omp simd + do i = 1,n + a(i) = a(i) * b(i) * c(i+ioff_ptr) + end do + +end subroutine diff --git a/sources/Example_SIMD.2c.c b/sources/Example_SIMD.2c.c new file mode 100644 index 0000000..d42014c --- /dev/null +++ b/sources/Example_SIMD.2c.c @@ -0,0 +1,62 @@ +/* +* @@name: SIMD.2c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include + +#pragma omp declare simd uniform(fact) +double add1(double a, double b, double fact) +{ + double c; + c = a + b + fact; + return c; +} + +#pragma omp declare simd uniform(a,b,fact) linear(i:1) +double add2(double *a, double *b, int i, double fact) +{ + double c; + c = a[i] + b[i] + fact; + return c; +} + +#pragma omp declare simd uniform(fact) linear(a,b:1) +double add3(double *a, double *b, double fact) +{ + double c; + c = *a + *b + fact; + return c; +} + +void work( double *a, double *b, int n ) +{ + int i; + double tmp; + #pragma omp simd private(tmp) + for ( i = 0; i < n; i++ ) { + tmp = add1( a[i], b[i], 1.0); + a[i] = add2( a, b, i, 1.0) + tmp; + a[i] = add3(&a[i], &b[i], 1.0); + } +} + +int main(){ + int i; + const int N=32; + double a[N], b[N]; + + for ( i=0; i y[i]) ? goo(&y[i]) : y[i]; + /* goo is called under the condition (or within a branch) */ + } + return x[n-1]; +} diff --git a/sources/Example_SIMD.6f.f b/sources/Example_SIMD.6f.f new file mode 100644 index 0000000..5cc5340 --- /dev/null +++ b/sources/Example_SIMD.6f.f @@ -0,0 +1,54 @@ +! @@name: SIMD.6f +! @@type: F-free +! @@compilable: yes +! @@linkable: no +! 
@@expect: success +function foo(p) result(r) +!$omp declare simd(foo) notinbranch + implicit none + integer :: p, r + p = p + 10 + r = p +end function foo + +function myaddint(int *a, int *b, int n) result(r) + implicit none + integer :: a(*), b(*), n, r + integer :: i + integer, external :: foo + + !$omp simd + do i=1, n + a(i) = foo(b[i]) ! foo is not called under a condition + end do + r = a(n) + +end function myaddint + +function goo(p) result(r) +!$omp declare simd(goo) inbranch + implicit none + real :: p, r + p = p + 18.5 + r = p +end function goo + +function myaddfloat(x, y, n) result(r) + implicit none + real :: x(*), y(*), r + integer :: n + integer :: i + real, external :: goo + + !$omp simd + do i=1, n + if (x(i) > y(i)) then + x(i) = goo(y(i)) + ! goo is called under the condition (or within a branch) + else + x(i) = y(i) + endif + end do + + r = x(n) +end function myaddfloat diff --git a/sources/Example_SIMD.7c.c b/sources/Example_SIMD.7c.c new file mode 100644 index 0000000..e083186 --- /dev/null +++ b/sources/Example_SIMD.7c.c @@ -0,0 +1,37 @@ +/* +* @@name: SIMD.7c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include +#include + +#define N 45 +int a[N], b[N], c[N]; + +#pragma omp declare simd inbranch +int fib( int n ) +{ + if (n <= 2) + return n; + else { + return fib(n-1) + fib(n-2); + } +} + +int main(void) +{ + int i; + + #pragma omp simd + for (i=0; i < N; i++) b[i] = i; + + #pragma omp simd + for (i=0; i < N; i++) { + a[i] = fib(b[i]); + } + printf("Done a[%d] = %d\n", N-1, a[N-1]); + return 0; +} diff --git a/sources/Example_SIMD.7f.f b/sources/Example_SIMD.7f.f new file mode 100644 index 0000000..212735f --- /dev/null +++ b/sources/Example_SIMD.7f.f @@ -0,0 +1,38 @@ +! @@name: SIMD.7f +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! 
@@expect: success +program fibonacci + implicit none + integer,parameter :: N=45 + integer :: a(0:N-1), b(0:N-1) + integer :: i + integer, external :: fib + + !$omp simd + do i = 0,N-1 + b(i) = i + end do + + !$omp simd + do i=0,N-1 + a(i) = fib(b(i)) + end do + + write(*,*) "Done a(", N-1, ") = ", a(N-1) + ! 44 1134903168 +end program + +recursive function fib(n) result(r) +!$omp declare simd(fib) inbranch + implicit none + integer :: n, r + + if (n <= 2) then + r = n + else + r = fib(n-1) + fib(n-2) + endif + +end function fib diff --git a/sources/Example_SIMD.8c.c b/sources/Example_SIMD.8c.c new file mode 100644 index 0000000..ce00d68 --- /dev/null +++ b/sources/Example_SIMD.8c.c @@ -0,0 +1,48 @@ +/* +* @@name: SIMD.8c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include +#include + +int P[1000]; +float A[1000]; + +float do_work(float *arr) +{ + float pri; +#pragma omp simd lastprivate(pri) + for (int i = 0; i < 999; ++i) { + int j = P[i]; + + pri = 0.5f; + if (j % 2 == 0) { + pri = A[j+1] + arr[i]; + } + A[j] = pri * 1.5f; + pri = pri + A[j]; + } + return pri; +} + +int main(void) +{ + float pri, arr[1000]; + + for (int i = 0; i < 1000; ++i) { + P[i] = i; + A[i] = i * 1.5f; + arr[i] = i * 1.8f; + } + pri = do_work(&arr[0]); + if (pri == 8237.25) { + printf("passed: result pri = %7.2f (8237.25) \n", pri); + } + else { + printf("failed: result pri = %7.2f (8237.25) \n", pri); + } + return 0; +} diff --git a/sources/Example_SIMD.8f.f b/sources/Example_SIMD.8f.f new file mode 100644 index 0000000..885d811 --- /dev/null +++ b/sources/Example_SIMD.8f.f @@ -0,0 +1,54 @@ +! @@name: SIMD.8f +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! 
@@expect: success +module work + +integer :: P(1000) +real :: A(1000) + +contains +function do_work(arr) result(pri) + implicit none + real, dimension(*) :: arr + + real :: pri + integer :: i, j + + !$omp simd private(j) lastprivate(pri) + do i = 1, 999 + j = P(i) + + pri = 0.5 + if (mod(j-1, 2) == 0) then + pri = A(j+1) + arr(i) + endif + A(j) = pri * 1.5 + pri = pri + A(j) + end do + +end function do_work + +end module work + +program simd_8f + use work + implicit none + real :: pri, arr(1000) + integer :: i + + do i = 1, 1000 + P(i) = i + A(i) = (i-1) * 1.5 + arr(i) = (i-1) * 1.8 + end do + pri = do_work(arr) + if (pri == 8237.25) then + print 2, "passed", pri + else + print 2, "failed", pri + endif +2 format(a, ": result pri = ", f7.2, " (8237.25)") + +end program diff --git a/sources/Example_array_sections.4c.c b/sources/Example_array_sections.4c.c index 1147583..6d1774e 100644 --- a/sources/Example_array_sections.4c.c +++ b/sources/Example_array_sections.4c.c @@ -7,7 +7,7 @@ */ void foo () { - int A[30]; + int A[30], *p; #pragma omp target data map( A[0:10] ) { p = &A[0]; diff --git a/sources/Example_async_target.1f.f b/sources/Example_async_target.1f.f index 926c175..5961015 100644 --- a/sources/Example_async_target.1f.f +++ b/sources/Example_async_target.1f.f @@ -33,6 +33,7 @@ end interface !$omp end task end do + !$omp taskwait print*, z end subroutine pipedF diff --git a/sources/Example_async_target.2c.c b/sources/Example_async_target.2c.c index c4c6c3d..5b65b7a 100644 --- a/sources/Example_async_target.2c.c +++ b/sources/Example_async_target.2c.c @@ -6,12 +6,15 @@ * @@expect: success */ #include +#include +#pragma omp declare target extern void init(float *, float *, int); +#pragma omp end declare target +extern void foo(); extern void output(float *, int); void vec_mult(float *p, float *v1, float *v2, int N, int dev) { int i; - init(p, N); #pragma omp task depend(out: v1, v2) #pragma omp target device(dev) map(v1, v2) { @@ -20,7 +23,7 @@ void 
vec_mult(float *p, float *v1, float *v2, int N, int dev) abort(); v1 = malloc(N*sizeof(float)); v2 = malloc(N*sizeof(float)); - init(v1,v2); + init(v1, v2, N); } foo(); // execute other work asychronously #pragma omp task depend(in: v1, v2) @@ -32,8 +35,8 @@ void vec_mult(float *p, float *v1, float *v2, int N, int dev) #pragma omp parallel for for (i=0; i +#include + +#define N 10000 + +extern void causes_an_exception(); +extern void phase_1(); +extern void phase_2(); + void example() { std::exception *ex = NULL; #pragma omp parallel shared(ex) @@ -15,7 +24,7 @@ void example() { try { causes_an_exception(); } - catch (const std::exception *e) { + catch (std::exception *e) { // still must remember exception for later handling #pragma omp atomic write ex = e; diff --git a/sources/Example_cancellation.1f.f b/sources/Example_cancellation.1f.f index c7543c1..ed94cae 100644 --- a/sources/Example_cancellation.1f.f +++ b/sources/Example_cancellation.1f.f @@ -12,7 +12,7 @@ subroutine example(n, dim) ! ... !$omp do private(s, B) do i=1, n -!$omp cancellation point +!$omp cancellation point do allocate(B(dim(i)), stat=s) if (s .gt. 
0) then !$omp atomic write diff --git a/sources/Example_cancellation.2c.c b/sources/Example_cancellation.2c.c index 1db99bd..bb29c6f 100644 --- a/sources/Example_cancellation.2c.c +++ b/sources/Example_cancellation.2c.c @@ -5,6 +5,11 @@ * @@linkable: no * @@expect: success */ +typedef struct binary_tree_s { + int value; + struct binary_tree_s *left, *right; +} binary_tree_t; + binary_tree_t *search_tree(binary_tree_t *tree, int value, int level) { binary_tree_t *found = NULL; if (tree) { diff --git a/sources/Example_cancellation.2f.f b/sources/Example_cancellation.2f.f index e0c3c0c..f9132aa 100644 --- a/sources/Example_cancellation.2f.f +++ b/sources/Example_cancellation.2f.f @@ -15,22 +15,18 @@ contains type(binary_tree), intent(in), pointer :: tree integer, intent(in) :: value, level type(binary_tree), pointer :: found - type(binary_tree), pointer :: found_left => NULL(), & - found_right => NULL() - - if (.not. associated(found)) then - allocate(found) - endif + type(binary_tree), pointer :: found_left => NULL(), found_right => NULL() if (associated(tree)) then if (tree%value .eq. 
value) then - found = tree + found => tree else !$omp task shared(found) if(level<10) call search_tree(tree%left, value, level+1, found_left) if (associated(found_left)) then -!$omp atomic write - found = found_left +!$omp critical + found => found_left +!$omp end critical !$omp cancel taskgroup endif @@ -39,8 +35,9 @@ contains !$omp task shared(found) if(level<10) call search_tree(tree%right, value, level+1, found_right) if (associated(found_right)) then -!$omp atomic write - found = found_right +!$omp critical + found => found_right +!$omp end critical !$omp cancel taskgroup endif @@ -56,9 +53,7 @@ contains integer, intent(in) :: value type(binary_tree), pointer :: found - if (associated(found)) then - allocate(found) - endif + found => NULL() !$omp parallel shared(found, tree, value) !$omp master !$omp taskgroup diff --git a/sources/Example_declare_target.2c.c b/sources/Example_declare_target.2c.c index 35bef88..f151cd8 100644 --- a/sources/Example_declare_target.2c.c +++ b/sources/Example_declare_target.2c.c @@ -8,12 +8,13 @@ struct typeX { int a; -} +}; class typeY { - int foo() { return a^0x01;} int a; -} + public: + int foo() { return a^0x01;} +}; #pragma omp declare target struct typeX varX; // ok class typeY varY; // ok if varY.foo() not called on target device diff --git a/sources/Example_device.1c.c b/sources/Example_device.1c.c index ced1e8f..b719f50 100644 --- a/sources/Example_device.1c.c +++ b/sources/Example_device.1c.c @@ -16,7 +16,7 @@ extern void init_vars(float *, float *, int); extern void output(float *, int); void foo() { - N = init_vars(&p, &v1, &v2); + init_vars(v1, v2, N); #pragma omp target device(42) map(p[:N], v1[:N], v2[:N]) { vec_mult(p, v1, v2, N); @@ -26,7 +26,7 @@ void foo() void vec_mult(float *p, float *v1, float *v2, int N) { int i; - int nthreads = omp_is_initial_device() ? 
8 : 1024; + int nthreads; if (!omp_is_initial_device()) { printf("1024 threads on target device\n"); diff --git a/sources/Example_target_data.3c.c b/sources/Example_target_data.3c.c index e389c53..85d9514 100644 --- a/sources/Example_target_data.3c.c +++ b/sources/Example_target_data.3c.c @@ -6,8 +6,10 @@ * @@expect: success */ #include -void gramSchmidt(restrict float Q[][COLS], const int rows, const int cols) +#define COLS 100 +void gramSchmidt(float Q[][COLS], const int rows) { + int cols = COLS; #pragma omp target data map(Q[0:rows][0:cols]) for(int k=0; k < cols; k++) { diff --git a/sources/Example_target_data.4c.c b/sources/Example_target_data.4c.c index e2de896..11574ce 100644 --- a/sources/Example_target_data.4c.c +++ b/sources/Example_target_data.4c.c @@ -19,6 +19,7 @@ void foo(float *p0, float *v1, float *v2, int N) } void vec_mult(float *p1, float *v3, float *v4, int N) { + int i; #pragma omp target map(to: v3[0:N], v4[:N]) map(from: p1[0:N]) #pragma omp parallel for for (i=0; i +int main() +{ + int x = 1; + #pragma omp parallel + #pragma omp single + { + #pragma omp task shared(x) depend(out: x) + x = 2; + #pragma omp task shared(x) depend(in: x) + printf("x = %d\n", x); + } + return 0; +} diff --git a/sources/Example_task_dep.1f.f b/sources/Example_task_dep.1f.f new file mode 100644 index 0000000..356d553 --- /dev/null +++ b/sources/Example_task_dep.1f.f @@ -0,0 +1,19 @@ +! @@name: task_dep.1f +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! 
@@expect: success +program example + integer :: x + x = 1 + !$omp parallel + !$omp single + !$omp task shared(x) depend(out: x) + x = 2 + !$omp end task + !$omp task shared(x) depend(in: x) + print*, "x = ", x + !$omp end task + !$omp end single + !$omp end parallel +end program diff --git a/sources/Example_task_dep.2c.c b/sources/Example_task_dep.2c.c new file mode 100644 index 0000000..7bb082d --- /dev/null +++ b/sources/Example_task_dep.2c.c @@ -0,0 +1,21 @@ +/* +* @@name: task_dep.2c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include +int main() +{ + int x = 1; + #pragma omp parallel + #pragma omp single + { + #pragma omp task shared(x) depend(in: x) + printf("x = %d\n", x); + #pragma omp task shared(x) depend(out: x) + x = 2; + } + return 0; +} diff --git a/sources/Example_task_dep.2f.f b/sources/Example_task_dep.2f.f new file mode 100644 index 0000000..445785d --- /dev/null +++ b/sources/Example_task_dep.2f.f @@ -0,0 +1,19 @@ +! @@name: task_dep.2f +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! 
@@expect: success +program example + integer :: x + x = 1 + !$omp parallel + !$omp single + !$omp task shared(x) depend(in: x) + print*, "x = ", x + !$omp end task + !$omp task shared(x) depend(out: x) + x = 2 + !$omp end task + !$omp end single + !$omp end parallel +end program diff --git a/sources/Example_task_dep.3c.c b/sources/Example_task_dep.3c.c new file mode 100644 index 0000000..e3147d4 --- /dev/null +++ b/sources/Example_task_dep.3c.c @@ -0,0 +1,23 @@ +/* +* @@name: task_dep.3c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include +int main() +{ + int x; + #pragma omp parallel + #pragma omp single + { + #pragma omp task shared(x) depend(out: x) + x = 1; + #pragma omp task shared(x) depend(out: x) + x = 2; + #pragma omp taskwait + printf("x = %d\n", x); + } + return 0; +} diff --git a/sources/Example_task_dep.3f.f b/sources/Example_task_dep.3f.f new file mode 100644 index 0000000..dea2b79 --- /dev/null +++ b/sources/Example_task_dep.3f.f @@ -0,0 +1,20 @@ +! @@name: task_dep.3f +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! @@expect: success +program example + integer :: x + !$omp parallel + !$omp single + !$omp task shared(x) depend(out: x) + x = 1 + !$omp end task + !$omp task shared(x) depend(out: x) + x = 2 + !$omp end task + !$omp taskwait + print*, "x = ", x + !$omp end single + !$omp end parallel +end program diff --git a/sources/Example_task_dep.4c.c b/sources/Example_task_dep.4c.c new file mode 100644 index 0000000..f476e9e --- /dev/null +++ b/sources/Example_task_dep.4c.c @@ -0,0 +1,23 @@ +/* +* @@name: task_dep.4c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include +int main() +{ + int x = 1; + #pragma omp parallel + #pragma omp single + { + #pragma omp task shared(x) depend(out: x) + x = 2; + #pragma omp task shared(x) depend(in: x) + printf("x + 1 = %d. 
", x+1); + #pragma omp task shared(x) depend(in: x) + printf("x + 2 = %d\n", x+2); + } + return 0; +} diff --git a/sources/Example_task_dep.4f.f b/sources/Example_task_dep.4f.f new file mode 100644 index 0000000..9106625 --- /dev/null +++ b/sources/Example_task_dep.4f.f @@ -0,0 +1,22 @@ +! @@name: task_dep.4f +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! @@expect: success +program example + integer :: x + x = 1 + !$omp parallel + !$omp single + !$omp task shared(x) depend(out: x) + x = 2 + !$omp end task + !$omp task shared(x) depend(in: x) + print*, "x + 1 = ", x+1, "." + !$omp end task + !$omp task shared(x) depend(in: x) + print*, "x + 2 = ", x+2, "." + !$omp end task + !$omp end single + !$omp end parallel +end program diff --git a/sources/Example_task_dep.5c.c b/sources/Example_task_dep.5c.c new file mode 100644 index 0000000..d40657b --- /dev/null +++ b/sources/Example_task_dep.5c.c @@ -0,0 +1,25 @@ +/* +* @@name: task_dep.5c +* @@type: C +* @@compilable: yes +* @@linkable: no +* @@expect: success +*/ +// Assume BS divides N perfectly +void matmul_depend(int N, int BS, float A[N][N], float B[N][N], float +C[N][N] ) +{ + int i, j, k, ii, jj, kk; + for (i = 0; i < N; i+=BS) { + for (j = 0; j < N; j+=BS) { + for (k = 0; k < N; k+=BS) { +#pragma omp task depend ( in: A[i:BS][k:BS], B[k:BS][j:BS] ) \ + depend ( inout: C[i:BS][j:BS] ) + for (ii = i; ii < i+BS; ii++ ) + for (jj = j; jj < j+BS; jj++ ) + for (kk = k; kk < k+BS; kk++ ) + C[ii][jj] = C[ii][jj] + A[ii][kk] * B[kk][jj]; + } + } + } +} diff --git a/sources/Example_task_dep.5f.f b/sources/Example_task_dep.5f.f new file mode 100644 index 0000000..466f38e --- /dev/null +++ b/sources/Example_task_dep.5f.f @@ -0,0 +1,27 @@ +! @@name: task_dep.5f +! @@type: F-free +! @@compilable: yes +! @@linkable: no +! 
@@expect: success +subroutine matmul_depend (N, BS, A, B, C) + integer :: N, BS, BM + real, dimension(N, N) :: A, B, C + integer :: i, j, k, ii, jj, kk + BM = BS -1 + do i = 1, N, BS + do j = 1, N, BS + do k = 1, N, BS +!$omp task depend ( in: A(i:i+BM, k:k+BM), B(k:k+BM, j:j+BM) ) & +!$omp depend ( inout: C(i:i+BM, j:j+BM) ) + do ii = i, i+BS + do jj = j, j+BS + do kk = k, k+BS + C(jj,ii) = C(jj,ii) + A(kk,ii) * B(jj,kk) + end do + end do + end do +!$omp end task + end do + end do + end do +end subroutine diff --git a/sources/Example_tasking.7f.f b/sources/Example_tasking.7f.f index 3962002..3b612cb 100644 --- a/sources/Example_tasking.7f.f +++ b/sources/Example_tasking.7f.f @@ -9,7 +9,6 @@ integer var contains subroutine work - use globals !$omp task ! do work here !$omp task diff --git a/sources/Example_teams.2f.f b/sources/Example_teams.2f.f index c36dcd3..2777d51 100644 --- a/sources/Example_teams.2f.f +++ b/sources/Example_teams.2f.f @@ -10,7 +10,7 @@ implicit none sum = 0.0e0 !$omp target map(to: B, C) !$omp teams num_teams(num_teams) thread_limit(block_threads) & - reduction(+:sum) + !$omp& reduction(+:sum) !$omp distribute do i0=1,N, block_size !$omp parallel do reduction(+:sum) diff --git a/sources/Example_teams.4c.c b/sources/Example_teams.4c.c index fcafa10..acdf752 100644 --- a/sources/Example_teams.4c.c +++ b/sources/Example_teams.4c.c @@ -6,7 +6,7 @@ * @@expect: success */ #define N 1024*1024 -float dotprod(float B[], float C[], int N) +float dotprod(float B[], float C[]) { float sum = 0; int i;