mirror of
https://github.com/OpenMP/Examples.git
synced 2025-04-11 00:42:12 +01:00
Compare commits
12 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
415024c369 | ||
![]() |
00bdf88b63 | ||
![]() |
3346a30ce2 | ||
![]() |
11f2efcccf | ||
![]() |
075683d574 | ||
![]() |
08859e6029 | ||
![]() |
03b9a00df9 | ||
![]() |
a5e3d8b3f2 | ||
![]() |
fb0edc81e7 | ||
![]() |
60e8ece384 | ||
![]() |
3052c10566 | ||
![]() |
eaec9ede64 |
@ -1,3 +1,8 @@
|
||||
[02-Feb-2018] Note
|
||||
This "Changes.log" is no longer updated. Please use History.tex and
|
||||
the git log messages for changes.
|
||||
|
||||
|
||||
[20-May-2016] Version 4.5.0
|
||||
Changes from 4.0.2ltx
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
\pagebreak
|
||||
\chapter{SIMD}
|
||||
\cchapter{SIMD}{SIMD}
|
||||
\label{chap:simd}
|
||||
|
||||
Single instruction, multiple data (SIMD) is a form of parallel execution
|
||||
@ -9,34 +8,34 @@ The addition of two vectors to form a third vector is a SIMD operation.
|
||||
Many processors have SIMD (vector) units that can perform simultaneously
|
||||
2, 4, 8 or more executions of the same operation (by a single SIMD unit).
|
||||
|
||||
Loops without loop-carried backward dependency (or with dependency preserved using
|
||||
ordered simd) are candidates for vectorization by the compiler for
|
||||
Loops without loop-carried backward dependences (or with dependences preserved using
|
||||
\kcode{ordered simd}) are candidates for vectorization by the compiler for
|
||||
execution with SIMD units. In addition, with state-of-the-art vectorization
|
||||
technology and \code{declare simd} construct extensions for function vectorization
|
||||
technology and \kcode{declare simd} directive extensions for function vectorization
|
||||
in the OpenMP 4.5 specification, loops with function calls can be vectorized as well.
|
||||
The basic idea is that a scalar function call in a loop can be replaced by a vector version
|
||||
of the function, and the loop can be vectorized simultaneously by combining a loop
|
||||
vectorization (\code{simd} directive on the loop) and a function
|
||||
vectorization (\code{declare simd} directive on the function).
|
||||
vectorization (\kcode{simd} directive on the loop) and a function
|
||||
vectorization (\kcode{declare simd} directive on the function).
|
||||
|
||||
A \code{simd} construct states that SIMD operations be performed on the
|
||||
A \kcode{simd} construct states that SIMD operations be performed on the
|
||||
data within the loop. A number of clauses are available to provide
|
||||
data-sharing attributes (\code{private}, \code{linear}, \code{reduction} and
|
||||
\code{lastprivate}). Other clauses provide vector length preference/restrictions
|
||||
(\code{simdlen} / \code{safelen}), loop fusion (\code{collapse}), and data
|
||||
alignment (\code{aligned}).
|
||||
data-sharing attributes (\kcode{private}, \kcode{linear}, \kcode{reduction} and
|
||||
\kcode{lastprivate}). Other clauses provide vector length preference/restrictions
|
||||
(\kcode{simdlen} / \kcode{safelen}), loop fusion (\kcode{collapse}), and data
|
||||
alignment (\kcode{aligned}).
|
||||
|
||||
The \code{declare simd} directive designates
|
||||
The \kcode{declare simd} directive designates
|
||||
that a vector version of the function should also be constructed for
|
||||
execution within loops that contain the function and have a \code{simd}
|
||||
directive. Clauses provide argument specifications (\code{linear},
|
||||
\code{uniform}, and \code{aligned}), a requested vector length
|
||||
(\code{simdlen}), and designate whether the function is always/never
|
||||
called conditionally in a loop (\code{branch}/\code{inbranch}).
|
||||
The latter is for optimizing peformance.
|
||||
execution within loops that contain the function and have a \kcode{simd}
|
||||
directive. Clauses provide argument specifications (\kcode{linear},
|
||||
\kcode{uniform}, and \kcode{aligned}), a requested vector length
|
||||
(\kcode{simdlen}), and designate whether the function is always/never
|
||||
called conditionally in a loop (\kcode{notinbranch}/\kcode{inbranch}).
|
||||
The latter is for optimizing performance.
|
||||
|
||||
Also, the \code{simd} construct has been combined with the worksharing loop
|
||||
constructs (\code{for simd} and \code{do simd}) to enable simultaneous thread
|
||||
Also, the \kcode{simd} construct has been combined with the worksharing loop
|
||||
constructs (\kcode{for simd} and \kcode{do simd}) to enable simultaneous thread
|
||||
execution in different SIMD units.
|
||||
%Hence, the \code{simd} construct can be
|
||||
%used alone on a loop to direct vectorization (SIMD execution), or in
|
||||
@ -46,3 +45,8 @@ execution in different SIMD units.
|
||||
%\code{parallel for simd}).
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{SIMD/SIMD}
|
||||
\input{SIMD/linear_modifier}
|
||||
|
||||
|
||||
|
@ -1,9 +1,8 @@
|
||||
\pagebreak
|
||||
\chapter{OpenMP Affinity}
|
||||
\cchapter{OpenMP Affinity}{affinity}
|
||||
\label{chap:openmp_affinity}
|
||||
|
||||
OpenMP Affinity consists of a \code{proc\_bind} policy (thread affinity policy) and a specification of
|
||||
places (\texttt{"}location units\texttt{"} or \plc{processors} that may be cores, hardware
|
||||
OpenMP Affinity consists of a \kcode{proc_bind} policy (thread affinity policy) and a specification of
|
||||
places (``location units'' or \plc{processors} that may be cores, hardware
|
||||
threads, sockets, etc.).
|
||||
OpenMP Affinity enables users to bind computations to specific places.
|
||||
The placement will hold for the duration of the parallel region.
|
||||
@ -12,13 +11,13 @@ to different cores (hardware threads, sockets, etc.) prescribed within a given p
|
||||
if two or more cores (hardware threads, sockets, etc.) have been assigned to a given place.
|
||||
|
||||
Often the binding can be managed without resorting to explicitly setting places.
|
||||
Without the specification of places in the \code{OMP\_PLACES} variable,
|
||||
Without the specification of places in the \kcode{OMP_PLACES} variable,
|
||||
the OpenMP runtime will distribute and bind threads using the entire range of processors for
|
||||
the OpenMP program, according to the \code{OMP\_PROC\_BIND} environment variable
|
||||
or the \code{proc\_bind} clause. When places are specified, the OMP runtime
|
||||
the OpenMP program, according to the \kcode{OMP_PROC_BIND} environment variable
|
||||
or the \kcode{proc_bind} clause. When places are specified, the OMP runtime
|
||||
binds threads to the places according to a default distribution policy, or
|
||||
those specified in the \code{OMP\_PROC\_BIND} environment variable or the
|
||||
\code{proc\_bind} clause.
|
||||
those specified in the \kcode{OMP_PROC_BIND} environment variable or the
|
||||
\kcode{proc_bind} clause.
|
||||
|
||||
In the OpenMP Specifications document a processor refers to an execution unit that
|
||||
is enabled for an OpenMP thread to use. A processor is a core when there is
|
||||
@ -27,12 +26,12 @@ SMT is enabled, a processor is a hardware thread (HW-thread). (This is the
|
||||
usual case; but actually, the execution unit is implementation defined.) Processor
|
||||
numbers are numbered sequentially from 0 to the number of cores less one (without SMT), or
|
||||
0 to the number of HW-threads less one (with SMT). OpenMP places use the processor number to designate
|
||||
binding locations (unless an \texttt{"}abstract name\texttt{"} is used.)
|
||||
binding locations (unless an ``abstract name'' is used).
|
||||
|
||||
|
||||
The processors available to a process may be a subset of the system's
|
||||
processors. This restriction may be the result of a
|
||||
wrapper process controlling the execution (such as \code{numactl} on Linux systems),
|
||||
wrapper process controlling the execution (such as \plc{numactl} on Linux systems),
|
||||
compiler options, library-specific environment variables, or default
|
||||
kernel settings. For instance, the execution of multiple MPI processes,
|
||||
launched on a single compute node, will each have a subset of processors as
|
||||
@ -53,21 +52,21 @@ variables for the MPI library. %Forked threads within an MPI process
|
||||
%which sets \code{OMP\_PLACES} specifically for the MPI process.
|
||||
|
||||
Threads of a team are positioned onto places in a compact manner, a
|
||||
scattered distribution, or onto the master's place, by setting the
|
||||
\code{OMP\_PROC\_BIND} environment variable or the \code{proc\_bind} clause to
|
||||
\plc{close}, \plc{spread}, or \plc{master}, respectively. When
|
||||
\code{OMP\_PROC\_BIND} is set to FALSE no binding is enforced; and
|
||||
scattered distribution, or onto the primary thread's place, by setting the
|
||||
\kcode{OMP_PROC_BIND} environment variable or the \kcode{proc_bind} clause to
|
||||
\kcode{close}, \kcode{spread}, or \kcode{primary} (\kcode{master} has been deprecated), respectively. When
|
||||
\kcode{OMP_PROC_BIND} is set to FALSE no binding is enforced; and
|
||||
when the value is TRUE, the binding is implementation defined to
|
||||
a set of places in the \code{OMP\_PLACES} variable or to places
|
||||
defined by the implementation if the \code{OMP\_PLACES} variable
|
||||
is not set.
|
||||
a set of places in the \kcode{OMP_PLACES} variable or to places
|
||||
defined by the implementation if the \kcode{OMP_PLACES} variable
|
||||
is not set.
|
||||
|
||||
The \code{OMP\_PLACES} variable can also be set to an abstract name
|
||||
(\plc{threads}, \plc{cores}, \plc{sockets}) to specify that a place is
|
||||
The \kcode{OMP_PLACES} variable can also be set to an abstract name
|
||||
(\kcode{threads}, \kcode{cores}, \kcode{sockets}) to specify that a place is
|
||||
either a single hardware thread, a core, or a socket, respectively.
|
||||
This description of the \code{OMP\_PLACES} is most useful when the
|
||||
This description of the \kcode{OMP_PLACES} is most useful when the
|
||||
number of threads is equal to the number of hardware threads, cores
|
||||
or sockets. It can also be used with a \plc{close} or \plc{spread}
|
||||
or sockets. It can also be used with a \kcode{close} or \kcode{spread}
|
||||
distribution policy when the equality doesn't hold.
|
||||
|
||||
|
||||
@ -116,3 +115,11 @@ distribution policy when the equality doesn't hold.
|
||||
% thread # 0 * * * * _ _ _ _ _ _ _ _ #mask for thread 0
|
||||
% thread # 0 _ _ _ _ * * * * _ _ _ _ #mask for thread 1
|
||||
% thread # 0 _ _ _ _ _ _ _ _ * * * * #mask for thread 2
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{affinity/affinity}
|
||||
\input{affinity/task_affinity}
|
||||
\input{affinity/affinity_display}
|
||||
\input{affinity/affinity_query}
|
||||
|
||||
|
@ -1,13 +1,12 @@
|
||||
\pagebreak
|
||||
\chapter{Data Environment}
|
||||
\cchapter{Data Environment}{data_environment}
|
||||
\label{chap:data_environment}
|
||||
The OpenMP \plc{data environment} contains data attributes of variables and
|
||||
objects. Many constructs (such as \code{parallel}, \code{simd}, \code{task})
|
||||
objects. Many constructs (such as \kcode{parallel}, \kcode{simd}, \kcode{task})
|
||||
accept clauses to control \plc{data-sharing} attributes
|
||||
of referenced variables in the construct, where \plc{data-sharing} applies to
|
||||
whether the attribute of the variable is \plc{shared},
|
||||
is \plc{private} storage, or has special operational characteristics
|
||||
(as found in the \code{firstprivate}, \code{lastprivate}, \code{linear}, or \code{reduction} clause).
|
||||
(as found in the \kcode{firstprivate}, \kcode{lastprivate}, \kcode{linear}, or \kcode{reduction} clause).
|
||||
|
||||
The data environment for a device (distinguished as a \plc{device data environment})
|
||||
is controlled on the host by \plc{data-mapping} attributes, which determine the
|
||||
@ -22,15 +21,15 @@ Data-sharing attributes of variables can be classified as being \plc{predetermin
|
||||
|
||||
Certain variables and objects have predetermined attributes.
|
||||
A commonly found case is the loop iteration variable in associated loops
|
||||
of a \code{for} or \code{do} construct. It has a private data-sharing attribute.
|
||||
Variables with predetermined data-sharing attributes can not be listed in a data-sharing clause; but there are some
|
||||
of a \kcode{for} or \kcode{do} construct. It has a private data-sharing attribute.
|
||||
Variables with predetermined data-sharing attributes cannot be listed in a data-sharing clause; but there are some
|
||||
exceptions (mainly concerning loop iteration variables).
|
||||
|
||||
Variables with explicitly determined data-sharing attributes are those that are
|
||||
referenced in a given construct and are listed in a data-sharing attribute
|
||||
clause on the construct. Some of the common data-sharing clauses are:
|
||||
\code{shared}, \code{private}, \code{firstprivate}, \code{lastprivate},
|
||||
\code{linear}, and \code{reduction}. % Are these all of them?
|
||||
\kcode{shared}, \kcode{private}, \kcode{firstprivate}, \kcode{lastprivate},
|
||||
\kcode{linear}, and \kcode{reduction}. % Are these all of them?
|
||||
|
||||
Variables with implicitly determined data-sharing attributes are those
|
||||
that are referenced in a given construct, do not have predetermined
|
||||
@ -38,38 +37,59 @@ data-sharing attributes, and are not listed in a data-sharing
|
||||
attribute clause of an enclosing construct.
|
||||
For a complete list of variables and objects with predetermined and
|
||||
implicitly determined attributes, please refer to the
|
||||
\plc{Data-sharing Attribute Rules for Variables Referenced in a Construct}
|
||||
\docref{Data-sharing Attribute Rules for Variables Referenced in a Construct}
|
||||
subsection of the OpenMP Specifications document.
|
||||
|
||||
\bigskip
|
||||
DATA-MAPPING ATTRIBUTES
|
||||
|
||||
The \code{map} clause on a device construct explictly specifies how the list items in
|
||||
The \kcode{map} clause on a device construct explicitly specifies how the list items in
|
||||
the clause are mapped from the encountering task's data environment (on the host)
|
||||
to the corresponding item in the device data environment (on the device).
|
||||
The common \plc{list items} are arrays, array sections, scalars, pointers, and
|
||||
structure elements (members).
|
||||
|
||||
Procedures and global variables have predetermined data mapping if they appear
|
||||
within the list or block of a \code{declare target} directive. Also, a C/C++ pointer
|
||||
within the list or block of a \kcode{declare target} directive. Also, a C/C++ pointer
|
||||
is mapped as a zero-length array section, as is a C++ variable that is a reference to a pointer.
|
||||
% Waiting for response from Eric on this.
|
||||
|
||||
Without explict mapping, non-scalar and non-pointer variables within the scope of the \code{target}
|
||||
construct are implicitly mapped with a \plc{map-type} of \code{tofrom}.
|
||||
Without explicit mapping, scalar variables within the scope of the \code{target}
|
||||
Without explicit mapping, non-scalar and non-pointer variables within the scope of the \kcode{target}
|
||||
construct are implicitly mapped with a \plc{map-type} of \kcode{tofrom}.
|
||||
Without explicit mapping, scalar variables within the scope of the \kcode{target}
|
||||
construct are not mapped, but have an implicit firstprivate data-sharing
|
||||
attribute. (That is, the value of the original variable is given to a private
|
||||
variable of the same name on the device.) This behavior can be changed with
|
||||
the \code{defaultmap} clause.
|
||||
the \kcode{defaultmap} clause.
|
||||
|
||||
The \code{map} clause can appear on \code{target}, \code{target data} and
|
||||
\code{target enter/exit data} constructs. The operations of creation and
|
||||
The \kcode{map} clause can appear on \kcode{target}, \kcode{target data} and
|
||||
\kcode{target enter/exit data} constructs. The operations of creation and
|
||||
removal of device storage as well as assignment of the original list item
|
||||
values to the corresponding list items may be complicated when the list
|
||||
item appears on multiple constructs or when the host and device storage
|
||||
is shared. In these cases the item's reference count, the number of times
|
||||
it has been referenced (+1 on entry and -1 on exited) in nested (structured)
|
||||
it has been referenced (increment by 1 on entry and decrement by 1 on exit) in nested (structured)
|
||||
map regions and/or accumulative (unstructured) mappings, determines the operation.
|
||||
Details of the \code{map} clause and reference count operation are specified
|
||||
in the \plc{map Clause} subsection of the OpenMP Specifications document.
|
||||
Details of the \kcode{map} clause and reference count operation are specified
|
||||
in the \docref{\kcode{map} Clause} subsection of the OpenMP Specifications document.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{data_environment/threadprivate}
|
||||
\input{data_environment/default_none}
|
||||
\input{data_environment/private}
|
||||
\input{data_environment/fort_loopvar}
|
||||
\input{data_environment/fort_sp_common}
|
||||
\input{data_environment/fort_sa_private}
|
||||
\input{data_environment/fort_shared_var}
|
||||
\input{data_environment/carrays_fpriv}
|
||||
\input{data_environment/lastprivate}
|
||||
\input{data_environment/reduction}
|
||||
\input{data_environment/udr}
|
||||
\input{data_environment/induction}
|
||||
\input{data_environment/scan}
|
||||
\input{data_environment/copyin}
|
||||
\input{data_environment/copyprivate}
|
||||
\input{data_environment/cpp_reference}
|
||||
\input{data_environment/associate}
|
||||
|
||||
|
@ -1,10 +1,9 @@
|
||||
\pagebreak
|
||||
\chapter{Devices}
|
||||
\cchapter{Devices}{devices}
|
||||
\label{chap:devices}
|
||||
|
||||
The \code{target} construct consists of a \code{target} directive
|
||||
and an execution region. The \code{target} region is executed on
|
||||
the default device or the device specified in the \code{device}
|
||||
The \kcode{target} construct consists of a \kcode{target} directive
|
||||
and an execution region. The \kcode{target} region is executed on
|
||||
the default device or the device specified in the \kcode{device}
|
||||
clause.
|
||||
|
||||
In OpenMP version 4.0, by default, all variables within the lexical
|
||||
@ -16,38 +15,65 @@ data to the device storage.
|
||||
|
||||
The constructs that explicitly
|
||||
create storage, transfer data, and free storage on the device
|
||||
are catagorized as structured and unstructured. The
|
||||
\code{target} \code{data} construct is structured. It creates
|
||||
a data region around \code{target} constructs, and is
|
||||
are categorized as structured and unstructured. The
|
||||
\kcode{target data} construct is structured. It creates
|
||||
a data region around \kcode{target} constructs, and is
|
||||
convenient for providing persistent data throughout multiple
|
||||
\code{target} regions. The \code{target} \code{enter} \code{data} and
|
||||
\code{target} \code{exit} \code{data} constructs are unstructured, because
|
||||
they can occur anywhere and do not support a "structure"
|
||||
(a region) for enclosing \code{target} constructs, as does the
|
||||
\code{target} \code{data} construct.
|
||||
\kcode{target} regions. The \kcode{target enter data} and
|
||||
\kcode{target exit data} constructs are unstructured, because
|
||||
they can occur anywhere and do not support a ``structure''
|
||||
(a region) for enclosing \kcode{target} constructs, as does the
|
||||
\kcode{target data} construct.
|
||||
|
||||
The \code{map} clause is used on \code{target}
|
||||
The \kcode{map} clause is used on \kcode{target}
|
||||
constructs and the data-type constructs to map host data. It
|
||||
specifies the device storage and data movement \code{to} and \code{from}
|
||||
specifies the device storage and data movement \plc{to} and \plc{from}
|
||||
the device, and controls the storage duration.
|
||||
|
||||
There is an important change in the OpenMP 4.5 specification
|
||||
that alters the data model for scalar variables and C/C++ pointer variables.
|
||||
The default behavior for scalar variables and C/C++ pointer variables
|
||||
in an 4.5 compliant code is \code{firstprivate}. Example
|
||||
in a 4.5 compliant code is \kcode{firstprivate}. Example
|
||||
codes that have been updated to reflect this new behavior are
|
||||
annotated with a description that describes changes required
|
||||
for correct execution. Often it is a simple matter of mapping
|
||||
the variable as \code{tofrom} to obtain the intended 4.0 behavior.
|
||||
the variable as \kcode{tofrom} to obtain the intended 4.0 behavior.
|
||||
|
||||
In OpenMP version 4.5 the mechanism for target
|
||||
execution is specified as occuring through a \plc{target task}.
|
||||
When the \code{target} construct is encountered a new
|
||||
\plc{target task} is generated. The \plc{target task}
|
||||
completes after the \code{target} region has executed and all data
|
||||
execution is specified as occurring through a \plc{target task}.
|
||||
When the \kcode{target} construct is encountered a new
|
||||
target task is generated. The target task
|
||||
completes after the \kcode{target} region has executed and all data
|
||||
transfers have finished.
|
||||
|
||||
This new specification does not affect the execution of
|
||||
pre-4.5 code; it is a necessary element for asynchronous
|
||||
execution of the \code{target} region when using the new \code{nowait}
|
||||
execution of the \kcode{target} region when using the new \kcode{nowait}
|
||||
clause introduced in OpenMP 4.5.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{devices/target}
|
||||
\input{devices/target_defaultmap}
|
||||
\input{devices/target_pointer_mapping}
|
||||
\input{devices/target_structure_mapping}
|
||||
\input{devices/target_fort_allocatable_array_mapping}
|
||||
\input{devices/array_sections}
|
||||
\input{devices/usm}
|
||||
\input{devices/C++_virtual_functions}
|
||||
\input{devices/array_shaping}
|
||||
\input{devices/target_mapper}
|
||||
\input{devices/target_data}
|
||||
\input{devices/target_unstructured_data}
|
||||
\input{devices/target_update}
|
||||
\input{devices/declare_target}
|
||||
\input{devices/lambda_expressions}
|
||||
\input{devices/teams}
|
||||
\input{devices/async_target_depend}
|
||||
\input{devices/async_target_with_tasks}
|
||||
\input{devices/async_target_nowait}
|
||||
\input{devices/async_target_nowait_depend}
|
||||
\input{devices/async_target_nowait_arg}
|
||||
\input{devices/device}
|
||||
\input{devices/device_env_traits}
|
||||
|
||||
|
71
Chap_directives.tex
Normal file
71
Chap_directives.tex
Normal file
@ -0,0 +1,71 @@
|
||||
\cchapter{OpenMP Directive Syntax}{directives}
|
||||
\label{chap:directive_syntax}
|
||||
\index{directive syntax}
|
||||
|
||||
OpenMP \plc{directives} use base-language mechanisms to specify OpenMP program behavior.
|
||||
In C/C++ code, the directives are formed with
|
||||
either pragmas or attributes.
|
||||
Fortran directives are formed with comments in free form and fixed form sources (codes).
|
||||
All of these mechanisms allow the compilation to ignore the OpenMP directives if
|
||||
OpenMP is not supported or enabled.
|
||||
|
||||
|
||||
The OpenMP directive is a combination of the base-language mechanism and a \plc{directive-specification},
|
||||
as shown below. The \plc{directive-specification} consists
|
||||
of the \plc{directive-name}, which in rare cases has arguments,
|
||||
followed by optional \plc{clauses}. Full details of the syntax can be found in the OpenMP Specification.
|
||||
Illustrations of the syntax are given in the examples.
|
||||
|
||||
The formats for combining a base-language mechanism and a \plc{directive-specification} are:
|
||||
|
||||
C/C++ pragmas
|
||||
\begin{indentedcodelist}
|
||||
#pragma omp \plc{directive-specification}
|
||||
\end{indentedcodelist}
|
||||
|
||||
C/C++ attribute specifiers
|
||||
\begin{indentedcodelist}
|
||||
[[omp :: directive( \plc{directive-specification} )]]
|
||||
[[omp :: decl( \plc{directive-specification} )]]
|
||||
\end{indentedcodelist}
|
||||
|
||||
C++ attribute specifiers
|
||||
\begin{indentedcodelist}
|
||||
[[using omp : directive( \plc{directive-specification} )]]
|
||||
[[using omp : decl( \plc{directive-specification} )]]
|
||||
\end{indentedcodelist}
|
||||
|
||||
where the \kcode{decl} attribute may be used for declarative
|
||||
directives alternatively.
|
||||
|
||||
Fortran comments
|
||||
\begin{indentedcodelist}
|
||||
!$omp \plc{directive-specification}
|
||||
\end{indentedcodelist}
|
||||
|
||||
where \scode{c$omp} and \scode{*$omp} may be used in Fortran fixed form sources.
|
||||
|
||||
Most OpenMP directives accept clauses that alter the semantics of the directive in some way,
|
||||
and some directives also accept parenthesized arguments that follow the directive name.
|
||||
A clause may just be a keyword (e.g., \kcode{untied}) or it may also accept argument lists
|
||||
(e.g., \kcode{shared(\ucode{x,y,z})}) and/or optional modifiers (e.g., \kcode{tofrom} in
|
||||
\kcode{map(tofrom: \ucode{x,y,z})}).
|
||||
Clause modifiers may be ``simple'' or ``complex'' -- a complex modifier consists of a
|
||||
keyword followed by one or more parameters, bracketed by parentheses, while a simple
|
||||
modifier does not. An example of a complex modifier is the \kcode{iterator} modifier,
|
||||
as in \kcode{map(iterator(\ucode{i=0:n}), tofrom: \ucode{p[i]})}, or the \kcode{step} modifier, as in
|
||||
\kcode{linear(\ucode{x}: ref, step(\ucode{4}))}.
|
||||
In the preceding examples, \kcode{tofrom} and \kcode{ref} are simple modifiers.
|
||||
|
||||
For Fortran, a declarative directive (such as \kcode{declare reduction})
|
||||
must appear after any \bcode{USE}, \bcode{IMPORT}, and \bcode{IMPLICIT} statements
|
||||
in the specification part.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{directives/pragmas}
|
||||
\input{directives/attributes}
|
||||
\input{directives/fixed_format_comments}
|
||||
\input{directives/free_format_comments}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
% This is the introduction for the OpenMP Examples document.
|
||||
% This is an included file. See the master file (openmp-examples.tex) for more information.
|
||||
% This is an included file. See the main file (openmp-examples.tex) for more information.
|
||||
%
|
||||
% When editing this file:
|
||||
%
|
||||
@ -32,46 +32,42 @@
|
||||
% This is a \plc{var-name}.
|
||||
%
|
||||
|
||||
\chapter*{Introduction}
|
||||
\cchapter{Introduction}{introduction}
|
||||
\label{chap:introduction}
|
||||
\addcontentsline{toc}{chapter}{\protect\numberline{}Introduction}
|
||||
|
||||
This collection of programming examples supplements the OpenMP API for Shared
|
||||
Memory Parallelization specifications, and is not part of the formal specifications. It
|
||||
assumes familiarity with the OpenMP specifications, and shares the typographical
|
||||
conventions used in that document.
|
||||
|
||||
\notestart
|
||||
\noteheader – This first release of the OpenMP Examples reflects the OpenMP Version 4.5
|
||||
specifications. Additional examples are being developed and will be published in future
|
||||
releases of this document.
|
||||
\noteend
|
||||
|
||||
The OpenMP API specification provides a model for parallel programming that is
|
||||
portable across shared memory architectures from different vendors. Compilers from
|
||||
numerous vendors support the OpenMP API.
|
||||
|
||||
The directives, library routines, and environment variables demonstrated in this
|
||||
document allow users to create and manage parallel programs while permitting
|
||||
portability. The directives extend the C, C++ and Fortran base languages with single
|
||||
program multiple data (SPMD) constructs, tasking constructs, device constructs,
|
||||
worksharing constructs, and synchronization constructs, and they provide support for
|
||||
portability. The directives extend the C, C++ and Fortran base languages with \plc{single
|
||||
program multiple data} (SPMD) constructs, \plc{tasking} constructs, \plc{device} constructs,
|
||||
\plc{worksharing} constructs, and \plc{synchronization} constructs, and they provide support for
|
||||
sharing and privatizing data. The functionality to control the runtime environment is
|
||||
provided by library routines and environment variables. Compilers that support the
|
||||
OpenMP API often include a command line option to the compiler that activates and
|
||||
allows interpretation of all OpenMP directives.
|
||||
|
||||
The latest source codes for OpenMP Examples can be downloaded from the \code{sources}
|
||||
directory at
|
||||
\href{https://github.com/OpenMP/Examples}{https://github.com/OpenMP/Examples}.
|
||||
The codes for this OpenMP \VER{} Examples document have the tag \plc{v\VER}.
|
||||
|
||||
%\href{https://github.com/OpenMP/Examples/tree/master/sources}{https://github.com/OpenMP/Examples/sources}.
|
||||
The documents and source codes for OpenMP Examples can be downloaded from
|
||||
\href{\examplesrepo}{\examplesrepo}.
|
||||
Each directory holds the contents of a chapter and has a \plc{sources} subdirectory of its codes.
|
||||
This OpenMP Examples \VER{} document and its codes are tagged as
|
||||
\examplestree{\VER}{\plc{v\VER}}.
|
||||
|
||||
Complete information about the OpenMP API and a list of the compilers that support
|
||||
the OpenMP API can be found at the OpenMP.org web site
|
||||
|
||||
\code{http://www.openmp.org}
|
||||
\scode{https://www.openmp.org}
|
||||
|
||||
\clearpage
|
||||
|
||||
\input{introduction/Examples}
|
||||
|
||||
% This is the end of introduction.tex of the OpenMP Examples document.
|
||||
|
27
Chap_loop_transformations.tex
Normal file
27
Chap_loop_transformations.tex
Normal file
@ -0,0 +1,27 @@
|
||||
\cchapter{Loop Transformations}{loop_transformations}
|
||||
\label{chap:loop_transformations}
|
||||
|
||||
To obtain better performance on a platform, code may need to be restructured
|
||||
relative to the way it is written (which is often for best readability).
|
||||
User-directed loop transformations accomplish this goal by providing a means
|
||||
to separate code semantics and its optimization.
|
||||
|
||||
A loop transformation construct states that a transformation operation is to be
|
||||
performed on a set of nested loops. This directive approach can target specific loops
|
||||
for transformation, rather than applying more time-consuming general compiler
|
||||
heuristic methods with compiler options that may not be able to discover
|
||||
optimal transformations.
|
||||
|
||||
Loop transformations can be augmented by preprocessor support or OpenMP \kcode{metadirective}
|
||||
directives, to select optimal dimension and size parameters for specific platforms,
|
||||
facilitating a single code base for multiple platforms.
|
||||
Moreover, directive-based transformations make experimenting easier:
|
||||
specific hot spots can be targeted directly by transformation directives.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{loop_transformations/tile}
|
||||
\input{loop_transformations/partial_tile}
|
||||
\input{loop_transformations/unroll}
|
||||
\input{loop_transformations/apply}
|
||||
|
@ -1,45 +1,70 @@
|
||||
\pagebreak
|
||||
\chapter{Memory Model}
|
||||
\cchapter{Memory Model}{memory_model}
|
||||
\label{chap:memory_model}
|
||||
|
||||
In this chapter, examples illustrate race conditions on access to variables with
|
||||
shared data-sharing attributes. A race condition can exist when two
|
||||
or more threads are involved in accessing a variable in which not all
|
||||
of the accesses are reads; that is, a WaR, RaW or WaW condition
|
||||
exists (R=read, a=after, W=write). A RaR does not produce a race condition.
|
||||
Ensuring thread execution order at
|
||||
the processor level is not enough to avoid race conditions, because the
|
||||
local storage at the processor level (registers, caches, etc.)
|
||||
must be synchronized so that a consistent view of the variable in the
|
||||
memory hierarchy can be seen by the threads accessing the variable.
|
||||
OpenMP provides a shared-memory model that allows all threads on a given
|
||||
device shared access to \emph{memory}. For a given OpenMP region that may be
|
||||
executed by more than one thread or SIMD lane, variables in memory may be
|
||||
\plc{shared} or \plc{private} with respect to those threads or SIMD lanes. A
|
||||
variable's data-sharing attribute indicates whether it is shared (the
|
||||
\plc{shared} attribute) or private (the \plc{private}, \plc{firstprivate},
|
||||
\plc{lastprivate}, \plc{linear}, and \plc{reduction} attributes) in the data
|
||||
environment of an OpenMP region. While private variables in an OpenMP region
|
||||
are new copies of the original variable (with the same name) that may then be
|
||||
concurrently accessed or modified by their respective threads or SIMD lanes, a
|
||||
shared variable in an OpenMP region is the same as the variable of the same
|
||||
name in the enclosing region. Concurrent accesses or modifications to a
|
||||
shared variable may therefore require synchronization to avoid data races.
|
||||
|
||||
OpenMP provides a shared-memory model which allows all threads access
|
||||
to \plc{memory} (shared data). Each thread also has exclusive
|
||||
access to \plc{threadprivate memory} (private data). A private
|
||||
variable referenced in an OpenMP directive's structured block is a
|
||||
new version of the original variable (with the same name) for each
|
||||
task (or SIMD lane) within the code block. A private variable is
|
||||
initially undefined (except for variables in \code{firstprivate}
|
||||
and \code{linear} clauses), and the original variable value is
|
||||
unaltered by assignments to the private variable, (except for
|
||||
\code{reduction}, \code{lastprivate} and \code{linear} clauses).
|
||||
OpenMP's memory model also includes a \emph{temporary view} of memory that is
|
||||
associated with each thread. Two different threads may see different values for
|
||||
a given variable in their respective temporary views. Threads may employ flush
|
||||
operations for the purposes of making their temporary view of a variable
|
||||
consistent with the value of the variable in memory. The effect of a given
|
||||
flush operation is characterized by its flush properties -- some combination of
|
||||
\plc{strong}, \plc{release}, and \plc{acquire} -- and, for \plc{strong}
|
||||
flushes, a \plc{flush-set}.
|
||||
|
||||
Private variables in an outer \code{parallel} region can be
|
||||
shared by implicit tasks of an inner \code{parallel} region
|
||||
(with a \code{share} clause on the inner \code{parallel} directive).
|
||||
Likewise, a private variable may be shared in the region of an
|
||||
explicit \code{task} (through a \code{shared} clause).
|
||||
A \plc{strong} flush will force consistency between the temporary view and the
|
||||
memory for all variables in its \plc{flush-set}. Furthermore, all strong flushes in a
|
||||
program that have intersecting flush-sets will execute in some total order, and
|
||||
within a thread strong flushes may not be reordered with respect to other
|
||||
memory operations on variables in their flush-sets. \plc{Release} and
|
||||
\plc{acquire} flushes operate in pairs. A release flush may ``synchronize''
|
||||
with an acquire flush, and when it does so the local memory operations that
|
||||
precede the release flush will appear to have been completed before the local
|
||||
memory operations on the same variables that follow the acquire flush.
|
||||
|
||||
Flush operations arise from explicit \kcode{flush} directives, implicit
|
||||
\kcode{flush} directives, and also from the execution of \kcode{atomic}
|
||||
constructs. The \kcode{flush} directive forces a consistent view of local
|
||||
variables of the thread executing the \kcode{flush}. When a list is supplied on
|
||||
the directive, only the items (variables) in the list are guaranteed to be
|
||||
flushed. Implied flushes exist at prescribed locations of certain constructs.
|
||||
For the complete list of these locations and associated constructs, please
|
||||
refer to the \docref{\kcode{flush} Construct} section of the OpenMP Specifications
|
||||
document.
|
||||
|
||||
In this chapter, examples illustrate how race conditions may arise for accesses
|
||||
to variables with a \plc{shared} data-sharing attribute when flush operations
|
||||
are not properly employed. A race condition can exist when two or more threads
|
||||
are involved in accessing a variable and at least one of the accesses modifies
|
||||
the variable. In particular, a data race will arise when conflicting accesses
|
||||
do not have a well-defined \emph{completion order}. The existence of data
|
||||
races in OpenMP programs results in undefined behavior, and so they should
|
||||
generally be avoided for programs to be correct. The completion order of
|
||||
accesses to a shared variable is guaranteed in OpenMP through a set of memory
|
||||
consistency rules that are described in the \docref{OpenMP Memory Consistency}
|
||||
section of the OpenMP Specifications document.
|
||||
|
||||
%This chapter also includes examples that exhibit non-sequentially consistent
|
||||
%(\emph{non-SC}) behavior. Sequential consistency (\emph{SC}) is the desirable
|
||||
%property that the results of a multi-threaded program are as if all operations
|
||||
%are performed in some total order, consistent with the program order of
|
||||
%operations performed by each thread. OpenMP guarantees that a correct program
|
||||
%(i.e. a program that does not have a data race) will exhibit SC behavior
|
||||
%so long as the only \code{atomic} constructs it uses are SC atomic directives.
|
||||
|
||||
|
||||
The \code{flush} directive forces a consistent view of local variables
|
||||
of the thread executing the \code{flush}.
|
||||
When a list is supplied on the directive, only the items (variables)
|
||||
in the list are guaranteed to be flushed.
|
||||
|
||||
Implied flushes exist at prescribed locations of certain constructs.
|
||||
For the complete list of these locations and associated constructs,
|
||||
please refer to the \plc{flush Construct} section of the OpenMP
|
||||
Specifications document.
|
||||
|
||||
% The following table lists construct in which implied flushes exist, and the
|
||||
% location of their execution.
|
||||
@ -102,4 +127,11 @@ Specifications document.
|
||||
% specific storage location accessed atomically (specified as the \plc{x} variable
|
||||
% in \plc{atomic Construct} subsection of the OpenMP Specifications document).
|
||||
|
||||
Examples 1-3 show the difficulty of synchronizing threads through \code{flush} and \code{atomic} directives.
|
||||
% Examples 1-3 show the difficulty of synchronizing threads through \code{flush} and \code{atomic} directives.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{memory_model/mem_model}
|
||||
\input{memory_model/allocators}
|
||||
\input{memory_model/fort_race}
|
||||
|
||||
|
19
Chap_ompt_interface.tex
Normal file
19
Chap_ompt_interface.tex
Normal file
@ -0,0 +1,19 @@
|
||||
\cchapter{OMPT Interface}{ompt_interface}
|
||||
\label{chap:ompt_interface}
|
||||
OMPT defines mechanisms and an API for interfacing with tools in the OpenMP program.
|
||||
|
||||
The OMPT API provides the following functionality:
|
||||
\begin{itemize}
|
||||
\addtolength{\itemindent}{1cm}
|
||||
\item examines the state associated with an OpenMP thread
|
||||
\item interprets the call stack of an OpenMP thread
|
||||
\item receives notification about OpenMP events
|
||||
\item traces activity on OpenMP target devices
|
||||
\item assesses implementation-dependent details
|
||||
\item controls a tool from an OpenMP application
|
||||
\end{itemize}
|
||||
|
||||
The following sections will illustrate basic mechanisms and operations of the OMPT API.
|
||||
|
||||
|
||||
\input{ompt_interface/ompt_start}
|
@ -1,104 +1,130 @@
|
||||
\pagebreak
|
||||
\chapter{Parallel Execution}
|
||||
\cchapter{Parallel Execution}{parallel_execution}
|
||||
\label{chap:parallel_execution}
|
||||
|
||||
A single thread, the \plc{initial thread}, begins sequential execution of
|
||||
an OpenMP enabled program, as if the whole program is in an implicit parallel
|
||||
region consisting of an implicit task executed by the \plc{initial thread}.
|
||||
|
||||
A \code{parallel} construct encloses code,
|
||||
forming a parallel region. An \plc{initial thread} encountering a \code{parallel}
|
||||
A \kcode{parallel} construct encloses code,
|
||||
forming a parallel region. An \plc{initial thread} encountering a \kcode{parallel}
|
||||
region forks (creates) a team of threads at the beginning of the
|
||||
\code{parallel} region, and joins them (removes from execution) at the
|
||||
end of the region. The initial thread becomes the master thread of the team in a
|
||||
\code{parallel} region with a \plc{thread} number equal to zero, the other
|
||||
\kcode{parallel} region, and joins them (removes from execution) at the
|
||||
end of the region. The initial thread becomes the primary thread of the team in a
|
||||
\kcode{parallel} region with a \plc{thread} number equal to zero; the other
|
||||
threads are numbered from 1 to number of threads minus 1.
|
||||
A team may be comprised of just a single thread.
|
||||
|
||||
Each thread of a team is assigned an implicit task consisting of code within the
|
||||
parallel region. The task that creates a parallel region is suspended while the
|
||||
Each \plc{thread} of a team is assigned an implicit task consisting of code within the
|
||||
\kcode{parallel} region. The task that creates a \kcode{parallel} region is suspended while the
|
||||
tasks of the team are executed. A thread is tied to its task; that is,
|
||||
only the thread assigned to the task can execute that task. After completion
|
||||
of the \code{parallel} region, the master thread resumes execution of the generating task.
|
||||
of the \kcode{parallel} region, the primary thread resumes execution of the generating task.
|
||||
|
||||
%After the \code{parallel} region the master thread becomes the initial
|
||||
%After the \code{parallel} region the primary thread becomes the initial
|
||||
%thread again, and continues to execute the \plc{sequential part}.
|
||||
|
||||
Any task within a \code{parallel} region is allowed to encounter another
|
||||
\code{parallel} region to form a nested \code{parallel} region. The
|
||||
parallelism of a nested \code{parallel} region (whether it forks additional
|
||||
Any task within a \kcode{parallel} region is allowed to encounter another
|
||||
\kcode{parallel} region to form a nested \kcode{parallel} region. The
|
||||
parallelism of a nested \kcode{parallel} region (whether it forks additional
|
||||
threads, or is executed serially by the encountering task) can be controlled by the
|
||||
\code{OMP\_NESTED} environment variable or the \code{omp\_set\_nested()}
|
||||
\kcode{OMP_NESTED} environment variable or the \kcode{omp_set_nested()}
|
||||
API routine with arguments indicating true or false.
|
||||
|
||||
The number of threads of a \code{parallel} region can be set by the \code{OMP\_NUM\_THREADS}
|
||||
environment variable, the \code{omp\_set\_num\_threads()} routine, or on the \code{parallel}
|
||||
directive with the \code{num\_threads}
|
||||
The number of threads of a \kcode{parallel} region can be set by the \kcode{OMP_NUM_THREADS}
|
||||
environment variable, the \kcode{omp_set_num_threads()} routine, or on the \kcode{parallel}
|
||||
directive with the \kcode{num_threads}
|
||||
clause. The routine overrides the environment variable, and the clause overrides all.
|
||||
Use the \code{OMP\_DYNAMIC}
|
||||
or the \code{omp\_set\_dynamic()} function to specify that the OpenMP
|
||||
Use the \kcode{OMP_DYNAMIC}
|
||||
or the \kcode{omp_set_dynamic()} function to specify that the OpenMP
|
||||
implementation dynamically adjust the number of threads for
|
||||
\code{parallel} regions. The default setting for dynamic adjustment is implementation
|
||||
\kcode{parallel} regions. The default setting for dynamic adjustment is implementation
|
||||
defined. When dynamic adjustment is on and the number of threads is specified,
|
||||
the number of threads becomes an upper limit for the number of threads to be
|
||||
provided by the OpenMP runtime.
|
||||
|
||||
\pagebreak
|
||||
%\pagebreak
|
||||
\bigskip
|
||||
WORKSHARING CONSTRUCTS
|
||||
|
||||
A worksharing construct distributes the execution of the associated region
|
||||
among the members of the team that encounter it. There is an
|
||||
implied barrier at the end of the worksharing region
|
||||
(there is no barrier at the beginning). The worksharing
|
||||
constructs are:
|
||||
(there is no barrier at the beginning).
|
||||
|
||||
\newpage
|
||||
The worksharing constructs are:
|
||||
|
||||
\begin{compactitem}
|
||||
|
||||
\item loop constructs: {\code{for} and \code{do} }
|
||||
\item \code{sections}
|
||||
\item \code{single}
|
||||
\item \code{workshare}
|
||||
\item loop constructs: {\kcode{for} and \kcode{do} }
|
||||
\item \kcode{sections}
|
||||
\item \kcode{single}
|
||||
\item \kcode{workshare}
|
||||
|
||||
\end{compactitem}
|
||||
|
||||
The \code{for} and \code{do} constructs (loop constructs) create a region
|
||||
The \kcode{for} and \kcode{do} constructs (loop constructs) create a region
|
||||
consisting of a loop. A loop controlled by a loop construct is called
|
||||
an \plc{associated} loop. Nested loops can form a single region when the
|
||||
\code{collapse} clause (with an integer argument) designates the number of
|
||||
\kcode{collapse} clause (with an integer argument) designates the number of
|
||||
\plc{associated} loops to be executed in parallel, by forming a
|
||||
"single iteration space" for the specified number of nested loops.
|
||||
The \code{ordered} clause can also control multiple associated loops.
|
||||
``single iteration space'' for the specified number of nested loops.
|
||||
The \kcode{ordered} clause can also control multiple associated loops.
|
||||
|
||||
An associated loop must adhere to a "canonical form" (specified in the
|
||||
\plc{Canonical Loop Form} of the OpenMP Specifications document) which allows the
|
||||
An associated loop must adhere to a ``canonical form'' (specified in the
|
||||
\docref{Canonical Loop Form} section of the OpenMP Specifications document) which allows the
|
||||
iteration count (of all associated loops) to be computed before the
|
||||
(outermost) loop is executed. %[58:27-29].
|
||||
Most common loops comply with the canonical form, including C++ iterators.
|
||||
|
||||
A \code{single} construct forms a region in which only one thread (any one
|
||||
A \kcode{single} construct forms a region in which only one thread (any one
|
||||
of the team) executes the region.
|
||||
The other threads wait at the implied
|
||||
barrier at the end, unless the \code{nowait} clause is specified.
|
||||
barrier at the end, unless the \kcode{nowait} clause is specified.
|
||||
|
||||
The \code{sections} construct forms a region that contains one or more
|
||||
structured blocks. Each block of a \code{sections} directive is
|
||||
constructed with a \code{section} construct, and executed once by
|
||||
The \kcode{sections} construct forms a region that contains one or more
|
||||
structured blocks. Each block of a \kcode{sections} directive is
|
||||
constructed with a \kcode{section} construct, and executed once by
|
||||
one of the threads (any one) in the team. (If only one block is
|
||||
formed in the region, the \code{section} construct, which is used to
|
||||
formed in the region, the \kcode{section} construct, which is used to
|
||||
separate blocks, is not required.)
|
||||
The other threads wait at the implied
|
||||
barrier at the end, unless the \code{nowait} clause is specified.
|
||||
barrier at the end, unless the \kcode{nowait} clause is specified.
|
||||
|
||||
|
||||
The \code{workshare} construct is a Fortran feature that consists of a
|
||||
The \kcode{workshare} construct is a Fortran feature that consists of a
|
||||
region with a single structured block (section of code). Statements in the
|
||||
\code{workshare} region are divided into units of work, and executed (once)
|
||||
\kcode{workshare} region are divided into units of work, and executed (once)
|
||||
by threads of the team.
|
||||
|
||||
\bigskip
|
||||
MASTER CONSTRUCT
|
||||
MASKED CONSTRUCT
|
||||
|
||||
The \kcode{masked} construct is not a worksharing construct. The \kcode{masked} region is
|
||||
executed only by the primary thread. There is no implicit barrier (and flush)
|
||||
at the end of the \kcode{masked} region; hence the other threads of the team continue
|
||||
execution of code statements beyond the \kcode{masked} region.
|
||||
The \kcode{master} construct, which has been deprecated in OpenMP 5.1, has identical semantics
|
||||
to the \kcode{masked} construct with no \kcode{filter} clause.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{parallel_execution/ploop}
|
||||
\input{parallel_execution/parallel}
|
||||
\input{parallel_execution/host_teams}
|
||||
\input{parallel_execution/nthrs_nesting}
|
||||
\input{parallel_execution/nthrs_dynamic}
|
||||
\input{parallel_execution/fort_do}
|
||||
\input{parallel_execution/nowait}
|
||||
\input{parallel_execution/collapse}
|
||||
\input{parallel_execution/linear_in_loop}
|
||||
\input{parallel_execution/psections}
|
||||
\input{parallel_execution/fpriv_sections}
|
||||
\input{parallel_execution/single}
|
||||
\input{parallel_execution/workshare}
|
||||
\input{parallel_execution/masked}
|
||||
\input{parallel_execution/loop}
|
||||
\input{parallel_execution/pra_iterator}
|
||||
\input{parallel_execution/set_dynamic_nthrs}
|
||||
\input{parallel_execution/get_nthrs}
|
||||
|
||||
The \code{master} construct is not a worksharing construct. The master region is
|
||||
is executed only by the master thread. There is no implicit barrier (and flush)
|
||||
at the end of the \code{master} region; hence the other threads of the team continue
|
||||
execution beyond code statements beyond the \code{master} region.
|
||||
|
@ -1,52 +1,62 @@
|
||||
\pagebreak
|
||||
\chapter{Program Control}
|
||||
\label{sec:program_control}
|
||||
\cchapter{Program Control}{program_control}
|
||||
\label{chap:program_control}
|
||||
|
||||
Some specific and elementary concepts of controlling program execution are
|
||||
illustrated in the examples of this chapter. Control can be directly
|
||||
managed with conditional control code (ifdef's with the \code{\_OPENMP}
|
||||
macro, and the Fortran sentinel (\code{!\$})
|
||||
for conditionally compiling). The \code{if} clause on some constructs
|
||||
Basic concepts and mechanisms for directing and controlling a program compilation and execution
|
||||
are provided in this introduction and illustrated in subsequent examples.
|
||||
|
||||
\bigskip
|
||||
CONDITIONAL COMPILATION and EXECUTION
|
||||
|
||||
Conditional compilation can be performed with conventional \bcode{\#ifdef} directives
|
||||
in C, C++, and Fortran, and additionally with the OpenMP sentinel (\scode{!$}) in Fortran.
|
||||
The \kcode{if} clause on some directives
|
||||
can direct the runtime to ignore or alter the behavior of the construct.
|
||||
Of course, the base-language \code{if} statements can be used to control the "execution"
|
||||
of stand-alone directives (such as \code{flush}, \code{barrier}, \code{taskwait},
|
||||
and \code{taskyield}).
|
||||
However, the directives must appear in a block structure, and not as a substatement as shown in examples 1 and 2 of this chapter.
|
||||
Of course, the base-language \bcode{if} statements can be used to control the execution
|
||||
of stand-alone directives (such as \kcode{flush}, \kcode{barrier}, \kcode{taskwait},
|
||||
and \kcode{taskyield}).
|
||||
However, the directives must appear in a block structure, and not as a substatement.
|
||||
The \kcode{metadirective} and \kcode{declare variant} directives provide conditional
|
||||
selection of directives and routines for compilation (and use), respectively.
|
||||
The \kcode{assume} and \kcode{requires} directives provide invariants
|
||||
for optimizing compilation, and essential features for compilation
|
||||
and correct execution, respectively.
|
||||
|
||||
|
||||
\bigskip
|
||||
CANCELLATION
|
||||
|
||||
Cancellation (termination) of the normal sequence of execution for the threads in an OpenMP region can
|
||||
be accomplished with the \code{cancel} construct. The construct uses a
|
||||
be accomplished with the \kcode{cancel} construct. The construct uses a
|
||||
\plc{construct-type-clause} to set the region-type to activate for the cancellation.
|
||||
That is, inclusion of one of the \plc{construct-type-clause} names \code{parallel}, \code{for},
|
||||
\code{do}, \code{sections} or \code{taskgroup} on the directive line
|
||||
That is, inclusion of one of the \plc{construct-type-clause} names \kcode{parallel}, \kcode{for},
|
||||
\kcode{do}, \kcode{sections} or \kcode{taskgroup} on the directive line
|
||||
activates the corresponding region.
|
||||
The \code{cancel} construct is activated by the first encountering thread, and it
|
||||
The \kcode{cancel} construct is activated by the first encountering thread, and it
|
||||
continues execution at the end of the named region.
|
||||
The \code{cancel} construct is also a concellation point for any other thread of the team
|
||||
The \kcode{cancel} construct is also a cancellation point for any other thread of the team
|
||||
to also continue execution at the end of the named region.
|
||||
|
||||
Also, once the specified region has been activated for cancellation any thread that encounnters
|
||||
a \code{cancellation point} construct with the same named region (\plc{construct-type-clause}),
|
||||
Also, once the specified region has been activated for cancellation any thread that encounters
|
||||
a \kcode{cancellation point} construct with the same named region (\plc{construct-type-clause}),
|
||||
continues execution at the end of the region.
|
||||
|
||||
For an activated \code{cancel taskgroup} construct, the tasks that
|
||||
For an activated \kcode{cancel taskgroup} construct, the tasks that
|
||||
belong to the taskgroup set of the innermost enclosing taskgroup region will be canceled.
|
||||
|
||||
A task that encounters the cancel taskgroup construct continues execution at the end of its
|
||||
A task that encounters a \kcode{cancel taskgroup} construct continues execution at the end of its
|
||||
task region. Any task of the taskgroup that has already begun execution will run to completion,
|
||||
unless it encounters a \code{cancellation point}; tasks that have not begun execution "may" be
|
||||
unless it encounters a \kcode{cancellation point}; tasks that have not begun execution may be
|
||||
discarded as completed tasks.
|
||||
|
||||
\bigskip
|
||||
\pagebreak
|
||||
CONTROL VARIABLES
|
||||
|
||||
Internal control variables (ICVs) are used by implementations to hold values which control the execution
|
||||
of OpenMP regions. Control (and hence the ICVs) may be set as implementation defaults,
|
||||
or set and adjusted through environment variables, clauses, and API functions. Many of the ICV control
|
||||
values are accessible through API function calls. Also, initial ICV values are reported by the runtime
|
||||
if the \code{OMP\_DISPLAY\_ENV} environment variable has been set to \code{TRUE}.
|
||||
or set and adjusted through environment variables, clauses, and API functions.
|
||||
%Many of the ICV control values are accessible through API function calls.
|
||||
Initial ICV values are reported by the runtime
|
||||
if the \kcode{OMP_DISPLAY_ENV} environment variable has been set to \vcode{TRUE} or \vcode{VERBOSE}.
|
||||
|
||||
%As an example, the \plc{nthreads-var} is the ICV that holds the number of threads
|
||||
%to be used in a \code{parallel} region. It can be set with the \code{OMP\_NUM\_THREADS} environment variable,
|
||||
@ -59,8 +69,8 @@ CONTROL VARIABLES
|
||||
\bigskip
|
||||
NESTED CONSTRUCTS
|
||||
|
||||
Certain combinations of nested constructs are permitted, giving rise to a \plc{combined} construct
|
||||
consisting of two or more constructs. These can be used when the two (or several) constructs would be used
|
||||
Certain combinations of nested constructs are permitted, giving rise to \plc{combined} constructs
|
||||
consisting of two or more directives. These can be used when the two (or several) constructs would be used
|
||||
immediately in succession (closely nested). A combined construct can use the clauses of the component
|
||||
constructs without restrictions.
|
||||
A \plc{composite} construct is a combined construct which has one or more clauses with an (often obviously)
|
||||
@ -70,16 +80,37 @@ modified or restricted meaning, relative to when the constructs are uncombined.
|
||||
%construct with one of the loops constructs \code{do} or \code{for}. The
|
||||
%\code{parallel do SIMD} and \code{parallel for SIMD} constructs are composite constructs (composed from
|
||||
%the parallel loop constructs and the \code{SIMD} construct), because the \code{collapse} clause must
|
||||
%explicitly address the ordering of loop chunking \plc{and} SIMD "combined" execution.
|
||||
%explicitly address the ordering of loop chunking \plc{and} SIMD ``combined'' execution.
|
||||
|
||||
Certain nestings are forbidden, and often the reasoning is obvious. Worksharing constructs cannot be nested, and
|
||||
the \code{barrier} construct cannot be nested inside a worksharing construct, or a \code{critical} construct.
|
||||
Also, \code{target} constructs cannot be nested.
|
||||
Certain nestings are forbidden, and often the reasoning is obvious. For example, worksharing constructs cannot be nested, and
|
||||
the \kcode{barrier} construct cannot be nested inside a worksharing construct, or a \kcode{critical} construct.
|
||||
Also, \kcode{target} constructs cannot be nested, unless the nested target is a reverse offload.
|
||||
|
||||
The \code{parallel} construct can be nested, as well as the \code{task} construct. The parallel
|
||||
execution in the nested \code{parallel} construct(s) is control by the \code{OMP\_NESTED} and
|
||||
\code{OMP\_MAX\_ACTIVE\_LEVELS} environment variables, and the \code{omp\_set\_nested()} and
|
||||
\code{omp\_set\_max\_active\_levels()} functions.
|
||||
The \kcode{parallel} construct can be nested, as well as the \kcode{task} construct.
|
||||
The parallel execution in the nested \kcode{parallel} construct(s) is controlled by the
|
||||
\kcode{OMP_MAX_ACTIVE_LEVELS} environment variable, and the \kcode{omp_set_max_active_levels} routine.
|
||||
Use the \kcode{omp_get_max_active_levels} routine to determine the maximum levels provided by an implementation.
|
||||
As of OpenMP 5.0, use of the \kcode{OMP_NESTED} environment variable and the \kcode{omp_set_nested} routine
|
||||
has been deprecated.
|
||||
|
||||
More details on nesting can be found in the \plc{Nesting of Regions} of the \plc{Directives}
|
||||
More details on nesting can be found in the \docref{Nesting of Regions} section of the \docref{Directives}
|
||||
chapter in the OpenMP Specifications document.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{program_control/assumption}
|
||||
\input{program_control/cond_comp}
|
||||
\input{program_control/icv}
|
||||
\input{program_control/standalone}
|
||||
\input{program_control/cancellation}
|
||||
\input{program_control/requires}
|
||||
\input{program_control/context_based_variants}
|
||||
\input{program_control/dispatch}
|
||||
\input{program_control/nested_loop}
|
||||
\input{program_control/nesting_restrict}
|
||||
\input{program_control/target_offload}
|
||||
\input{program_control/pause_resource}
|
||||
\input{program_control/reproducible}
|
||||
\input{program_control/interop}
|
||||
\input{program_control/utilities}
|
||||
|
||||
|
@ -1,51 +1,62 @@
|
||||
\pagebreak
|
||||
\chapter{Synchronization}
|
||||
\cchapter{Synchronization}{synchronization}
|
||||
\label{chap:synchronization}
|
||||
|
||||
The \code{barrier} construct is a stand-alone directive that requires all threads
|
||||
The \kcode{barrier} construct is a stand-alone directive that requires all threads
|
||||
of a team (within a contention group) to execute the barrier and complete
|
||||
execution of all tasks within the region, before continuing past the barrier.
|
||||
|
||||
The \code{critical} construct is a directive that contains a structured block.
|
||||
The \kcode{critical} construct is a directive that contains a structured block.
|
||||
The construct allows only a single thread at a time to execute the structured block (region).
|
||||
Multiple critical regions may exist in a parallel region, and may
|
||||
act cooperatively (only one thread at a time in all \code{critical} regions),
|
||||
or separately (only one thread at a time in each \code{critical} regions when
|
||||
a unique name is supplied on each \code{critical} construct).
|
||||
An optional (lock) \code{hint} clause may be specified on a named \code{critical}
|
||||
Multiple \kcode{critical} regions may exist in a parallel region, and may
|
||||
act cooperatively (only one thread at a time in all \kcode{critical} regions),
|
||||
or separately (only one thread at a time in each \kcode{critical} region when
|
||||
a unique name is supplied on each \kcode{critical} construct).
|
||||
An optional (lock) \kcode{hint} clause may be specified on a named \kcode{critical}
|
||||
construct to provide the OpenMP runtime guidance in selecting a locking
|
||||
mechanism.
|
||||
|
||||
On a finer scale the \code{atomic} construct allows only a single thread at
|
||||
On a finer scale the \kcode{atomic} construct allows only a single thread at
|
||||
a time to have atomic access to a storage location involving a single read,
|
||||
write, update or capture statement, and a limited number of combinations
|
||||
when specifying the \code{capture} \plc{atomic-clause} clause. The \plc{atomic-clause} clause
|
||||
is required for some expression statements, but are not required for
|
||||
\code{update} statements. Please see the details in the \plc{atomic Construct}
|
||||
subsection of the \plc{Directives} chapter in the OpenMP Specifications document.
|
||||
when specifying the \kcode{capture} \plc{atomic-clause} clause. The
|
||||
\plc{atomic-clause} clause is required for some expression statements, but is
|
||||
not required for \kcode{update} statements. The \plc{memory-order} clause can be
|
||||
used to specify the degree of memory ordering enforced by an \kcode{atomic}
|
||||
construct. From weakest to strongest, they are \kcode{relaxed} (the default),
|
||||
\plc{acquire} and/or \plc{release} clauses (specified with \kcode{acquire}, \kcode{release},
|
||||
or \kcode{acq_rel}), and \kcode{seq_cst}. Please see the details in the
|
||||
\docref{atomic Construct} subsection of the \docref{Directives} chapter in the OpenMP
|
||||
Specifications document.
|
||||
|
||||
% The following three sentences were stolen from the spec.
|
||||
The \code{ordered} construct either specifies a structured block in a loop,
|
||||
The \kcode{ordered} construct either specifies a structured block in a loop,
|
||||
simd, or loop SIMD region that will be executed in the order of the loop
|
||||
iterations. The ordered construct sequentializes and orders the execution
|
||||
of ordered regions while allowing code outside the region to run in parallel.
|
||||
iterations. The \kcode{ordered} construct sequentializes and orders the execution
|
||||
of \kcode{ordered} regions while allowing code outside the region to run in parallel.
|
||||
|
||||
Since OpenMP 4.5 the \code{ordered} construct can also be a stand-alone
|
||||
directive that specifies cross-iteration dependences in a doacross loop nest.
|
||||
The \code{depend} clause uses a \code{sink} \plc{dependence-type}, along with a
|
||||
iteration vector argument (vec) to indicate the iteration that satisfies the
|
||||
dependence. The \code{depend} clause with a \code{source}
|
||||
Since OpenMP 4.5 the \kcode{ordered} construct can also be a stand-alone
|
||||
directive that specifies cross-iteration dependences in a \plc{doacross} loop nest.
|
||||
The \kcode{depend} clause uses a \kcode{sink} \plc{dependence-type}, along with an
|
||||
iteration vector argument (\plc{vec}) to indicate the iteration that satisfies the
|
||||
dependence. The \kcode{depend} clause with a \kcode{source}
|
||||
\plc{dependence-type} specifies dependence satisfaction.
|
||||
|
||||
The \code{flush} directive is a stand-alone construct that forces a thread's
|
||||
temporal local storage (view) of a variable to memory where a consistent view
|
||||
of the variable storage can be accesses. When the construct is used without
|
||||
a variable list, all the locally thread-visible data as defined by the
|
||||
base language are flushed. A construct with a list applies the flush
|
||||
operation only to the items in the list. The \code{flush} construct also
|
||||
effectively insures that no memory (load or store) operation for
|
||||
the variable set (list items, or default set) may be reordered across
|
||||
the \code{flush} directive.
|
||||
The \kcode{flush} directive is a stand-alone construct for enforcing consistency
|
||||
between a thread's view of memory and the view of memory for other threads (see
|
||||
the Memory Model chapter of this document for more details). When the construct
|
||||
is used with an explicit variable list, a \plc{strong flush} that forces a
|
||||
thread's temporary view of memory to be consistent with the actual memory is
|
||||
applied to all listed variables. When the construct is used without an explicit
|
||||
variable list and without a \plc{memory-order} clause, a strong flush is
|
||||
applied to all locally thread-visible data as defined by the base language, and
|
||||
additionally the construct provides both acquire and release memory ordering
|
||||
semantics. When an explicit variable list is not present and a
|
||||
\plc{memory-order} clause is present, the construct provides acquire and/or
|
||||
release memory ordering semantics according to the \plc{memory-order} clause,
|
||||
but no strong flush is performed. A resulting strong flush that applies to a
|
||||
set of variables effectively ensures that no memory (load or store)
|
||||
operation for the affected variables may be reordered across the \kcode{flush}
|
||||
directive.
|
||||
|
||||
General-purpose routines provide mutual exclusion semantics through locks,
|
||||
represented by lock variables.
|
||||
@ -58,12 +69,33 @@ types of locks, and the variable of a specific lock type cannot be used by the
|
||||
other lock type.
|
||||
|
||||
Any explicit task will observe the synchronization prescribed in a
|
||||
\code{barrier} construct and an implied barrier. Also, additional synchronizations
|
||||
are available for tasks. All children of a task will wait at a \code{taskwait} (for
|
||||
their siblings to complete). A \code{taskgroup} construct creates a region in which the
|
||||
\kcode{barrier} construct and an implied barrier. Also, additional synchronizations
|
||||
are available for tasks. All children of a task will wait at a \kcode{taskwait} (for
|
||||
their siblings to complete). A \kcode{taskgroup} construct creates a region in which the
|
||||
current task is suspended at the end of the region until all sibling tasks,
|
||||
and their descendants, have completed.
|
||||
Scheduling constraints on task execution can be prescribed by the \code{depend}
|
||||
Scheduling constraints on task execution can be prescribed by the \kcode{depend}
|
||||
clause to enforce dependence on previously generated tasks.
|
||||
More details on controlling task executions can be found in the \plc{Tasking} Chapter
|
||||
More details on controlling task executions can be found in the \docref{Tasking} Chapter
|
||||
in the OpenMP Specifications document. %(DO REF. RIGHT.)
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{synchronization/critical}
|
||||
\input{synchronization/worksharing_critical}
|
||||
\input{synchronization/barrier_regions}
|
||||
\input{synchronization/atomic}
|
||||
\input{synchronization/atomic_cas}
|
||||
\input{synchronization/atomic_restrict}
|
||||
\input{synchronization/atomic_hint}
|
||||
\input{synchronization/acquire_release}
|
||||
\input{synchronization/ordered}
|
||||
\input{synchronization/depobj}
|
||||
\input{synchronization/doacross}
|
||||
\input{synchronization/locks}
|
||||
\input{synchronization/init_lock}
|
||||
\input{synchronization/init_lock_with_hint}
|
||||
\input{synchronization/lock_owner}
|
||||
\input{synchronization/simple_lock}
|
||||
\input{synchronization/nestable_lock}
|
||||
|
||||
|
@ -1,35 +1,34 @@
|
||||
\pagebreak
|
||||
\chapter{Tasking}
|
||||
\cchapter{Tasking}{tasking}
|
||||
\label{chap:tasking}
|
||||
|
||||
Tasking constructs provide units of work to a thread for execution.
|
||||
Worksharing constructs do this, too (e.g. \code{for}, \code{do},
|
||||
\code{sections}, and \code{singles} constructs);
|
||||
Worksharing constructs do this, too (e.g. \kcode{for}, \kcode{do},
|
||||
\kcode{sections}, and \kcode{single} constructs);
|
||||
but the work units are tightly controlled by an iteration limit and limited
|
||||
scheduling, or a limited number of \code{sections} or \code{single} regions.
|
||||
scheduling, or a limited number of \kcode{sections} or \kcode{single} regions.
|
||||
Worksharing was designed
|
||||
with \texttt{"}data parallel\texttt{"} computing in mind. Tasking was designed for
|
||||
\texttt{"}task parallel\texttt{"} computing and often involves non-locality or irregularity
|
||||
with ``data parallel'' computing in mind. Tasking was designed for
|
||||
``task parallel'' computing and often involves non-locality or irregularity
|
||||
in memory access.
|
||||
|
||||
The \code{task} construct can be used to execute work chunks: in a while loop;
|
||||
The \kcode{task} construct can be used to execute work chunks: in a while loop;
|
||||
while traversing nodes in a list; at nodes in a tree graph;
|
||||
or in a normal loop (with a \code{taskloop} construct).
|
||||
or in a normal loop (with a \kcode{taskloop} construct).
|
||||
Unlike the statically scheduled loop iterations of worksharing, a task is
|
||||
often enqueued, and then dequeued for execution by any of the threads of the
|
||||
team within a parallel region. The generation of tasks can be from a single
|
||||
generating thread (creating sibling tasks), or from multiple generators
|
||||
in recursive graph tree traversals.
|
||||
%(creating a parent-descendents hierarchy of tasks, see example 4 and 7 below).
|
||||
A \code{taskloop} construct
|
||||
A \kcode{taskloop} construct
|
||||
bundles iterations of an associated loop into tasks, and provides
|
||||
similar controls found in the \code{task} construct.
|
||||
similar controls found in the \kcode{task} construct.
|
||||
|
||||
Sibling tasks are synchronized by the \code{taskwait} construct, and tasks
|
||||
Sibling tasks are synchronized by the \kcode{taskwait} construct, and tasks
|
||||
and their descendent tasks can be synchronized by containing them in
|
||||
a \code{taskgroup} region. Ordered execution is accomplished by specifying
|
||||
dependences with a \code{depend} clause. Also, priorities can be
|
||||
specified as hints to the scheduler through a \code{priority} clause.
|
||||
a \kcode{taskgroup} region. Ordered execution is accomplished by specifying
|
||||
dependences with a \kcode{depend} clause. Also, priorities can be
|
||||
specified as hints to the scheduler through a \kcode{priority} clause.
|
||||
|
||||
Various clauses can be used to manage and optimize task generation,
|
||||
as well as reduce the overhead of execution and to relinquish
|
||||
@ -37,15 +36,28 @@ control of threads for work balance and forward progress.
|
||||
|
||||
Once a thread starts executing a task, it is the designated thread
|
||||
for executing the task to completion, even though it may leave the
|
||||
execution at a scheduling point and return later. The thread is tied
|
||||
to the task. Scheduling points can be introduced with the \code{taskyield}
|
||||
construct. With an \code{untied} clause any other thread is allowed to continue
|
||||
the task. An \code{if} clause with a \plc{true} expression allows the
|
||||
generating thread to immediately execute the task as an undeferred task.
|
||||
execution at a scheduling point and return later. The thread is \plc{tied}
|
||||
to the task. Scheduling points can be introduced with the \kcode{taskyield}
|
||||
construct. With an \kcode{untied} clause any other thread is allowed to continue
|
||||
the task. An \kcode{if} clause with an expression that evaluates to \plc{false}
|
||||
results in an \plc{undeferred} task, which instructs the runtime to suspend
|
||||
the generating task until the undeferred task completes its execution.
|
||||
By including the data environment of the generating task into the generated task with the
|
||||
\code{mergeable} and \code{final} clauses, task generation overhead can be reduced.
|
||||
\kcode{mergeable} and \kcode{final} clauses, task generation overhead can be reduced.
|
||||
|
||||
A complete list of the tasking constructs and details of their clauses
|
||||
can be found in the \plc{Tasking Constructs} chapter of the OpenMP Specifications,
|
||||
in the \plc{OpenMP Application Programming Interface} section.
|
||||
can be found in the \docref{Tasking Constructs} chapter of the OpenMP Specifications.
|
||||
%in the \docref{OpenMP Application Programming Interface} section.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{tasking/tasking}
|
||||
\input{tasking/task_priority}
|
||||
\input{tasking/task_dep}
|
||||
\input{tasking/task_detach}
|
||||
\input{tasking/taskgroup}
|
||||
\input{tasking/taskyield}
|
||||
\input{tasking/taskloop}
|
||||
\input{tasking/parallel_masked_taskloop}
|
||||
\input{tasking/taskloop_dep}
|
||||
|
||||
|
234
Contributions.md
Normal file
234
Contributions.md
Normal file
@ -0,0 +1,234 @@
|
||||
# Contributing
|
||||
|
||||
The usual process for adding new examples, making changes or adding corrections
|
||||
is to submit an issue for discussion and initial evaluation of changes or example additions.
|
||||
When there is a consensus at a meeting about the contribution,
|
||||
the issue will be brought forward for voting at the OpenMP Language
|
||||
Committee meetings and you will be asked to submit a pull request.
|
||||
|
||||
Of course, if your contribution is an obvious correction, clarification, or note, you
|
||||
may want to submit a pull request directly.
|
||||
|
||||
-----------------------------------------------------------
|
||||
|
||||
## The OpenMP Examples document
|
||||
|
||||
The OpenMP Examples document is in LaTeX format.
|
||||
Please see the main LaTeX file, `openmp-examples.tex`, for more information.
|
||||
|
||||
## Maintainer
|
||||
|
||||
[OpenMP Examples Subcommittee](http://twiki.openmp.org/twiki/bin/view/OpenMPLang/OpenMPExamplesSubCommittee)
|
||||
For a brief revision history, see `Changes.log` in the repo.
|
||||
|
||||
## Git procedure
|
||||
|
||||
* Fork your own branch of the OpenMP [examples-internal repo](https://github.com/OpenMP/examples-internal)
|
||||
* Clone your fork locally
|
||||
* If you are working on generic or old-version updates, create a branch off main.
|
||||
* If you are working on an example for a release candidate for version #.#, create a branch off work_#.#.
|
||||
1) `git clone --branch <main|work_#.#> https://github.com/<my_account>/examples-internal`
|
||||
2) `git checkout -b <branch_name>`
|
||||
3) ... `add`, `commit`
|
||||
4) `git push -u origin <branch_name>`
|
||||
5) `make` or `make diff` will create a full-document pdf or just a pdf with differences (do this at any point).
|
||||
* `git status` and `git branch -a` are your friends
|
||||
* Submit an issue for your work (usually with a diff pdf), and then you will be asked to submit a pull request
|
||||
* Create an issue by selecting the [issue tab](https://github.com/OpenMP/examples-internal/issues) and clicking on `new issue`.
|
||||
* Use this Markdown Cheatsheet for [issue formatting](https://wordpress.com/support/markdown-quick-reference/)
|
||||
* More Markdown details are available [here](https://markdown-it.github.io)
|
||||
* You can cut and paste Markdown-formatted text into a [reader](https://dillinger.io) to see formatting effects.
|
||||
* Forced spaces are available in Markdown. On a Mac it is "option+space".
|
||||
* Polling is available. Go to [gh-poll](https://app.gh-polls.com/). Type an option on each line, then click `copy markdown`, and paste the contents into the issue. (Use preview to check your poll, and then submit it.)
|
||||
* Create a pull request
|
||||
|
||||
|
||||
## Processing source code
|
||||
|
||||
* Prepare source code (C/C++ and Fortran) and a text description (use similar styles found in recent examples)
|
||||
* Determine the *example* name `<ename>`, *sequence* identifier `<seq-id>` and *compiler* suffix `<csuffix>` for the example
|
||||
* The syntax is: `<ename>.<seq-id>.<csuffix>` (e.g. `affinity_display.1.f90`)
|
||||
* The example name may be a Section name (e.g. affinity), or a Subsection name (affinity_display)
|
||||
* If you are creating a new Chapter, it may be the chapter name.
|
||||
* New examples are usually added at the end of a Section or Subsection. Number it as the next number in the sequence numbers for examples in that Section or Subsection.
|
||||
* The compiler suffix `<csuffix>` is `c`, `cpp`, `f`, and `f90` for C, C++ and Fortran (fixed/free form) codes.
|
||||
* Insert the code in the sources directory for each chapter, and include the following metadata:
|
||||
* Metadata Tags for example sources:
|
||||
```
|
||||
@@name: <ename>.<seq-no>
|
||||
@@type: C|C++|F-fixed|F-free
|
||||
@@operation: view|compile|link|run
|
||||
@@expect: success|ct-error|rt-error|unspecified
|
||||
@@version: [pre_]omp_<verno>
|
||||
@@env: <environment_variables>
|
||||
@@depend: <source_code_name>
|
||||
```
|
||||
* **name**
|
||||
- is the name of an example
|
||||
* **type**
|
||||
- is the source code type, which can be translated into or from proper file extension (C:c,C++:cpp,F-fixed:f,F-free:f90)
|
||||
* **operation**
|
||||
- indicates how the source code is treated. Possible values are:
|
||||
- `view` - code for illustration only, not compilable;
|
||||
- `compile` - incomplete program, such as function or subroutine;
|
||||
- `link` - complete program, but no verification value;
|
||||
- `run` - complete program with verification value.
|
||||
* **expect**
|
||||
- indicates some expected result for testing purposes.
|
||||
- `success` means no issue;
|
||||
- `ct-error` applies to the result of code compilation;
|
||||
- `rt-error` is for a case where compilation may be successful, but the code
|
||||
contains potential runtime issues (including race condition);
|
||||
- `unspecified` could result from a non-conforming code or is for code
|
||||
that is viewable only.
|
||||
* **version**
|
||||
- indicates that the example uses features in a specific OpenMP version, such as "`omp_5.0`".
|
||||
The prefix `pre_` indicates that the example uses features prior to a specific version, such as "`pre_omp_3.0`".
|
||||
* **env**
|
||||
- specifies any environment variables needed to run the code.
|
||||
This tag is optional and can be repeated.
|
||||
* **depend**
|
||||
- specifies a source code file on which the current code depends.
|
||||
This tag is optional and can be repeated.
|
||||
* For **env** and **depend**, make sure to specify
|
||||
a proper skipping number `<s>` in the LaTeX macros described below
|
||||
to match with the number of `env` and `depend` tags.
|
||||
|
||||
|
||||
## Process for text
|
||||
* Create or update the description text in a Section/Subsection file under each chapter directory, usually `<chap_directory>/<ename>.tex`
|
||||
* If adding a new Subsection, just include it in the appropriate subsection file (`<subsection>.tex`)
|
||||
* If adding a new Section, create a `<section>.tex` file and add an entry in the corresponding chapter file, such as `Chap_affinity.tex`
|
||||
* If adding a new Chapter, create a `Chap_<chap_name>.tex` file with introductory text, and add a new `<section>.tex` file with text and links to the code. Update `Makefile` and `openmp-examples.tex` to include the new chapter file.
|
||||
* Commit your changes into your fork of examples-internal
|
||||
* Submit your issue at [OpenMP Examples internal repo](https://github.com/openmp/examples-internal/issues), and include a PDF when ready.
|
||||
* Examples subcommittee members can view [meeting schedule and notes](http://twiki.openmp.org/twiki/bin/view/OpenMPLang/ExamplesSchedules)
|
||||
* Shepherd your issue to acceptance (discussed at weekly Examples meeting and in issue comments)
|
||||
* When it is in a ready state, you should then submit a pull request.
|
||||
* It will be reviewed and voted on, and changes will be requested.
|
||||
* Once the last changes are made, it will be verified and merged into an appropriate branch (either the `main` branch or a working branch).
|
||||
|
||||
|
||||
|
||||
|
||||
## LaTeX macros for examples
|
||||
|
||||
The following describes LaTeX macros defined specifically for examples.
|
||||
* Source code with language h-rules
|
||||
* Source code without language h-rules
|
||||
* Language h-rules
|
||||
* Macros for keywords in text description
|
||||
* Other macros
|
||||
* See `openmp.sty` for more information
|
||||
|
||||
### Source code with language h-rules
|
||||
```
|
||||
\cexample[<verno>]{<ename>}{<seq-no>}[<s>] % for C/C++ examples
|
||||
\cppexample[<verno>]{<ename>}{<seq-no>}[<s>] % for C++ examples
|
||||
\fexample[<verno>]{<ename>}{<seq-no>}[<s>] % for fixed-form Fortran examples
|
||||
\ffreeexample[<verno>]{<ename>}{<seq-no>}[<s>] % for free-form Fortran examples
|
||||
```
|
||||
|
||||
### Source code without language h-rules
|
||||
```
|
||||
\cnexample[<verno>]{<ename>}{<seq-no>}[<s>]
|
||||
\cppnexample[<verno>]{<ename>}{<seq-no>}[<s>]
|
||||
\fnexample[<verno>]{<ename>}{<seq-no>}[<s>]
|
||||
\ffreenexample[<verno>]{<ename>}{<seq-no>}[<s>]
|
||||
\srcnexample[<verno>]{<ename>}{<seq-no>}{<ext>}[<s>]
|
||||
```
|
||||
|
||||
Optional `<verno>` can be supplied in a macro to include a specific OpenMP
|
||||
version in the example header. This option also suggests one additional
|
||||
tag (`@@version`) line is included in the corresponding source code.
|
||||
If this is not the case (i.e., no `@@version` tag line), one needs to
|
||||
prefix `<verno>` with an underscore '\_' symbol in the macro.
|
||||
|
||||
The exception is macro `\srcnexample`, for which the corresponding
|
||||
source code might not contain any `@@` metadata tags. The `ext` argument
|
||||
to this macro is the file extension (such as `h`, `hpp`, `inc`).
|
||||
|
||||
The `<s>` option to each macro allows finer control of any additional lines
|
||||
to be skipped due to addition of new `@@` tags, such as `@@env`.
|
||||
The default value for `<s>` is 0.
|
||||
|
||||
### Language h-rules
|
||||
```
|
||||
\cspecificstart, \cspecificend
|
||||
\cppspecificstart, \cppspecificend
|
||||
\ccppspecificstart, \ccppspecificend
|
||||
\fortranspecificstart, \fortranspecificend
|
||||
\begin{cspecific}[s] ... \end{cspecific}
|
||||
\begin{cppspecific}[s] ... \end{cppspecific}
|
||||
\begin{ccppspecific}[s] ... \end{ccppspecific}
|
||||
\begin{fortranspecific}[s] ... \end{fortranspecific}
|
||||
\topmarker{Lang}
|
||||
```
|
||||
|
||||
Use of the structured `\begin{} .. \end{}` environments is the preferred
|
||||
way of specifying language-dependent text over the unstructured approach
|
||||
of using `\*specificstart` and `\*specificend`.
|
||||
The option `[s]` to each of the environments can specify a vertical shift
|
||||
for the beginning rule, such as when followed by a section header.
|
||||
|
||||
The macro `\topmarker` puts a dashed blue line floater at top of a page for
|
||||
"Lang (cont.)" where `Lang` can be `C/C++`, `C++`, `Fortran`.
|
||||
|
||||
### Macros for keywords in text description
|
||||
A partial list:
|
||||
- `\kcode{}` - for OpenMP keywords, such as directives, clauses, environment variables, API routines. Support direct use of '_' (underscore) and ' ' (space)
|
||||
- `\scode{}` - OpenMP specifier with special chars, such as '`$`' in "`!$omp`"
|
||||
- `\bcode{}` - base language keywords (such as `ASSOCIATE` in Fortran)
|
||||
- `\vcode{}` - values of a keyword, such as `TRUE`, `FALSE`, `VERBOSE`
|
||||
- `\plc{}` - OpenMP concept, such as ICV names; `\splc{}` - escape '_' (underscore)
|
||||
- `\example{}` - example names, such as `\example{taskloop_reduction.1}`
|
||||
- `\docref{}` - chapter or section name of a document, such as the spec
|
||||
- `\ucode{}` - program variables, procedure names, or expression in examples codes. Support direct use of '_' (underscore) and ' ' (space).
|
||||
- `\pout{}` - program outputs
|
||||
|
||||
Examples:
|
||||
- `\kcode{declare reduction}` for **declare reduction**
|
||||
- `\scode{!$omp}` sentinel, however, `\kcode{\#pragma omp}`
|
||||
- `\kcode{map(iterator(\ucode{i=0:n}), tofrom: \ucode{p[i]})}` for **map(iterator(**_i=0:n_**), tofrom:** _p[i]_**)**
|
||||
- Fortran `\bcode{IMPLICIT NONE}` statement
|
||||
- The `\vcode{VERBOSE}` value for `\kcode{OMP_DISPLAY_ENV}`
|
||||
- OpenMP `\plc{directives}`, the `\plc{num-threads}` ICV
|
||||
- This is an example name `\example{taskloop_reduction.1}`
|
||||
- `(\ucode{x,y,z})` argument for procedure `\ucode{a_proc_name}`
|
||||
- structure constructor `\ucode{point($\ldots$)}`
|
||||
- This is a code output `"\pout{x = 1}"`
|
||||
|
||||
### Other macros
|
||||
```
|
||||
\cchapter{<Chapter Name>}{<chap_directory>}
|
||||
\hexentry[ext1]{<example_name>}[ext2]{<earlier_tag>}
|
||||
\hexmentry[ext1]{<example_name>}[ext2]{<earlier_tag>}{<prior_name>}
|
||||
\examplesref{<verno>}
|
||||
\examplesblob{<verno/file>}
|
||||
```
|
||||
|
||||
The `\cchapter` macro is used for starting a chapter with proper page spacing.
|
||||
`<Chapter Name>` is the name of a chapter and `<chap_directory>` is the name
|
||||
of the chapter directory. All section and subsection files for the chapter
|
||||
should be placed under `<chap_directory>`. The corresponding example sources
|
||||
should be placed under the `sources` directory inside `<chap_directory>`.
|
||||
|
||||
A previously-defined macro `\sinput{<section_file>}` to import a section
|
||||
file from `<chap_directory>` is no longer supported. Please use
|
||||
`\input{<chap_directory>/<section_file>}` explicitly.
|
||||
|
||||
The two macros `\hexentry` and `\hexmentry` are defined for simplifying
|
||||
entries in the feature deprecation and update tables. Option `[ext1]` is
|
||||
the file extension with a default value of `c` and option `[ext2]` is
|
||||
the file extension for the associated second file if present.
|
||||
`<earlier_tag>` is the version tag of the corresponding example
|
||||
in the earlier version. `\hexentry` assumes no name change for an example
|
||||
in different versions; `\hexmentry` can be used to specify a prior name
|
||||
if it is different.
|
||||
|
||||
The two macros `\examplesref` and `\examplesblob` are for referencing
|
||||
a specific version of or a file in the github Examples repository.
|
||||
|
||||
## License
|
||||
|
||||
For copyright information, please see [omp_copyright.txt](omp_copyright.txt).
|
282
Deprecated_Features.tex
Normal file
282
Deprecated_Features.tex
Normal file
@ -0,0 +1,282 @@
|
||||
\cchapter{Feature Deprecations and Updates in Examples}{deprecated_features}
|
||||
\label{chap:deprecated_features}
|
||||
\label{sec:deprecated_features}
|
||||
\index{deprecated features}
|
||||
|
||||
\newcommand\tabpcont[1]{\multicolumn{2}{l}{\small\slshape table continued #1 page}}
|
||||
\newcommand\tabpheader{\textbf{Version} & \textbf{Deprecated Feature} &
|
||||
\textbf{Replacement}}
|
||||
\newcommand\tabuheader{\textbf{Example Name} & \textbf{Earlier Version} &
|
||||
\textbf{Feature Updated}}
|
||||
\newcommand\dpftable[1]{
|
||||
\renewcommand{\arraystretch}{1.0}
|
||||
\tablefirsthead{%
|
||||
\hline\\[-2ex]
|
||||
\tabuheader\\[2pt]
|
||||
\hline\\[-2ex]
|
||||
}
|
||||
\tablehead{%
|
||||
\tabpcont{from previous}\\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\tabuheader\\[2pt]
|
||||
\hline\\[-2ex]
|
||||
}
|
||||
\tabletail{%
|
||||
\hline\\[-2.5ex]
|
||||
\tabpcont{on next}\\
|
||||
}
|
||||
\tablelasttail{\hline\\[-1ex]}
|
||||
\tablecaption{Updated Examples for Features Deprecated in Version #1\label{tab:Updated Examples #1}}
|
||||
}
|
||||
|
||||
|
||||
Deprecation of features began in OpenMP 5.0.
|
||||
Examples that use a deprecated feature have been updated with an equivalent
|
||||
replacement feature.
|
||||
|
||||
Table~\ref{tab:Deprecated Features} summarizes deprecated features and
|
||||
their replacements in each version. Affected examples are updated
|
||||
accordingly and listed in Section~\ref{sec:Updated Examples}.
|
||||
|
||||
\nolinenumbers
|
||||
\renewcommand{\arraystretch}{1.4}
|
||||
\tablefirsthead{%
|
||||
\hline
|
||||
\tabpheader\\
|
||||
\hline\\[-3.5ex]
|
||||
}
|
||||
\tablehead{%
|
||||
\tabpcont{from previous}\\
|
||||
\hline
|
||||
\tabpheader\\
|
||||
\hline\\[-3ex]
|
||||
}
|
||||
\tabletail{%
|
||||
\hline\\[-4ex]
|
||||
\tabpcont{on next}\\
|
||||
}
|
||||
\tablelasttail{\hline\\[-2ex]}
|
||||
\tablecaption{Deprecated Features and Their Replacements\label{tab:Deprecated Features}}
|
||||
\begin{supertabular}{p{0.4in} p{2.3in} p{2.2in}}
|
||||
6.0 & \kcode{declare reduction(}\plc{reduction-id}: \plc{typename-list}: \plc{combiner}\kcode{)}
|
||||
& \kcode{declare reduction(}\plc{reduction-id}: \plc{typename-list}\kcode{)} \kcode{combiner(\plc{combiner-exp})} \\
|
||||
\hline
|
||||
5.2 & \kcode{default} clause on metadirectives
|
||||
& \kcode{otherwise} clause \\
|
||||
5.2 & delimited \kcode{declare target} directive for C/C++
|
||||
& \kcode{begin declare target} directive \\
|
||||
5.2 & \kcode{to} clause on \kcode{declare target} directive
|
||||
& \kcode{enter} clause \\
|
||||
5.2 & non-argument \kcode{destroy} clause on \kcode{depobj} construct
|
||||
& \kcode{destroy(\plc{argument})} \\
|
||||
5.2 & \kcode{allocate} directive for Fortran \bcode{ALLOCATE} statements
|
||||
& \kcode{allocators} directive \\
|
||||
5.2 & \kcode{depend} clause on \kcode{ordered} construct
|
||||
& \kcode{doacross} clause \\
|
||||
5.2 & \kcode{linear(\plc{modifier(list): linear-step})} clause
|
||||
& \kcode{linear(\plc{list:} step(\plc{linear-step})\plc{, modifier})} clause \\
|
||||
\hline
|
||||
5.1 & \kcode{master} construct
|
||||
& \kcode{masked} construct \\
|
||||
5.1 & \kcode{master} affinity policy
|
||||
& \kcode{primary} affinity policy \\
|
||||
\hline
|
||||
5.0 & \kcode{omp_lock_hint_*} constants
|
||||
& \kcode{omp_sync_hint_*} constants \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
These replacements appear in examples that illustrate, otherwise, earlier features.
|
||||
When using a compiler that is compliant with a version prior to
|
||||
the indicated version, the earlier form of an example for a previous
|
||||
version is listed as a reference.
|
||||
|
||||
\newpage
|
||||
\section{Updated Examples for Different Versions}
|
||||
\label{sec:Updated Examples}
|
||||
|
||||
The following tables list the updated examples for different versions as
|
||||
a result of feature deprecation. The \emph{Earlier Version} column of
|
||||
the tables shows the version tag of the earlier version. It also shows
|
||||
the prior name of an example when it has been renamed.
|
||||
|
||||
|
||||
Table~\ref{tab:Updated Examples 6.0} lists the updated examples for
|
||||
features deprecated in OpenMP 6.0
|
||||
in the Examples Document Version
|
||||
\href{https://github.com/OpenMP/Examples/tree/v6.0}{6.0}.
|
||||
The \emph{Earlier Version} column of the table lists the earlier version
|
||||
tags of the examples that can be found in
|
||||
the Examples Document Version
|
||||
\href{https://github.com/OpenMP/Examples/tree/v5.2}{5.2}.
|
||||
|
||||
\index{clauses!combiner@\kcode{combiner}}
|
||||
\index{combiner clause@\kcode{combiner} clause}
|
||||
|
||||
\nolinenumbers
|
||||
\dpftable{6.0}
|
||||
\begin{supertabular}{p{1.7in} p{1.1in} p{2.2in}}
|
||||
\hexentry{udr.1}[f90]{4.0} &
|
||||
\plc{combiner} expression in \kcode{declare} \\
|
||||
\hexentry{udr.2}[f90]{4.0} &
|
||||
\kcode{reduction} directive changed to use \\
|
||||
\hexentry{udr.3}[f90]{4.0} & \kcode{combiner} clause \\
|
||||
\hexentry[f90]{udr.4}{4.0} & \\
|
||||
\hexentry[cpp]{udr.5}{4.0} & \\
|
||||
\hexentry[cpp]{udr.6}{4.0} & \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
Table~\ref{tab:Updated Examples 5.2} lists the updated examples for
|
||||
features deprecated in OpenMP 5.2
|
||||
in the Examples Document Version \examplesref{5.2}.
|
||||
The \emph{Earlier Version} column of the table lists the earlier version
|
||||
tags of the examples that can be found in
|
||||
the Examples Document Version \examplesref{5.1}.
|
||||
|
||||
\index{clauses!default@\kcode{default}}
|
||||
\index{clauses!otherwise@\kcode{otherwise}}
|
||||
\index{clauses!to@\kcode{to}}
|
||||
\index{clauses!enter@\kcode{enter}}
|
||||
\index{clauses!depend@\kcode{depend}}
|
||||
\index{clauses!doacross@\kcode{doacross}}
|
||||
\index{clauses!linear@\kcode{linear}}
|
||||
\index{clauses!destroy@\kcode{destroy}}
|
||||
\index{default clause@\kcode{default} clause}
|
||||
\index{otherwise clause@\kcode{otherwise} clause}
|
||||
\index{to clause@\kcode{to} clause}
|
||||
\index{enter clause@\kcode{enter} clause}
|
||||
\index{depend clause@\kcode{depend} clause}
|
||||
\index{doacross clause@\kcode{doacross} clause}
|
||||
\index{linear clause@\kcode{linear} clause}
|
||||
\index{destroy clause@\kcode{destroy} clause}
|
||||
\index{directives!begin declare target@\kcode{begin declare target}}
|
||||
\index{begin declare target directive@\kcode{begin declare target} directive}
|
||||
\index{allocate directive@\kcode{allocate} directive}
|
||||
\index{allocators directive@\kcode{allocators} directive}
|
||||
|
||||
\nolinenumbers
|
||||
\dpftable{5.2}
|
||||
\begin{supertabular}{p{1.7in} p{1.2in} p{2.1in}}
|
||||
\hexentry{error.1}[f90]{5.1} &
|
||||
\kcode{default} clause on metadirectives \\
|
||||
\hexentry{metadirective.1}[f90]{5.0} &
|
||||
replaced with \kcode{otherwise} clause \\
|
||||
\hexentry{metadirective.2}[f90]{5.0} & \\
|
||||
\hexentry{metadirective.3}[f90]{5.0} & \\
|
||||
\hexentry{metadirective.4}[f90]{5.1} & \\
|
||||
\hexentry{target_ptr_map.4}{5.1} & \\
|
||||
\hexentry{target_ptr_map.5}[f90]{5.1} & \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry[f90]{array_shaping.1}{5.0} &
|
||||
\kcode{to} clause on \kcode{declare target} \\
|
||||
\hexentry{target_reverse_offload.7}{5.0} &
|
||||
directive replaced with \kcode{enter} clause \\
|
||||
\hexentry{target_task_reduction.1}[f90]{5.1} & \\
|
||||
\hexentry{target_task_reduction.2a}[f90]{5.0} & \\
|
||||
\hexentry{target_task_reduction.2b}[f90]{5.1} &\\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry{array_shaping.1}{5.0} &
|
||||
delimited \kcode{declare target} \\
|
||||
\hexentry{async_target.1}{4.0} &
|
||||
directive replaced with \\
|
||||
\hexentry{async_target.2}{4.0} &
|
||||
\kcode{begin declare target} \\
|
||||
\hexentry{declare_target.1}{4.0} &
|
||||
directive for C/C++ \\
|
||||
\hexentry[cpp]{declare_target.2c}{4.0} & \\
|
||||
\hexentry{declare_target.3}{4.0} & \\
|
||||
\hexentry{declare_target.4}{4.0} & \\
|
||||
\hexentry{declare_target.5}{4.0} & \\
|
||||
\hexentry{declare_target.6}{4.0} & \\
|
||||
\hexentry{declare_variant.1}{5.0} & \\
|
||||
\hexentry{device.1}{4.0} & \\
|
||||
\hexentry{metadirective.3}{5.0} & \\
|
||||
\hexentry{target_ptr_map.2}{5.0} & \\
|
||||
\hexentry{target_ptr_map.3a}{5.0} & \\
|
||||
\hexentry{target_ptr_map.3b}{5.0} & \\
|
||||
\hexentry{target_struct_map.1}{5.0} & \\
|
||||
\hexentry[cpp]{target_struct_map.2}{5.0} & \\
|
||||
\hexentry{target_struct_map.3}{5.0} & \\
|
||||
\hexentry{target_struct_map.4}{5.0} & \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry{doacross.1}[f90]{4.5} &
|
||||
\kcode{depend} clause on \kcode{ordered} \\
|
||||
\hexentry{doacross.2}[f90]{4.5} &
|
||||
construct replaced with \kcode{doacross} \\
|
||||
\hexentry{doacross.3}[f90]{4.5} &
|
||||
clause \\
|
||||
\hexentry{doacross.4}[f90]{4.5} & \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry[cpp]{linear_modifier.1}[f90]{4.5} &
|
||||
modifier syntax change for \kcode{linear} \\
|
||||
\hexentry[cpp]{linear_modifier.2}[f90]{4.5} &
|
||||
clause on \kcode{declare simd} directive \\
|
||||
\hexentry{linear_modifier.3}[f90]{4.5} & \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry[f90]{allocators.1}{5.0} &
|
||||
\kcode{allocate} directive replaced with \kcode{allocators} directive
|
||||
for Fortran \bcode{allocate} statements \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry{depobj.1}[f90]{5.0} &
|
||||
argument added to \kcode{destroy} clause on \kcode{depobj}
|
||||
construct \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
\newpage
|
||||
Table~\ref{tab:Updated Examples 5.1} lists the updated examples for
|
||||
features deprecated in OpenMP 5.1
|
||||
in the Examples Document Version \examplesref{5.1}.
|
||||
The \emph{Earlier Version} column of the table lists the earlier version
|
||||
tags and prior names of the examples that can be found in
|
||||
the Examples Document Version \examplesref{5.0.1}.
|
||||
|
||||
\index{affinity!master policy@\kcode{master} policy}
|
||||
\index{affinity!primary policy@\kcode{primary} policy}
|
||||
\index{constructs!master@\kcode{master}}
|
||||
\index{constructs!masked@\kcode{masked}}
|
||||
\index{master construct@\kcode{master} construct}
|
||||
\index{masked construct@\kcode{masked} construct}
|
||||
|
||||
\nolinenumbers
|
||||
\dpftable{5.1}
|
||||
\begin{supertabular}{p{1.8in} p{1.4in} p{1.8in}}
|
||||
\hexentry{affinity.5}[f]{4.0} &
|
||||
\kcode{master} affinity policy replaced with \kcode{primary} policy \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry{async_target.3}[f90]{5.0} &
|
||||
\kcode{master} construct replaced \\
|
||||
\hexentry{cancellation.2}[f90]{4.0} &
|
||||
with \kcode{masked} construct \\
|
||||
\hexentry{copyprivate.2}[f]{3.0} & \\
|
||||
\hexentry[f]{fort_sa_private.5}{3.0} & \\
|
||||
\hexentry{lock_owner.1}[f]{3.0} & \\
|
||||
\hexmentry{masked.1}[f]{3.0}{master.1} & \\
|
||||
\hexmentry{parallel_masked_taskloop.1}[f90]{5.0}{parallel_master_taskloop.1} &\\
|
||||
\hexentry{reduction.6}[f]{3.0} & \\
|
||||
\hexentry{target_task_reduction.1}[f90]{5.0} & \\
|
||||
\hexentry{target_task_reduction.2b}[f90]{5.0} & \\
|
||||
\hexentry{taskloop_simd_reduction.1}[f90]{5.0} & \\
|
||||
\hexentry{task_detach.1}[f90]{5.0} & \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
Table~\ref{tab:Updated Examples 5.0} lists the updated examples for
|
||||
features deprecated in OpenMP 5.0
|
||||
in the Examples Document Version \examplesref{5.1}.
|
||||
The \emph{Earlier Version} column of the table lists the earlier version
|
||||
tags of the examples that can be found in
|
||||
the Examples Document Version \examplesref{5.0.1}.
|
||||
|
||||
\nolinenumbers
|
||||
\dpftable{5.0}
|
||||
\begin{supertabular}{p{1.6in} p{1.3in} p{2.1in}}
|
||||
\hexentry{critical.2}[f]{4.5} &
|
||||
\kcode{omp_lock_hint_*} constants \\
|
||||
\hexentry[cpp]{init_lock_with_hint.1}[f]{4.5} &
|
||||
replaced with \kcode{omp_sync_hint_*} constants \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
|
@ -1,21 +0,0 @@
|
||||
|
||||
\chapter*{Examples}
|
||||
\label{chap:examples}
|
||||
\addcontentsline{toc}{chapter}{\protect\numberline{}Examples}
|
||||
The following are examples of the OpenMP API directives, constructs, and routines.
|
||||
\ccppspecificstart
|
||||
A statement following a directive is compound only when necessary, and a
|
||||
non-compound statement is indented with respect to a directive preceding it.
|
||||
\ccppspecificend
|
||||
|
||||
Each example is labeled as \plc{ename.seqno.ext}, where \plc{ename} is
|
||||
the example name, \plc{seqno} is the sequence number in a section, and
|
||||
\plc{ext} is the source file extension to indicate the code type and
|
||||
source form. \plc{ext} is one of the following:
|
||||
\begin{compactitem}
|
||||
\item \plc{c} -- C code,
|
||||
\item \plc{cpp} -- C++ code,
|
||||
\item \plc{f} -- Fortran code in fixed form, and
|
||||
\item \plc{f90} -- Fortran code in free form.
|
||||
\end{compactitem}
|
||||
|
@ -1,126 +0,0 @@
|
||||
%\pagebreak
|
||||
\section{\code{simd} and \code{declare} \code{simd} Constructs}
|
||||
\label{sec:SIMD}
|
||||
|
||||
The following example illustrates the basic use of the \code{simd} construct
|
||||
to assure the compiler that the loop can be vectorized.
|
||||
|
||||
\cexample{SIMD}{1}
|
||||
|
||||
\ffreeexample{SIMD}{1}
|
||||
|
||||
|
||||
When a function can be inlined within a loop the compiler has an opportunity to
|
||||
vectorize the loop. By guaranteeing SIMD behavior of a function's operations,
|
||||
characterizing the arguments of the function and privatizing temporary
|
||||
variables of the loop, the compiler can often create faster, vector code for
|
||||
the loop. In the examples below the \code{declare} \code{simd} construct is
|
||||
used on the \plc{add1} and \plc{add2} functions to enable creation of their
|
||||
corresponding SIMD function versions for execution within the associated SIMD
|
||||
loop. The functions characterize two different approaches of accessing data
|
||||
within the function: by a single variable and as an element in a data array,
|
||||
respectively. The \plc{add3} C function uses dereferencing.
|
||||
|
||||
The \code{declare} \code{simd} constructs also illustrate the use of
|
||||
\code{uniform} and \code{linear} clauses. The \code{uniform(fact)} clause
|
||||
indicates that the variable \plc{fact} is invariant across the SIMD lanes. In
|
||||
the \plc{add2} function \plc{a} and \plc{b} are included in the \code{uniform}
|
||||
list because the C pointer and the Fortran array references are constant. The
|
||||
\plc{i} index used in the \plc{add2} function is included in a \code{linear}
|
||||
clause with a constant-linear-step of 1, to guarantee a unity increment of the
|
||||
associated loop. In the \code{declare} \code{simd} construct for the \plc{add3}
|
||||
C function the \code{linear(a,b:1)} clause instructs the compiler to generate
|
||||
unit-stride loads across the SIMD lanes; otherwise, costly \emph{gather}
|
||||
instructions would be generated for the unknown sequence of access of the
|
||||
pointer dereferences.
|
||||
|
||||
In the \code{simd} constructs for the loops the \code{private(tmp)} clause is
|
||||
necessary to assure that each vector operation has its own \plc{tmp}
|
||||
variable.
|
||||
|
||||
\cexample{SIMD}{2}
|
||||
|
||||
\ffreeexample{SIMD}{2}
|
||||
|
||||
|
||||
A thread that encounters a SIMD construct executes a vectorized code of the
|
||||
iterations. Similar to the concerns of a worksharing loop, a loop vectorized
|
||||
with a SIMD construct must assure that temporary and reduction variables are
|
||||
privatized and declared as reductions with clauses. The example below
|
||||
illustrates the use of \code{private} and \code{reduction} clauses in a SIMD
|
||||
construct.
|
||||
|
||||
\cexample{SIMD}{3}
|
||||
|
||||
\ffreeexample{SIMD}{3}
|
||||
|
||||
|
||||
A \code{safelen(N)} clause in a \code{simd} construct assures the compiler that
|
||||
there are no loop-carried dependencies for vectors of size \plc{N} or below. If
|
||||
the \code{safelen} clause is not specified, then the default safelen value is
|
||||
the number of loop iterations.
|
||||
|
||||
The \code{safelen(16)} clause in the example below guarantees that the vector
|
||||
code is safe for vectors up to and including size 16. In the loop, \plc{m} can
|
||||
be 16 or greater, for correct code execution. If the value of \plc{m} is less
|
||||
than 16, the behavior is undefined.
|
||||
|
||||
\cexample{SIMD}{4}
|
||||
|
||||
\ffreeexample{SIMD}{4}
|
||||
|
||||
|
||||
The following SIMD construct instructs the compiler to collapse the \plc{i} and
|
||||
\plc{j} loops into a single SIMD loop in which SIMD chunks are executed by
|
||||
threads of the team. Within the workshared loop chunks of a thread, the SIMD
|
||||
chunks are executed in the lanes of the vector units.
|
||||
|
||||
\cexample{SIMD}{5}
|
||||
|
||||
\ffreeexample{SIMD}{5}
|
||||
|
||||
|
||||
%%% section
|
||||
\section{\code{inbranch} and \code{notinbranch} Clauses}
|
||||
\label{sec:SIMD_branch}
|
||||
|
||||
The following examples illustrate the use of the \code{declare} \code{simd}
|
||||
construct with the \code{inbranch} and \code{notinbranch} clauses. The
|
||||
\code{notinbranch} clause informs the compiler that the function \plc{foo} is
|
||||
never called conditionally in the SIMD loop of the function \plc{myaddint}. On
|
||||
the other hand, the \code{inbranch} clause for the function \plc{goo} indicates that
|
||||
the function is always called conditionally in the SIMD loop inside
|
||||
the function \plc{myaddfloat}.
|
||||
|
||||
\cexample{SIMD}{6}
|
||||
|
||||
\ffreeexample{SIMD}{6}
|
||||
|
||||
|
||||
In the code below, the function \plc{fib()} is called in the main program and
|
||||
also recursively called in the function \plc{fib()} within an \code{if}
|
||||
condition. The compiler creates a masked vector version and a non-masked vector
|
||||
version for the function \plc{fib()} while retaining the original scalar
|
||||
version of the \plc{fib()} function.
|
||||
|
||||
\cexample{SIMD}{7}
|
||||
|
||||
\ffreeexample{SIMD}{7}
|
||||
|
||||
|
||||
|
||||
%%% section
|
||||
\section{Loop-Carried Lexical Forward Dependence}
|
||||
\label{sec:SIMD_forward_dep}
|
||||
|
||||
|
||||
The following example tests the restriction on a SIMD loop with a loop-carried lexical forward dependence. This dependence must be preserved for the correct execution of SIMD loops.
|
||||
|
||||
A loop can be vectorized even though the iterations are not completely independent when it has loop-carried dependences that are forward lexical dependences, indicated in the code below by the read of \plc{A[j+1]} and the write to \plc{A[j]} in C/C++ code (or \plc{A(j+1)} and \plc{A(j)} in Fortran). That is, the read of \plc{A[j+1]} (or \plc{A(j+1)} in Fortran) before the write to \plc{A[j]} (or \plc{A(j)} in Fortran) ordering must be preserved for each iteration in \plc{j} for valid SIMD code generation.
|
||||
|
||||
This test assures that the compiler preserves the loop-carried lexical forward dependence for generating correct SIMD code.
|
||||
|
||||
\cexample{SIMD}{8}
|
||||
|
||||
\ffreeexample{SIMD}{8}
|
||||
|
@ -1,43 +0,0 @@
|
||||
\section{Affinity Query Functions}
|
||||
\label{sec: affinity_query}
|
||||
|
||||
In the example below a team of threads is generated on each socket of
|
||||
the system, using nested parallelism. Several query functions are used
|
||||
to gather information to support the creation of the teams and to obtain
|
||||
socket and thread numbers.
|
||||
|
||||
For proper execution of the code, the user must create a place partition, such that
|
||||
each place is a listing of the core numbers for a socket. For example,
|
||||
in a 2 socket system with 8 cores in each socket, and sequential numbering
|
||||
in the socket for the core numbers, the \code{OMP\_PLACES} variable would be set
|
||||
to "\{0:8\},\{8:8\}", using the place syntax \{\plc{lower\_bound}:\plc{length}:\plc{stride}\},
|
||||
and the default stride of 1.
|
||||
|
||||
The code determines the number of sockets (\plc{n\_sockets})
|
||||
using the \code{omp\_get\_num\_places()} query function.
|
||||
In this example each place is constructed with a list of
|
||||
each socket's core numbers, hence the number of places is equal
|
||||
to the number of sockets.
|
||||
|
||||
The outer parallel region forms a team of threads, and each thread
|
||||
executes on a socket (place) because the \code{proc\_bind} clause uses
|
||||
\code{spread} in the outer \code{parallel} construct.
|
||||
Next, in the \plc{socket\_init} function, an inner parallel region creates a team
|
||||
of threads equal to the number of elements (core numbers) from the place
|
||||
of the parent thread. Because the outer \code{parallel} construct uses
|
||||
a \code{spread} affinity policy, each of its threads inherits a subpartition of
|
||||
the original partition. Hence, the \code{omp\_get\_place\_num\_procs} query function
|
||||
returns the number of elements (here procs = cores) in the subpartition of the thread.
|
||||
After each parent thread creates its nested parallel region on the socket,
|
||||
the socket number and thread number are reported.
|
||||
|
||||
Note: Portable tools like hwloc (Portable HardWare LOCality package), which support
|
||||
many common operating systems, can be used to determine the configuration of a system.
|
||||
On some systems there are utilities, files or user guides that provide configuration
|
||||
information. For instance, the socket number and proc\_id's for a socket
|
||||
can be found in the /proc/cpuinfo text file on Linux systems.
|
||||
|
||||
\cexample{affinity}{6}
|
||||
|
||||
\ffreeexample{affinity}{6}
|
||||
|
@ -1,35 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Array Sections in Device Constructs}
|
||||
\label{sec:array_sections}
|
||||
|
||||
The following examples show the usage of array sections in \code{map} clauses
|
||||
on \code{target} and \code{target} \code{data} constructs.
|
||||
|
||||
This example shows the invalid usage of two separate sections of the same array
|
||||
inside of a \code{target} construct.
|
||||
|
||||
\cexample{array_sections}{1}
|
||||
|
||||
\ffreeexample{array_sections}{1}
|
||||
|
||||
This example shows the invalid usage of two separate sections of the same array
|
||||
inside of a \code{target} construct.
|
||||
|
||||
\cexample{array_sections}{2}
|
||||
|
||||
\ffreeexample{array_sections}{2}
|
||||
|
||||
This example shows the valid usage of two separate sections of the same array inside
|
||||
of a \code{target} construct.
|
||||
|
||||
\cexample{array_sections}{3}
|
||||
|
||||
\ffreeexample{array_sections}{3}
|
||||
|
||||
This example shows the valid usage of a wholly contained array section of an already
|
||||
mapped array section inside of a \code{target} construct.
|
||||
|
||||
\cexample{array_sections}{4}
|
||||
|
||||
\ffreeexample{array_sections}{4}
|
||||
|
@ -1,32 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Fortran \code{ASSOCIATE} Construct}
|
||||
\fortranspecificstart
|
||||
\label{sec:associate}
|
||||
|
||||
The following is an invalid example of specifying an associate name on a data-sharing attribute
|
||||
clause. The constraint in the Data Sharing Attribute Rules section in the OpenMP
|
||||
4.0 API Specifications states that an associate name preserves the association
|
||||
with the selector established at the \code{ASSOCIATE} statement. The associate
|
||||
name \plc{b} is associated with the shared variable \plc{a}. With the predetermined data-sharing
|
||||
attribute rule, the associate name \plc{b} is not allowed to be specified on the \code{private}
|
||||
clause.
|
||||
|
||||
\fnexample{associate}{1}
|
||||
|
||||
In the next example, within the \code{parallel} construct, the association name \plc{thread\_id}
|
||||
is associated with the private copy of \plc{i}. The print statement should output the
|
||||
unique thread number.
|
||||
|
||||
\fnexample{associate}{2}
|
||||
|
||||
The following example illustrates the effect of specifying a selector name on a data-sharing
|
||||
attribute clause. The associate name \plc{u} is associated with \plc{v} and the variable \plc{v}
|
||||
is specified on the \code{private} clause of the \code{parallel} construct.
|
||||
The construct association is established prior to the \code{parallel} region.
|
||||
The association between \plc{u} and the original \plc{v} is retained (see the Data Sharing
|
||||
Attribute Rules section in the OpenMP 4.0 API Specifications). Inside the \code{parallel}
|
||||
region, \plc{v} has the value of -1 and \plc{u} has the value of the original \plc{v}.
|
||||
|
||||
\ffreenexample{associate}{3}
|
||||
\fortranspecificend
|
||||
|
@ -1,15 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Asynchronous \code{target} Execution and Dependences}
|
||||
\label{sec:async_target_exec_depend}
|
||||
|
||||
Asynchronous execution of a \code{target} region can be accomplished
|
||||
by creating an explicit task around the \code{target} region. Examples
|
||||
with explicit tasks are shown at the beginning of this section.
|
||||
|
||||
As of OpenMP 4.5 and beyond the \code{nowait} clause can be used on the
|
||||
\code{target} directive for asynchronous execution. Examples with
|
||||
\code{nowait} clauses follow the explicit \code{task} examples.
|
||||
|
||||
This section also shows the use of \code{depend} clauses to order
|
||||
executions through dependences.
|
||||
|
@ -1,31 +0,0 @@
|
||||
\subsection{\code{nowait} Clause on \code{target} Construct}
|
||||
\label{subsec:target_nowait_clause}
|
||||
|
||||
The following example shows how to execute code asynchronously on a
|
||||
device without an explicit task. The \code{nowait} clause on a \code{target}
|
||||
construct allows the thread of the \plc{target task} to perform other
|
||||
work while waiting for the \code{target} region execution to complete.
|
||||
Hence, the \code{target} region can execute asynchronously on the
|
||||
device (without requiring a host thread to idle while waiting for
|
||||
the \plc{target task} execution to complete).
|
||||
|
||||
In this example the product of two vectors (arrays), \plc{v1}
|
||||
and \plc{v2}, is formed. One half of the operations is performed
|
||||
on the device, and the last half on the host, concurrently.
|
||||
|
||||
After a team of threads is formed the master thread generates
|
||||
the \plc{target task} while the other threads can continue on, without a barrier,
|
||||
to the execution of the host portion of the vector product.
|
||||
The completion of the \plc{target task} (asynchronous target execution) is
|
||||
guaranteed by the synchronization in the implicit barrier at the end of the
|
||||
host vector-product worksharing loop region. See the \code{barrier}
|
||||
glossary entry in the OpenMP specification for details.
|
||||
|
||||
The host loop scheduling is \code{dynamic}, to balance the host thread executions, since
|
||||
one thread is being used for offload generation. In the situation where
|
||||
little time is spent by the \plc{target task} in setting
|
||||
up and tearing down the target execution, \code{static} scheduling may be desired.
|
||||
|
||||
\cexample{async_target}{3}
|
||||
|
||||
\ffreeexample{async_target}{3}
|
@ -1,18 +0,0 @@
|
||||
%begin
|
||||
\subsection{Asynchronous \code{target} with \code{nowait} and \code{depend} Clauses}
|
||||
\label{subsec:async_target_nowait_depend}
|
||||
|
||||
More details on dependences can be found in \specref{sec:task_depend}, Task
|
||||
Dependences. In this example, there are three flow dependences. In the first two dependences the
|
||||
target task does not execute until the preceding explicit tasks have finished. These
|
||||
dependences are produced by arrays \plc{v1} and \plc{v2} with the \code{out} dependence type in the first two tasks, and the \code{in} dependence type in the target task.
|
||||
|
||||
The last dependence is produced by array \plc{p} with the \code{out} dependence type in the target task, and the \code{in} dependence type in the last task. The last task does not execute until the target task finishes.
|
||||
|
||||
The \code{nowait} clause on the \code{target} construct creates a deferrable \plc{target task}, allowing the encountering task to continue execution without waiting for the completion of the \plc{target task}.
|
||||
|
||||
\cexample{async_target}{4}
|
||||
|
||||
\ffreeexample{async_target}{4}
|
||||
|
||||
%end
|
@ -1,54 +0,0 @@
|
||||
\subsection{Asynchronous \code{target} with Tasks}
|
||||
\label{subsec:async_target_with_tasks}
|
||||
|
||||
The following example shows how the \code{task} and \code{target} constructs
|
||||
are used to execute multiple \code{target} regions asynchronously. The task that
|
||||
encounters the \code{task} construct generates an explicit task that contains
|
||||
a \code{target} region. The thread executing the explicit task encounters a task
|
||||
scheduling point while waiting for the execution of the \code{target} region
|
||||
to complete, allowing the thread to switch back to the execution of the encountering
|
||||
task or one of the previously generated explicit tasks.
|
||||
|
||||
\cexample{async_target}{1}
|
||||
|
||||
The Fortran version has an interface block that contains the \code{declare} \code{target}.
|
||||
An identical statement exists in the function declaration (not shown here).
|
||||
|
||||
\ffreeexample{async_target}{1}
|
||||
|
||||
The following example shows how the \code{task} and \code{target} constructs
|
||||
are used to execute multiple \code{target} regions asynchronously. The task dependence
|
||||
ensures that the storage is allocated and initialized on the device before it is
|
||||
accessed.
|
||||
|
||||
\cexample{async_target}{2}
|
||||
|
||||
The Fortran example below is similar to the C version above. Instead of pointers, though, it uses
|
||||
the convenience of Fortran allocatable arrays on the device. In order to preserve the arrays
|
||||
allocated on the device across multiple \code{target} regions, a \code{target}~\code{data} region
|
||||
is used in this case.
|
||||
|
||||
If there is no shape specified for an allocatable array in a \code{map} clause, only the array descriptor
|
||||
(also called a dope vector) is mapped. That is, device space is created for the descriptor, and it
|
||||
is initially populated with host values. In this case, the \plc{v1} and \plc{v2} arrays will be in a
|
||||
non-associated state on the device. When space for \plc{v1} and \plc{v2} is allocated on the device
|
||||
in the first \code{target} region the addresses to the space will be included in their descriptors.
|
||||
|
||||
At the end of the first \code{target} region, the arrays \plc{v1} and \plc{v2} are preserved on the device
|
||||
for access in the second \code{target} region. At the end of the second \code{target} region, the data
|
||||
in array \plc{p} is copied back, the arrays \plc{v1} and \plc{v2} are not.
|
||||
|
||||
A \code{depend} clause is used in the \code{task} directive to provide a wait at the beginning of the second
|
||||
\code{target} region, to ensure that there is no race condition with \plc{v1} and \plc{v2} in the two tasks.
|
||||
It would be noncompliant to use \plc{v1} and/or \plc{v2} in lieu of \plc{N} in the \code{depend} clauses,
|
||||
because the use of non-allocated allocatable arrays as list items in a \code{depend} clause would
|
||||
lead to unspecified behavior.
|
||||
|
||||
\noteheader{--} This example is not strictly compliant with the OpenMP 4.5 specification since the allocation status
|
||||
of allocatable arrays \plc{v1} and \plc{v2} is changed inside the \code{target} region, which is not allowed.
|
||||
(See the restrictions for the \code{map} clause in the \plc{Data-mapping Attribute Rules and Clauses}
|
||||
section of the specification.)
|
||||
However, the intention is to relax the restrictions on mapping of allocatable variables in the next release
|
||||
of the specification so that the example will be compliant.
|
||||
|
||||
\ffreeexample{async_target}{2}
|
@ -1,44 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{atomic} Construct}
|
||||
\label{sec:atomic}
|
||||
|
||||
The following example avoids race conditions (simultaneous updates of an element
|
||||
of \plc{x} by multiple threads) by using the \code{atomic} construct.
|
||||
|
||||
The advantage of using the \code{atomic} construct in this example is that it
|
||||
allows updates of two different elements of \plc{x} to occur in parallel. If
|
||||
a \code{critical} construct were used instead, then all updates to elements of
|
||||
\plc{x} would be executed serially (though not in any guaranteed order).
|
||||
|
||||
Note that the \code{atomic} directive applies only to the statement immediately
|
||||
following it. As a result, elements of \plc{y} are not updated atomically in
|
||||
this example.
|
||||
|
||||
\cexample{atomic}{1}
|
||||
|
||||
\fexample{atomic}{1}
|
||||
|
||||
The following example illustrates the \code{read} and \code{write} clauses
|
||||
for the \code{atomic} directive. These clauses ensure that the given variable
|
||||
is read or written, respectively, as a whole. Otherwise, some other thread might
|
||||
read or write part of the variable while the current thread was reading or writing
|
||||
another part of the variable. Note that most hardware provides atomic reads and
|
||||
writes for some set of properly aligned variables of specific sizes, but not necessarily
|
||||
for all the variable types supported by the OpenMP API.
|
||||
|
||||
\cexample{atomic}{2}
|
||||
|
||||
\fexample{atomic}{2}
|
||||
|
||||
The following example illustrates the \code{capture} clause for the \code{atomic}
|
||||
directive. In this case the value of a variable is captured, and then the variable
|
||||
is incremented. These operations occur atomically. This particular example could
|
||||
be implemented using the fetch-and-add instruction available on many kinds of hardware.
|
||||
The example also shows a way to implement a spin lock using the \code{capture}
|
||||
and \code{read} clauses.
|
||||
|
||||
\cexample{atomic}{3}
|
||||
|
||||
\fexample{atomic}{3}
|
||||
|
||||
|
@ -1,25 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Restrictions on the \code{atomic} Construct}
|
||||
\label{sec:atomic_restrict}
|
||||
|
||||
The following non-conforming examples illustrate the restrictions on the \code{atomic}
|
||||
construct.
|
||||
|
||||
\cexample{atomic_restrict}{1}
|
||||
|
||||
\fexample{atomic_restrict}{1}
|
||||
|
||||
\cexample{atomic_restrict}{2}
|
||||
|
||||
\fortranspecificstart
|
||||
The following example is non-conforming because \code{I} and \code{R} reference
|
||||
the same location but have different types.
|
||||
|
||||
\fnexample{atomic_restrict}{2}
|
||||
|
||||
Although the following example might work on some implementations, this is also
|
||||
non-conforming:
|
||||
|
||||
\fnexample{atomic_restrict}{3}
|
||||
\fortranspecificend
|
||||
|
@ -1,24 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Binding of \code{barrier} Regions}
|
||||
\label{sec:barrier_regions}
|
||||
|
||||
The binding rules call for a \code{barrier} region to bind to the closest enclosing
|
||||
\code{parallel} region.
|
||||
|
||||
In the following example, the call from the main program to \plc{sub2} is conforming
|
||||
because the \code{barrier} region (in \plc{sub3}) binds to the \code{parallel}
|
||||
region in \plc{sub2}. The call from the main program to \plc{sub1} is conforming
|
||||
because the \code{barrier} region binds to the \code{parallel} region in subroutine
|
||||
\plc{sub2}.
|
||||
|
||||
The call from the main program to \plc{sub3} is conforming because the \code{barrier}
|
||||
region binds to the implicit inactive \code{parallel} region enclosing the sequential
|
||||
part. Also note that the \code{barrier} region in \plc{sub3} when called from
|
||||
\plc{sub2} only synchronizes the team of threads in the enclosing \code{parallel}
|
||||
region and not all the threads created in \plc{sub1}.
|
||||
|
||||
\cexample{barrier_regions}{1}
|
||||
|
||||
\fexample{barrier_regions}{1}
|
||||
|
||||
|
@ -1,42 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Cancellation Constructs}
|
||||
\label{sec:cancellation}
|
||||
|
||||
The following example shows how the \code{cancel} directive can be used to terminate
|
||||
an OpenMP region. Although the \code{cancel} construct terminates the OpenMP
|
||||
worksharing region, programmers must still track the exception through the pointer
|
||||
\plc{ex} and issue a cancellation for the \code{parallel} region if an exception has
|
||||
been raised. The master thread checks the exception pointer to make sure that the
|
||||
exception is properly handled in the sequential part. If cancellation of the \code{parallel}
|
||||
region has been requested, some threads might have executed \code{phase\_1()}.
|
||||
However, it is guaranteed that none of the threads executed \code{phase\_2()}.
|
||||
|
||||
\cppexample{cancellation}{1}
|
||||
|
||||
|
||||
The following example illustrates the use of the \code{cancel} construct in error
|
||||
handling. If there is an error condition from the \code{allocate} statement,
|
||||
the cancellation is activated. The encountering thread sets the shared variable
|
||||
\code{err} and other threads of the binding thread set proceed to the end of
|
||||
the worksharing construct after the cancellation has been activated.
|
||||
|
||||
\ffreeexample{cancellation}{1}
|
||||
|
||||
The following example shows how to cancel a parallel search on a binary tree as
|
||||
soon as the search value has been detected. The code creates a task to descend
|
||||
into the child nodes of the current tree node. If the search value has been found,
|
||||
the code remembers the tree node with the found value through an \code{atomic}
|
||||
write to the result variable and then cancels execution of all search tasks. The
|
||||
function \code{search\_tree\_parallel} groups all search tasks into a single
|
||||
task group to control the effect of the \code{cancel taskgroup} directive. The
|
||||
\plc{level} argument is used to create undeferred tasks after the first ten
|
||||
levels of the tree.
|
||||
|
||||
\cexample{cancellation}{2}
|
||||
|
||||
|
||||
The following is the equivalent parallel search example in Fortran.
|
||||
|
||||
\ffreeexample{cancellation}{2}
|
||||
|
||||
|
@ -1,37 +0,0 @@
|
||||
\pagebreak
|
||||
\section{C/C++ Arrays in a \code{firstprivate} Clause}
|
||||
\ccppspecificstart
|
||||
\label{sec:carrays_fpriv}
|
||||
|
||||
The following example illustrates the size and value of list items of array or
|
||||
pointer type in a \code{firstprivate} clause. The size of new list items is
|
||||
based on the type of the corresponding original list item, as determined by the
|
||||
base language.
|
||||
|
||||
In this example:
|
||||
|
||||
\begin{compactitem}
|
||||
\item The type of \code{A} is array of two arrays of two ints.
|
||||
|
||||
\item The type of \code{B} is adjusted to pointer to array of \code{n}
|
||||
ints, because it is a function parameter.
|
||||
|
||||
\item The type of \code{C} is adjusted to pointer to int, because
|
||||
it is a function parameter.
|
||||
|
||||
\item The type of \code{D} is array of two arrays of two ints.
|
||||
|
||||
\item The type of \code{E} is array of \code{n} arrays of \code{n}
|
||||
ints.
|
||||
\end{compactitem}
|
||||
|
||||
Note that \code{B} and \code{E} involve variable length array types.
|
||||
|
||||
The new items of array type are initialized as if each integer element of the original
|
||||
array is assigned to the corresponding element of the new array. Those of pointer
|
||||
type are initialized as if by assignment from the original item to the new item.
|
||||
|
||||
\cnexample{carrays_fpriv}{1}
|
||||
\ccppspecificend
|
||||
|
||||
|
@ -1,78 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{collapse} Clause}
|
||||
\label{sec:collapse}
|
||||
|
||||
In the following example, the \code{k} and \code{j} loops are associated with
|
||||
the loop construct. So the iterations of the \code{k} and \code{j} loops are
|
||||
collapsed into one loop with a larger iteration space, and that loop is then divided
|
||||
among the threads in the current team. Since the \code{i} loop is not associated
|
||||
with the loop construct, it is not collapsed, and the \code{i} loop is executed
|
||||
sequentially in its entirety in every iteration of the collapsed \code{k} and
|
||||
\code{j} loop.
|
||||
|
||||
The variable \code{j} can be omitted from the \code{private} clause when the
|
||||
\code{collapse} clause is used since it is implicitly private. However, if the
|
||||
\code{collapse} clause is omitted then \code{j} will be shared if it is omitted
|
||||
from the \code{private} clause. In either case, \code{k} is implicitly private
|
||||
and could be omitted from the \code{private} clause.
|
||||
|
||||
\cexample{collapse}{1}
|
||||
|
||||
\fexample{collapse}{1}
|
||||
|
||||
In the next example, the \code{k} and \code{j} loops are associated with the
|
||||
loop construct. So the iterations of the \code{k} and \code{j} loops are collapsed
|
||||
into one loop with a larger iteration space, and that loop is then divided among
|
||||
the threads in the current team.
|
||||
|
||||
The sequential execution of the iterations in the \code{k} and \code{j} loops
|
||||
determines the order of the iterations in the collapsed iteration space. This implies
|
||||
that in the sequentially last iteration of the collapsed iteration space, \code{k}
|
||||
will have the value \code{2} and \code{j} will have the value \code{3}. Since
|
||||
\code{klast} and \code{jlast} are \code{lastprivate}, their values are assigned
|
||||
by the sequentially last iteration of the collapsed \code{k} and \code{j} loop.
|
||||
This example prints: \code{2 3}.
|
||||
|
||||
\cexample{collapse}{2}
|
||||
|
||||
\fexample{collapse}{2}
|
||||
|
||||
The next example illustrates the interaction of the \code{collapse} and \code{ordered}
|
||||
clauses.
|
||||
|
||||
In the example, the loop construct has both a \code{collapse} clause and an \code{ordered}
|
||||
clause. The \code{collapse} clause causes the iterations of the \code{k} and
|
||||
\code{j} loops to be collapsed into one loop with a larger iteration space, and
|
||||
that loop is divided among the threads in the current team. An \code{ordered}
|
||||
clause is added to the loop construct, because an ordered region binds to the loop
|
||||
region arising from the loop construct.
|
||||
|
||||
According to Section 2.12.8 of the OpenMP 4.0 specification,
|
||||
a thread must not execute more than one ordered region that binds
|
||||
to the same loop region. So the \code{collapse} clause is required for the example
|
||||
to be conforming. With the \code{collapse} clause, the iterations of the \code{k}
|
||||
and \code{j} loops are collapsed into one loop, and therefore only one ordered
|
||||
region will bind to the collapsed \code{k} and \code{j} loop. Without the \code{collapse}
|
||||
clause, there would be two ordered regions that bind to each iteration of the \code{k}
|
||||
loop (one arising from the first iteration of the \code{j} loop, and the other
|
||||
arising from the second iteration of the \code{j} loop).
|
||||
|
||||
The code prints
|
||||
|
||||
\code{0 1 1}
|
||||
\\
|
||||
\code{0 1 2}
|
||||
\\
|
||||
\code{0 2 1}
|
||||
\\
|
||||
\code{1 2 2}
|
||||
\\
|
||||
\code{1 3 1}
|
||||
\\
|
||||
\code{1 3 2}
|
||||
|
||||
\cexample{collapse}{3}
|
||||
|
||||
\fexample{collapse}{3}
|
||||
|
||||
|
@ -1,13 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{copyin} Clause}
|
||||
\label{sec:copyin}
|
||||
|
||||
The \code{copyin} clause is used to initialize threadprivate data upon entry
|
||||
to a \code{parallel} region. The value of the threadprivate variable in the master
|
||||
thread is copied to the threadprivate variable of each other team member.
|
||||
|
||||
\cexample{copyin}{1}
|
||||
|
||||
\fexample{copyin}{1}
|
||||
|
||||
|
@ -1,51 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{copyprivate} Clause}
|
||||
\label{sec:copyprivate}
|
||||
|
||||
The \code{copyprivate} clause can be used to broadcast values acquired by a single
|
||||
thread directly to all instances of the private variables in the other threads.
|
||||
In this example, if the routine is called from the sequential part, its behavior
|
||||
is not affected by the presence of the directives. If it is called from a \code{parallel}
|
||||
region, then the actual arguments with which \code{a} and \code{b} are associated
|
||||
must be private.
|
||||
|
||||
The thread that executes the structured block associated with the \code{single}
|
||||
construct broadcasts the values of the private variables \code{a}, \code{b},
|
||||
\code{x}, and
|
||||
\code{y} from its implicit task's data environment to the data environments
|
||||
of the other implicit tasks in the thread team. The broadcast completes before
|
||||
any of the threads have left the barrier at the end of the construct.
|
||||
|
||||
\cexample{copyprivate}{1}
|
||||
|
||||
\fexample{copyprivate}{1}
|
||||
|
||||
In this example, assume that the input must be performed by the master thread.
|
||||
Since the \code{master} construct does not support the \code{copyprivate} clause,
|
||||
it cannot broadcast the input value that is read. However, \code{copyprivate}
|
||||
is used to broadcast an address where the input value is stored.
|
||||
|
||||
\cexample{copyprivate}{2}
|
||||
|
||||
\fexample{copyprivate}{2}
|
||||
|
||||
Suppose that the number of lock variables required within a \code{parallel} region
|
||||
cannot easily be determined prior to entering it. The \code{copyprivate} clause
|
||||
can be used to provide access to shared lock variables that are allocated within
|
||||
that \code{parallel} region.
|
||||
|
||||
\cexample{copyprivate}{3}
|
||||
|
||||
\fortranspecificstart
|
||||
\fnexample{copyprivate}{3}
|
||||
|
||||
Note that the effect of the \code{copyprivate} clause on a variable with the
|
||||
\code{allocatable} attribute is different than on a variable with the \code{pointer}
|
||||
attribute. The value of \code{A} is copied (as if by intrinsic assignment) and
|
||||
the pointer \code{B} is copied (as if by pointer assignment) to the corresponding
|
||||
list items in the other implicit tasks belonging to the \code{parallel} region.
|
||||
|
||||
\fnexample{copyprivate}{4}
|
||||
\fortranspecificend
|
||||
|
||||
|
@ -1,20 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{critical} Construct}
|
||||
\label{sec:critical}
|
||||
|
||||
The following example includes several \code{critical} constructs. The example
|
||||
illustrates a queuing model in which a task is dequeued and worked on. To guard
|
||||
against multiple threads dequeuing the same task, the dequeuing operation must
|
||||
be in a \code{critical} region. Because the two queues in this example are independent,
|
||||
they are protected by \code{critical} constructs with different names, \plc{xaxis}
|
||||
and \plc{yaxis}.
|
||||
|
||||
\cexample{critical}{1}
|
||||
|
||||
\fexample{critical}{1}
|
||||
|
||||
The following example extends the previous example by adding the \code{hint} clause to the \code{critical} constructs.
|
||||
|
||||
\cexample{critical}{2}
|
||||
|
||||
\fexample{critical}{2}
|
@ -1,142 +0,0 @@
|
||||
\pagebreak
|
||||
\section{\code{declare} \code{target} Construct}
|
||||
\label{sec:declare_target}
|
||||
|
||||
\subsection{\code{declare} \code{target} and \code{end} \code{declare} \code{target} for a Function}
|
||||
\label{subsec:declare_target_function}
|
||||
|
||||
The following example shows how the \code{declare} \code{target} directive
|
||||
is used to indicate that the corresponding call inside a \code{target} region
|
||||
is to a \code{fib} function that can execute on the default target device.
|
||||
|
||||
A version of the function is also available on the host device. When the \code{if}
|
||||
clause conditional expression on the \code{target} construct evaluates to \plc{false},
|
||||
the \code{target} region (thus \code{fib}) will execute on the host device.
|
||||
|
||||
For C/C++ codes the declaration of the function \code{fib} appears between the \code{declare}
|
||||
\code{target} and \code{end} \code{declare} \code{target} directives.
|
||||
|
||||
\cexample{declare_target}{1}
|
||||
|
||||
The Fortran \code{fib} subroutine contains a \code{declare} \code{target} declaration
|
||||
to indicate to the compiler to create a device executable version of the procedure.
|
||||
The subroutine name has not been included on the \code{declare} \code{target}
|
||||
directive and is, therefore, implicitly assumed.
|
||||
|
||||
The program uses the \code{module\_fib} module, which presents an explicit interface to
|
||||
the compiler with the \code{declare} \code{target} declarations for processing
|
||||
the \code{fib} call.
|
||||
|
||||
\ffreeexample{declare_target}{1}
|
||||
|
||||
The next Fortran example shows the use of an external subroutine. Without an explicit
|
||||
interface (through module use or an interface block) the \code{declare} \code{target}
|
||||
declarations within an external subroutine are unknown to the main program unit;
|
||||
therefore, a \code{declare} \code{target} must be provided within the program
|
||||
scope for the compiler to determine that a target binary should be available.
|
||||
|
||||
\ffreeexample{declare_target}{2}
|
||||
|
||||
\subsection{\code{declare} \code{target} Construct for Class Type}
|
||||
\label{subsec:declare_target_class}
|
||||
|
||||
\cppspecificstart
|
||||
The following example shows how the \code{declare} \code{target} and \code{end}
|
||||
\code{declare} \code{target} directives are used to enclose the declaration
|
||||
of a variable \plc{varY} with a class type \code{typeY}. The member function \code{typeY::foo()} cannot
|
||||
be accessed on a target device because its declaration did not appear between \code{declare}
|
||||
\code{target} and \code{end} \code{declare} \code{target} directives.
|
||||
|
||||
\cppnexample{declare_target}{2}
|
||||
\cppspecificend
|
||||
|
||||
\subsection{\code{declare} \code{target} and \code{end} \code{declare} \code{target} for Variables}
|
||||
\label{subsec:declare_target_variables}
|
||||
|
||||
The following examples show how the \code{declare} \code{target} and \code{end}
|
||||
\code{declare} \code{target} directives are used to indicate that global variables
|
||||
are mapped to the implicit device data environment of each target device.
|
||||
|
||||
In the following example, the declarations of the variables \plc{p}, \plc{v1}, and \plc{v2} appear
|
||||
between \code{declare} \code{target} and \code{end} \code{declare} \code{target}
|
||||
directives indicating that the variables are mapped to the implicit device data
|
||||
environment of each target device. The \code{target} \code{update} directive
|
||||
is then used to manage the consistency of the variables \plc{p}, \plc{v1}, and \plc{v2} between the
|
||||
data environment of the encountering host device task and the implicit device data
|
||||
environment of the default target device.
|
||||
|
||||
\cexample{declare_target}{3}
|
||||
|
||||
The Fortran version of the above C code uses a different syntax. Fortran modules
|
||||
use a list syntax on the \code{declare} \code{target} directive to declare
|
||||
mapped variables.
|
||||
|
||||
\ffreeexample{declare_target}{3}
|
||||
|
||||
The following example also indicates that the function \code{Pfun()} is available on the
|
||||
target device, as well as the variable \plc{Q}, which is mapped to the implicit device
|
||||
data environment of each target device. The \code{target} \code{update} directive
|
||||
is then used to manage the consistency of the variable \plc{Q} between the data environment
|
||||
of the encountering host device task and the implicit device data environment of
|
||||
the default target device.
|
||||
|
||||
In the following example, the function and variable declarations appear between
|
||||
the \code{declare} \code{target} and \code{end} \code{declare} \code{target}
|
||||
directives.
|
||||
|
||||
\cexample{declare_target}{4}
|
||||
|
||||
The Fortran version of the above C code uses a different syntax. In Fortran modules
|
||||
a list syntax on the \code{declare} \code{target} directive is used to declare
|
||||
mapped variables and procedures. The \plc{N} and \plc{Q} variables are declared as a comma
|
||||
separated list. When the \code{declare} \code{target} directive is used to
|
||||
declare just the procedure, the procedure name need not be listed -- it is implicitly
|
||||
assumed, as illustrated in the \code{Pfun()} function.
|
||||
|
||||
\ffreeexample{declare_target}{4}
|
||||
|
||||
\subsection{\code{declare} \code{target} and \code{end} \code{declare} \code{target} with \code{declare} \code{simd}}
|
||||
\label{subsec:declare_target_simd}
|
||||
|
||||
The following example shows how the \code{declare} \code{target} and \code{end}
|
||||
\code{declare} \code{target} directives are used to indicate that a function
|
||||
is available on a target device. The \code{declare} \code{simd} directive indicates
|
||||
that there is a SIMD version of the function \code{P()} that is available on the target
|
||||
device as well as one that is available on the host device.
|
||||
|
||||
\cexample{declare_target}{5}
|
||||
|
||||
The Fortran version of the above C code uses a different syntax. Fortran modules
|
||||
use a list syntax of the \code{declare} \code{target} declaration for the mapping.
|
||||
Here the \plc{N} and \plc{Q} variables are declared in the list form as a comma separated list.
|
||||
The function declaration does not use a list and implicitly assumes the function
|
||||
name. In this Fortran example row and column indices are reversed relative to the
|
||||
C/C++ example, as is usual for codes optimized for memory access.
|
||||
|
||||
\ffreeexample{declare_target}{5}
|
||||
|
||||
|
||||
\subsection{\code{declare}~\code{target} Directive with \code{link} Clause}
|
||||
\label{subsec:declare_target_link}
|
||||
|
||||
In the OpenMP 4.5 standard the \code{declare}~\code{target} directive was extended to allow static
|
||||
data to be mapped, \emph{when needed}, through a \code{link} clause.
|
||||
|
||||
Data storage for items listed in the \code{link} clause becomes available on the device
|
||||
when it is mapped implicitly or explicitly in a \code{map} clause, and it persists for the scope of
|
||||
the mapping (as specified by a \code{target} construct,
|
||||
a \code{target}~\code{data} construct, or
|
||||
\code{target}~\code{enter/exit}~\code{data} constructs).
|
||||
|
||||
Tip: When all the global data items will not fit on a device and are not needed
|
||||
simultaneously, use the \code{link} clause and map the data only when it is needed.
|
||||
|
||||
The following C and Fortran examples show two sets of data (single precision and double precision)
|
||||
that are global on the host for the entire execution on the host, but are only used
|
||||
globally on the device for part of the program execution. The single precision data
|
||||
are allocated and persist only for the first \code{target} region. Similarly, the
|
||||
double precision data are in scope on the device only for the second \code{target} region.
|
||||
|
||||
\cexample{declare_target}{6}
|
||||
\ffreeexample{declare_target}{6}
|
||||
|
@ -1,19 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{default(none)} Clause}
|
||||
\label{sec:default_none}
|
||||
|
||||
The following example distinguishes the variables that are affected by the \code{default(none)}
|
||||
clause from those that are not.
|
||||
|
||||
\ccppspecificstart
|
||||
Beginning with OpenMP 4.0, variables with \code{const}-qualified type and no mutable member
|
||||
are no longer predetermined shared. Thus, these variables (variable \plc{c} in the example)
|
||||
need to be explicitly listed
|
||||
in data-sharing attribute clauses when the \code{default(none)} clause is specified.
|
||||
|
||||
\cnexample{default_none}{1}
|
||||
\ccppspecificend
|
||||
|
||||
\fexample{default_none}{1}
|
||||
|
||||
|
@ -1,57 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Device Routines}
|
||||
\label{sec:device}
|
||||
|
||||
\subsection{\code{omp\_is\_initial\_device} Routine}
|
||||
\label{subsec:device_is_initial}
|
||||
|
||||
The following example shows how the \code{omp\_is\_initial\_device} runtime library routine
|
||||
can be used to query if a code is executing on the initial host device or on a
|
||||
target device. The example then sets the number of threads in the \code{parallel}
|
||||
region based on where the code is executing.
|
||||
|
||||
\cexample{device}{1}
|
||||
|
||||
\ffreeexample{device}{1}
|
||||
|
||||
\subsection{\code{omp\_get\_num\_devices} Routine}
|
||||
\label{subsec:device_num_devices}
|
||||
|
||||
The following example shows how the \code{omp\_get\_num\_devices} runtime library routine
|
||||
can be used to determine the number of devices.
|
||||
|
||||
\cexample{device}{2}
|
||||
|
||||
\ffreeexample{device}{2}
|
||||
|
||||
\subsection{\code{omp\_set\_default\_device} and \\
|
||||
\code{omp\_get\_default\_device} Routines}
|
||||
\label{subsec:device_is_set_get_default}
|
||||
|
||||
The following example shows how the \code{omp\_set\_default\_device} and \code{omp\_get\_default\_device}
|
||||
runtime library routines can be used to set the default device and determine the
|
||||
default device, respectively.
|
||||
|
||||
\cexample{device}{3}
|
||||
|
||||
\ffreeexample{device}{3}
|
||||
|
||||
|
||||
\subsection{Target Memory and Device Pointers Routines}
|
||||
\label{subsec:target_mem_and_device_ptrs}
|
||||
|
||||
The following example shows how to create space on a device, transfer data
|
||||
to and from that space, and free the space, using API calls. The API calls
|
||||
directly execute allocation, copy and free operations on the device, without invoking
|
||||
any mapping through a \code{target} directive. The \code{omp\_target\_alloc} routine allocates space
|
||||
and returns a device pointer for referencing the space in the \code{omp\_target\_memcpy}
|
||||
API routine on the host. The \code{omp\_target\_free} routine frees the space on the device.
|
||||
|
||||
The example also illustrates how to access that space
|
||||
in a \code{target} region by exposing the device pointer in an \code{is\_device\_ptr} clause.
|
||||
|
||||
The example creates an array of cosine values on the default device, to be used
|
||||
on the host device. The function fails if a default device is not available.
|
||||
|
||||
\cexample{device}{4}
|
||||
|
@ -1,68 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Doacross Loop Nest}
|
||||
\label{sec:doacross}
|
||||
|
||||
An \code{ordered} clause can be used on a loop construct with an integer
|
||||
parameter argument to define the number of associated loops within
|
||||
a \plc{doacross loop nest} where cross-iteration dependences exist.
|
||||
A \code{depend} clause on an \code{ordered} construct within an ordered
|
||||
loop describes the dependences of the \plc{doacross} loops.
|
||||
|
||||
In the code below, the \code{depend(sink:i-1)} clause defines an \plc{i-1}
|
||||
to \plc{i} cross-iteration dependence that specifies a wait point for
|
||||
the completion of computation from iteration \plc{i-1} before proceeding
|
||||
to the subsequent statements. The \code{depend(source)} clause indicates
|
||||
the completion of computation from the current iteration (\plc{i})
|
||||
to satisfy the cross-iteration dependence that arises from the iteration.
|
||||
For this example the same sequential ordering could have been achieved
|
||||
with an \code{ordered} clause without a parameter, on the loop directive,
|
||||
and a single \code{ordered} directive without the \code{depend} clause
|
||||
specified for the statement executing the \plc{bar} function.
|
||||
|
||||
\cexample{doacross}{1}
|
||||
|
||||
\ffreeexample{doacross}{1}
|
||||
|
||||
The following code is similar to the previous example but with
|
||||
\plc{doacross loop nest} extended to two nested loops, \plc{i} and \plc{j},
|
||||
as specified by the \code{ordered(2)} clause on the loop directive.
|
||||
In the C/C++ code, the \plc{i} and \plc{j} loops are the first and
|
||||
second associated loops, respectively, whereas
|
||||
in the Fortran code, the \plc{j} and \plc{i} loops are the first and
|
||||
second associated loops, respectively.
|
||||
The \code{depend(sink:i-1,j)} and \code{depend(sink:i,j-1)} clauses in
|
||||
the C/C++ code define cross-iteration dependences in two dimensions from
|
||||
iterations (\plc{i-1, j}) and (\plc{i, j-1}) to iteration (\plc{i, j}).
|
||||
Likewise, the \code{depend(sink:j-1,i)} and \code{depend(sink:j,i-1)} clauses
|
||||
in the Fortran code define cross-iteration dependences from iterations
|
||||
(\plc{j-1, i}) and (\plc{j, i-1}) to iteration (\plc{j, i}).
|
||||
|
||||
\cexample{doacross}{2}
|
||||
|
||||
\ffreeexample{doacross}{2}
|
||||
|
||||
|
||||
The following example shows the incorrect use of the \code{ordered}
|
||||
directive with a \code{depend} clause. There are two issues with the code.
|
||||
The first issue is a missing \code{ordered}~\code{depend(source)} directive,
|
||||
which could cause a deadlock.
|
||||
The second issue is that the \code{depend(sink:i+1,j)} and \code{depend(sink:i,j+1)}
|
||||
clauses define dependences on lexicographically later
|
||||
source iterations (\plc{i+1, j}) and (\plc{i, j+1}), which could cause
|
||||
a deadlock as well since they may not start to execute until the current iteration completes.
|
||||
|
||||
\cexample{doacross}{3}
|
||||
|
||||
\ffreeexample{doacross}{3}
|
||||
|
||||
|
||||
The following example illustrates the use of the \code{collapse} clause for
|
||||
a \plc{doacross loop nest}. The \plc{i} and \plc{j} loops are the associated
|
||||
loops for the collapsed loop as well as for the \plc{doacross loop nest}.
|
||||
The example also shows a compliant usage of the dependence source
|
||||
directive placed before the corresponding sink directive.
|
||||
Checking the completion of computation from previous iterations at the sink point can occur after the source statement.
|
||||
|
||||
\cexample{doacross}{4}
|
||||
|
||||
\ffreeexample{doacross}{4}
|
@ -1,12 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{flush} Construct without a List}
|
||||
\label{sec:flush_nolist}
|
||||
|
||||
The following example distinguishes the shared variables affected by a \code{flush}
|
||||
construct with no list from the shared objects that are not affected:
|
||||
|
||||
\cexample{flush_nolist}{1}
|
||||
|
||||
\fexample{flush_nolist}{1}
|
||||
|
||||
|
@ -1,19 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Fortran Restrictions on the \code{do} Construct}
|
||||
\label{sec:fort_do}
|
||||
\fortranspecificstart
|
||||
|
||||
If an \code{end do} directive follows a \plc{do-construct} in which several
|
||||
\code{DO} statements share a \code{DO} termination statement, then a \code{do}
|
||||
directive can only be specified for the outermost of these \code{DO} statements.
|
||||
The following example contains correct usages of loop constructs:
|
||||
|
||||
\fnexample{fort_do}{1}
|
||||
|
||||
The following example is non-conforming because the matching \code{do} directive
|
||||
for the \code{end do} does not precede the outermost loop:
|
||||
|
||||
\fnexample{fort_do}{2}
|
||||
\fortranspecificend
|
||||
|
||||
|
@ -1,23 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Fortran Restrictions on Storage Association with the \code{private} Clause}
|
||||
\fortranspecificstart
|
||||
\label{sec:fort_sa_private}
|
||||
|
||||
The following non-conforming examples illustrate the implications of the \code{private}
|
||||
clause rules with regard to storage association.
|
||||
|
||||
\fnexample{fort_sa_private}{1}
|
||||
|
||||
\fnexample{fort_sa_private}{2}
|
||||
|
||||
\fnexample{fort_sa_private}{3}
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
|
||||
\fnexample{fort_sa_private}{4}
|
||||
|
||||
\fnexample{fort_sa_private}{5}
|
||||
\fortranspecificend
|
||||
|
@ -1,38 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Fortran Restrictions on \code{shared} and \code{private} Clauses with Common Blocks}
|
||||
\fortranspecificstart
|
||||
\label{sec:fort_sp_common}
|
||||
|
||||
When a named common block is specified in a \code{private}, \code{firstprivate},
|
||||
or \code{lastprivate} clause of a construct, none of its members may be declared
|
||||
in another data-sharing attribute clause on that construct. The following examples
|
||||
illustrate this point.
|
||||
|
||||
The following example is conforming:
|
||||
|
||||
\fnexample{fort_sp_common}{1}
|
||||
|
||||
The following example is also conforming:
|
||||
|
||||
\fnexample{fort_sp_common}{2}
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
|
||||
The following example is conforming:
|
||||
|
||||
\fnexample{fort_sp_common}{3}
|
||||
|
||||
The following example is non-conforming because \code{x} is a constituent element
|
||||
of \code{c}:
|
||||
|
||||
\fnexample{fort_sp_common}{4}
|
||||
|
||||
The following example is non-conforming because a common block may not be declared
|
||||
both shared and private:
|
||||
|
||||
\fnexample{fort_sp_common}{5}
|
||||
\fortranspecificend
|
||||
|
||||
|
@ -1,18 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{firstprivate} Clause and the \code{sections} Construct}
|
||||
\label{sec:fpriv_sections}
|
||||
|
||||
In the following example of the \code{sections} construct the \code{firstprivate}
|
||||
clause is used to initialize the private copy of \code{section\_count} of each
|
||||
thread. The problem is that the \code{section} constructs modify \code{section\_count},
|
||||
which breaks the independence of the \code{section} constructs. When different
|
||||
threads execute each section, both sections will print the value 1. When the same
|
||||
thread executes the two sections, one section will print the value 1 and the other
|
||||
will print the value 2. Since the order of execution of the two sections in this
|
||||
case is unspecified, it is unspecified which section prints which value.
|
||||
|
||||
\cexample{fpriv_sections}{1}
|
||||
|
||||
\ffreeexample{fpriv_sections}{1}
|
||||
|
||||
|
@ -1,21 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{omp\_get\_num\_threads} Routine}
|
||||
\label{sec:get_nthrs}
|
||||
|
||||
In the following example, the \code{omp\_get\_num\_threads} call returns 1 in
|
||||
the sequential part of the code, so \code{np} will always be equal to 1. To determine
|
||||
the number of threads that will be deployed for the \code{parallel} region, the
|
||||
call should be inside the \code{parallel} region.
|
||||
|
||||
\cexample{get_nthrs}{1}
|
||||
|
||||
\fexample{get_nthrs}{1}
|
||||
|
||||
The following example shows how to rewrite this program without including a query
|
||||
for the number of threads:
|
||||
|
||||
\cexample{get_nthrs}{2}
|
||||
|
||||
\fexample{get_nthrs}{2}
|
||||
|
||||
|
@ -1,56 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Internal Control Variables (ICVs)}
|
||||
\label{sec:icv}
|
||||
|
||||
According to Section 2.3 of the OpenMP 4.0 specification, an OpenMP implementation must act as if there are ICVs that control
|
||||
the behavior of the program. This example illustrates two ICVs, \plc{nthreads-var}
|
||||
and \plc{max-active-levels-var}. The \plc{nthreads-var} ICV controls the
|
||||
number of threads requested for encountered parallel regions; there is one copy
|
||||
of this ICV per task. The \plc{max-active-levels-var} ICV controls the maximum
|
||||
number of nested active parallel regions; there is one copy of this ICV for the
|
||||
whole program.
|
||||
|
||||
In the following example, the \plc{nest-var}, \plc{max-active-levels-var},
|
||||
\plc{dyn-var}, and \plc{nthreads-var} ICVs are modified through calls to
|
||||
the runtime library routines \code{omp\_set\_nested},\\ \code{omp\_set\_max\_active\_levels},
|
||||
\code{omp\_set\_dynamic}, and \code{omp\_set\_num\_threads} respectively. These ICVs
|
||||
affect the operation of \code{parallel} regions. Each implicit task generated
|
||||
by a \code{parallel} region has its own copy of the \plc{nest-var, dyn-var},
|
||||
and \plc{nthreads-var} ICVs.
|
||||
|
||||
In the following example, the new value of \plc{nthreads-var} applies only to
|
||||
the implicit tasks that execute the call to \code{omp\_set\_num\_threads}. There
|
||||
is one copy of the \plc{max-active-levels-var} ICV for the whole program and
|
||||
its value is the same for all tasks. This example assumes that nested parallelism
|
||||
is supported.
|
||||
|
||||
The outer \code{parallel} region creates a team of two threads; each of the threads
|
||||
will execute one of the two implicit tasks generated by the outer \code{parallel}
|
||||
region.
|
||||
|
||||
Each implicit task generated by the outer \code{parallel} region calls \code{omp\_set\_num\_threads(3)},
|
||||
assigning the value 3 to its respective copy of \plc{nthreads-var}. Then each
|
||||
implicit task encounters an inner \code{parallel} region that creates a team
|
||||
of three threads; each of the threads will execute one of the three implicit tasks
|
||||
generated by that inner \code{parallel} region.
|
||||
|
||||
Since the outer \code{parallel} region is executed by 2 threads, and the inner
|
||||
by 3, there will be a total of 6 implicit tasks generated by the two inner \code{parallel}
|
||||
regions.
|
||||
|
||||
Each implicit task generated by an inner \code{parallel} region will execute
|
||||
the call to\\ \code{omp\_set\_num\_threads(4)}, assigning the value 4 to its respective
|
||||
copy of \plc{nthreads-var}.
|
||||
|
||||
The print statement in the outer \code{parallel} region is executed by only one
|
||||
of the threads in the team. So it will be executed only once.
|
||||
|
||||
The print statement in an inner \code{parallel} region is also executed by only
|
||||
one of the threads in the team. Since we have a total of two inner \code{parallel}
|
||||
regions, the print statement will be executed twice -- once per inner \code{parallel}
|
||||
region.
|
||||
|
||||
\cexample{icv}{1}
|
||||
|
||||
\fexample{icv}{1}
|
||||
|
@ -1,10 +0,0 @@
|
||||
\subsection{The \code{omp\_init\_lock} Routine}
|
||||
\label{subsec:init_lock}
|
||||
|
||||
The following example demonstrates how to initialize an array of locks in a \code{parallel}
|
||||
region by using \code{omp\_init\_lock}.
|
||||
|
||||
\cppexample{init_lock}{1}
|
||||
|
||||
\fexample{init_lock}{1}
|
||||
|
@ -1,10 +0,0 @@
|
||||
%\pagebreak
|
||||
\subsection{The \code{omp\_init\_lock\_with\_hint} Routine}
|
||||
\label{subsec:init_lock_with_hint}
|
||||
|
||||
The following example demonstrates how to initialize an array of locks in a \code{parallel} region by using \code{omp\_init\_lock\_with\_hint}.
|
||||
Note, hints are combined with an \code{|} or \code{+} operator in C/C++ and a \code{+} operator in Fortran.
|
||||
|
||||
\cppexample{init_lock_with_hint}{1}
|
||||
|
||||
\fexample{init_lock_with_hint}{1}
|
@ -1,14 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{lastprivate} Clause}
|
||||
\label{sec:lastprivate}
|
||||
|
||||
Correct execution sometimes depends on the value that the last iteration of a loop
|
||||
assigns to a variable. Such programs must list all such variables in a \code{lastprivate}
|
||||
clause so that the values of the variables are the same as when the loop is executed
|
||||
sequentially.
|
||||
|
||||
\cexample{lastprivate}{1}
|
||||
|
||||
\fexample{lastprivate}{1}
|
||||
|
||||
|
@ -1,13 +0,0 @@
|
||||
\section{\code{linear} Clause in Loop Constructs}
|
||||
\label{sec:linear_in_loop}
|
||||
|
||||
The following example shows the use of the \code{linear} clause in a loop
|
||||
construct to allow the proper parallelization of a loop that contains
|
||||
an induction variable (\plc{j}). At the end of the execution of
|
||||
the loop construct, the original variable \plc{j} is updated with
|
||||
the value \plc{N/2} from the last iteration of the loop.
|
||||
|
||||
\cexample{linear_in_loop}{1}
|
||||
|
||||
\ffreeexample{linear_in_loop}{1}
|
||||
|
@ -1,22 +0,0 @@
|
||||
\subsection{Ownership of Locks}
|
||||
\label{subsec:lock_owner}
|
||||
|
||||
Ownership of locks has changed since OpenMP 2.5. In OpenMP 2.5, locks are owned
|
||||
by threads; so a lock released by the \code{omp\_unset\_lock} routine must be
|
||||
owned by the same thread executing the routine. Beginning with OpenMP 3.0, locks are owned
|
||||
by task regions; so a lock released by the \code{omp\_unset\_lock} routine in
|
||||
a task region must be owned by the same task region.
|
||||
|
||||
This change in ownership requires extra care when using locks. The following program
|
||||
is conforming in OpenMP 2.5 because the thread that releases the lock \code{lck}
|
||||
in the parallel region is the same thread that acquired the lock in the sequential
|
||||
part of the program (master thread of parallel region and the initial thread are
|
||||
the same). However, it is not conforming beginning with OpenMP 3.0, because the task
|
||||
region that releases the lock \code{lck} is different from the task region that
|
||||
acquires the lock.
|
||||
|
||||
\cexample{lock_owner}{1}
|
||||
|
||||
\fexample{lock_owner}{1}
|
||||
|
||||
|
@ -1,13 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{master} Construct}
|
||||
\label{sec:master}
|
||||
|
||||
The following example demonstrates the \code{master} construct. In the example, the master
|
||||
keeps track of how many iterations have been executed and prints out a progress
|
||||
report. The other threads skip the master region without waiting.
|
||||
|
||||
\cexample{master}{1}
|
||||
|
||||
\fexample{master}{1}
|
||||
|
||||
|
@ -1,38 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The OpenMP Memory Model}
|
||||
\label{sec:mem_model}
|
||||
|
||||
In the following example, at Print 1, the value of \plc{x} could be either 2
|
||||
or 5, depending on the timing of the threads, and the implementation of the assignment
|
||||
to \plc{x}. There are two reasons that the value at Print 1 might not be 5.
|
||||
First, Print 1 might be executed before the assignment to \plc{x} is executed.
|
||||
Second, even if Print 1 is executed after the assignment, the value 5 is not guaranteed
|
||||
to be seen by thread 1 because a flush may not have been executed by thread 0 since
|
||||
the assignment.
|
||||
|
||||
The barrier after Print 1 contains implicit flushes on all threads, as well as
|
||||
a thread synchronization, so the programmer is guaranteed that the value 5 will
|
||||
be printed by both Print 2 and Print 3.
|
||||
|
||||
\cexample{mem_model}{1}
|
||||
|
||||
\ffreeexample{mem_model}{1}
|
||||
|
||||
The following example demonstrates why synchronization is difficult to perform
|
||||
correctly through variables. The value of flag is undefined in both prints on thread
|
||||
1 and the value of data is only well-defined in the second print.
|
||||
|
||||
\cexample{mem_model}{2}
|
||||
|
||||
\fexample{mem_model}{2}
|
||||
|
||||
The next example demonstrates why synchronization is difficult to perform correctly
|
||||
through variables. Because the \plc{write}(1)-\plc{flush}(1)-\plc{flush}(2)-\plc{read}(2)
|
||||
sequence cannot be guaranteed in the example, the statements on thread 0 and thread
|
||||
1 may execute in either order.
|
||||
|
||||
\cexample{mem_model}{3}
|
||||
|
||||
\fexample{mem_model}{3}
|
||||
|
||||
|
@ -1,28 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{nowait} Clause}
|
||||
\label{sec:nowait}
|
||||
|
||||
If there are multiple independent loops within a \code{parallel} region, you
|
||||
can use the \code{nowait} clause to avoid the implied barrier at the end of the
|
||||
loop construct, as follows:
|
||||
|
||||
\cexample{nowait}{1}
|
||||
|
||||
\fexample{nowait}{1}
|
||||
|
||||
In the following example, static scheduling distributes the same logical iteration
|
||||
numbers to the threads that execute the three loop regions. This allows the \code{nowait}
|
||||
clause to be used, even though there is a data dependence between the loops. The
|
||||
dependence is satisfied as long as the same thread executes the same logical iteration
|
||||
numbers in each loop.
|
||||
|
||||
Note that the iteration count of the loops must be the same. The example satisfies
|
||||
this requirement, since the iteration space of the first two loops is from \code{0}
|
||||
to \code{n-1} (from \code{1} to \code{N} in the Fortran version), while the
|
||||
iteration space of the last loop is from \code{1} to \code{n} (\code{2} to
|
||||
\code{N+1} in the Fortran version).
|
||||
|
||||
\cexample{nowait}{2}
|
||||
|
||||
\ffreeexample{nowait}{2}
|
||||
|
@ -1,30 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Interaction Between the \code{num\_threads} Clause and \code{omp\_set\_dynamic}}
|
||||
\label{sec:nthrs_dynamic}
|
||||
|
||||
The following example demonstrates the \code{num\_threads} clause and the effect
|
||||
of the \\
|
||||
\code{omp\_set\_dynamic} routine on it.
|
||||
|
||||
The call to the \code{omp\_set\_dynamic} routine with argument \code{0} in
|
||||
C/C++, or \code{.FALSE.} in Fortran, disables the dynamic adjustment of the number
|
||||
of threads in OpenMP implementations that support it. In this case, 10 threads
|
||||
are provided. Note that in case of an error the OpenMP implementation is free to
|
||||
abort the program or to supply any number of threads available.
|
||||
|
||||
\cexample{nthrs_dynamic}{1}
|
||||
|
||||
\fexample{nthrs_dynamic}{1}
|
||||
|
||||
The call to the \code{omp\_set\_dynamic} routine with a non-zero argument in
|
||||
C/C++, or \code{.TRUE.} in Fortran, allows the OpenMP implementation to choose
|
||||
any number of threads between 1 and 10.
|
||||
|
||||
\cexample{nthrs_dynamic}{2}
|
||||
|
||||
\fexample{nthrs_dynamic}{2}
|
||||
|
||||
It is good practice to set the \plc{dyn-var} ICV explicitly by calling the \code{omp\_set\_dynamic}
|
||||
routine, as its default setting is implementation defined.
|
||||
|
||||
|
@ -1,12 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Controlling the Number of Threads on Multiple Nesting Levels}
|
||||
\label{sec:nthrs_nesting}
|
||||
|
||||
The following examples demonstrate how to use the \code{OMP\_NUM\_THREADS} environment
|
||||
variable to control the number of threads on multiple nesting levels:
|
||||
|
||||
\cexample{nthrs_nesting}{1}
|
||||
|
||||
\fexample{nthrs_nesting}{1}
|
||||
|
||||
|
@ -1,28 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{ordered} Clause and the \code{ordered} Construct}
|
||||
\label{sec:ordered}
|
||||
|
||||
Ordered constructs are useful for sequentially ordering the output from work that
|
||||
is done in parallel. The following program prints out the indices in sequential
|
||||
order:
|
||||
|
||||
\cexample{ordered}{1}
|
||||
|
||||
\fexample{ordered}{1}
|
||||
|
||||
It is possible to have multiple \code{ordered} constructs within a loop region
|
||||
with the \code{ordered} clause specified. The first example is non-conforming
|
||||
because all iterations execute two \code{ordered} regions. An iteration of a
|
||||
loop must not execute more than one \code{ordered} region:
|
||||
|
||||
\cexample{ordered}{2}
|
||||
|
||||
\fexample{ordered}{2}
|
||||
|
||||
The following is a conforming example with more than one \code{ordered} construct.
|
||||
Each iteration will execute only one \code{ordered} region:
|
||||
|
||||
\cexample{ordered}{3}
|
||||
|
||||
\fexample{ordered}{3}
|
||||
|
@ -1,12 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{parallel} Construct}
|
||||
\label{sec:parallel}
|
||||
|
||||
The \code{parallel} construct can be used in coarse-grain parallel programs.
|
||||
In the following example, each thread in the \code{parallel} region decides what
|
||||
part of the global array \plc{x} to work on, based on the thread number:
|
||||
|
||||
\cexample{parallel}{1}
|
||||
|
||||
\fexample{parallel}{1}
|
||||
|
@ -1,12 +0,0 @@
|
||||
\pagebreak
|
||||
\section{A Simple Parallel Loop}
|
||||
\label{sec:ploop}
|
||||
|
||||
The following example demonstrates how to parallelize a simple loop using the parallel
|
||||
loop construct. The loop iteration variable is private by default, so it is not
|
||||
necessary to specify it explicitly in a \code{private} clause.
|
||||
|
||||
\cexample{ploop}{1}
|
||||
|
||||
\fexample{ploop}{1}
|
||||
|
@ -1,31 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{private} Clause}
|
||||
\label{sec:private}
|
||||
|
||||
In the following example, the values of original list items \plc{i} and \plc{j}
|
||||
are retained on exit from the \code{parallel} region, while the private list
|
||||
items \plc{i} and \plc{j} are modified within the \code{parallel} construct.
|
||||
|
||||
\cexample{private}{1}
|
||||
|
||||
\fexample{private}{1}
|
||||
|
||||
In the following example, all uses of the variable \plc{a} within the loop construct
|
||||
in the routine \plc{f} refer to a private list item \plc{a}, while it is
|
||||
unspecified whether references to \plc{a} in the routine \plc{g} are to a
|
||||
private list item or the original list item.
|
||||
|
||||
\cexample{private}{2}
|
||||
|
||||
\fexample{private}{2}
|
||||
|
||||
The following example demonstrates that a list item that appears in a \code{private}
|
||||
clause in a \code{parallel} construct may also appear in a \code{private}
|
||||
clause in an enclosed worksharing construct, which results in an additional private
|
||||
copy.
|
||||
|
||||
\cexample{private}{3}
|
||||
|
||||
\fexample{private}{3}
|
||||
|
||||
|
@ -1,13 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{parallel} \code{sections} Construct}
|
||||
\label{sec:psections}
|
||||
|
||||
In the following example routines \code{XAXIS}, \code{YAXIS}, and \code{ZAXIS} can
|
||||
be executed concurrently. The first \code{section} directive is optional. Note
|
||||
that all \code{section} directives need to appear in the \code{parallel sections}
|
||||
construct.
|
||||
|
||||
\cexample{psections}{1}
|
||||
|
||||
\fexample{psections}{1}
|
||||
|
@ -1,65 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{reduction} Clause}
|
||||
\label{sec:reduction}
|
||||
|
||||
The following example demonstrates the \code{reduction} clause; note that some
|
||||
reductions can be expressed in the loop in several ways, as shown for the \code{max}
|
||||
and \code{min} reductions below:
|
||||
|
||||
\cexample{reduction}{1}
|
||||
|
||||
\ffreeexample{reduction}{1}
|
||||
|
||||
A common implementation of the preceding example is to treat it as if it had been
|
||||
written as follows:
|
||||
|
||||
\cexample{reduction}{2}
|
||||
|
||||
\fortranspecificstart
|
||||
\ffreenexample{reduction}{2}
|
||||
|
||||
The following program is non-conforming because the reduction is on the
|
||||
\emph{intrinsic procedure name} \code{MAX} but that name has been redefined to be the variable
|
||||
named \code{MAX}.
|
||||
|
||||
\ffreenexample{reduction}{3}
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
|
||||
The following conforming program performs the reduction using the
|
||||
\emph{intrinsic procedure name} \code{MAX} even though the intrinsic \code{MAX} has been renamed
|
||||
to \code{REN}.
|
||||
|
||||
\ffreenexample{reduction}{4}
|
||||
|
||||
The following conforming program performs the reduction using
|
||||
\emph{intrinsic procedure name} \code{MAX} even though the intrinsic \code{MAX} has been renamed
|
||||
to \code{MIN}.
|
||||
|
||||
\ffreenexample{reduction}{5}
|
||||
\fortranspecificend
|
||||
|
||||
The following example is non-conforming because the initialization (\code{a =
|
||||
0}) of the original list item \code{a} is not synchronized with the update of
|
||||
\code{a} as a result of the reduction computation in the \code{for} loop. Therefore,
|
||||
the example may print an incorrect value for \code{a}.
|
||||
|
||||
To avoid this problem, the initialization of the original list item \code{a}
|
||||
should complete before any update of \code{a} as a result of the \code{reduction}
|
||||
clause. This can be achieved by adding an explicit barrier after the assignment
|
||||
\code{a = 0}, or by enclosing the assignment \code{a = 0} in a \code{single}
|
||||
directive (which has an implied barrier), or by initializing \code{a} before
|
||||
the start of the \code{parallel} region.
|
||||
|
||||
\cexample{reduction}{6}
|
||||
|
||||
\fexample{reduction}{6}
|
||||
|
||||
The following example demonstrates the reduction of array \plc{a}. In C/C++ this is illustrated by the explicit use of an array section \plc{a[0:N]} in the \code{reduction} clause. The corresponding Fortran example uses array syntax supported in the base language. As of the OpenMP 4.5 specification, the explicit use of an array section in the \code{reduction} clause in Fortran is not permitted. But this oversight will be fixed in the next release of the specification.
|
||||
|
||||
|
||||
\cexample{reduction}{7}
|
||||
|
||||
\ffreeexample{reduction}{7}
|
@ -1,18 +0,0 @@
|
||||
\subsection{Simple Lock Routines}
|
||||
\label{subsec:simple_lock}
|
||||
|
||||
In the following example, the lock routines cause the threads to be idle while
|
||||
waiting for entry to the first critical section, but to do other work while waiting
|
||||
for entry to the second. The \code{omp\_set\_lock} function blocks, but the \code{omp\_test\_lock}
|
||||
function does not, allowing the work in \code{skip} to be done.
|
||||
|
||||
Note that the argument to the lock routines should have type \code{omp\_lock\_t},
|
||||
and that there is no need to flush it.
|
||||
|
||||
\cexample{simple_lock}{1}
|
||||
|
||||
Note that there is no need to flush the lock variable.
|
||||
|
||||
\fexample{simple_lock}{1}
|
||||
|
||||
|
@ -1,18 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{single} Construct}
|
||||
\label{sec:single}
|
||||
|
||||
The following example demonstrates the \code{single} construct. In the example,
|
||||
only one thread prints each of the progress messages. All other threads will skip
|
||||
the \code{single} region and stop at the barrier at the end of the \code{single}
|
||||
construct until all threads in the team have reached the barrier. If other threads
|
||||
can proceed without waiting for the thread executing the \code{single} region,
|
||||
a \code{nowait} clause can be specified, as is done in the third \code{single}
|
||||
construct in this example. The user must not make any assumptions as to which thread
|
||||
will execute a \code{single} region.
|
||||
|
||||
\cexample{single}{1}
|
||||
|
||||
\fexample{single}{1}
|
||||
|
||||
|
@ -1,31 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Placement of \code{flush}, \code{barrier}, \code{taskwait}
|
||||
and \code{taskyield} Directives}
|
||||
\label{sec:standalone}
|
||||
|
||||
The following example is non-conforming, because the \code{flush}, \code{barrier},
|
||||
\code{taskwait}, and \code{taskyield} directives are stand-alone directives
|
||||
and cannot be the immediate substatement of an \code{if} statement.
|
||||
|
||||
\cexample{standalone}{1}
|
||||
|
||||
The following example is non-conforming, because the \code{flush}, \code{barrier},
|
||||
\code{taskwait}, and \code{taskyield} directives are stand-alone directives
|
||||
and cannot be the action statement of an \code{if} statement or a labeled branch
|
||||
target.
|
||||
|
||||
\ffreeexample{standalone}{1}
|
||||
|
||||
The following version of the above example is conforming because the \code{flush},
|
||||
\code{barrier}, \code{taskwait}, and \code{taskyield} directives are enclosed
|
||||
in a compound statement.
|
||||
|
||||
\cexample{standalone}{2}
|
||||
|
||||
The following example is conforming because the \code{flush}, \code{barrier},
|
||||
\code{taskwait}, and \code{taskyield} directives are enclosed in an \code{if}
|
||||
construct or follow the labeled branch target.
|
||||
|
||||
\ffreeexample{standalone}{2}
|
||||
|
||||
|
@ -1,112 +0,0 @@
|
||||
\pagebreak
|
||||
\section{\code{target} Construct}
|
||||
\label{sec:target}
|
||||
|
||||
\subsection{\code{target} Construct on \code{parallel} Construct}
|
||||
\label{subsec:target_parallel}
|
||||
|
||||
The following example shows how the \code{target} construct offloads a code
|
||||
region to a target device. The variables \plc{p}, \plc{v1}, \plc{v2}, and \plc{N} are implicitly mapped
|
||||
to the target device.
|
||||
|
||||
\cexample{target}{1}
|
||||
|
||||
\ffreeexample{target}{1}
|
||||
|
||||
\subsection{\code{target} Construct with \code{map} Clause}
|
||||
\label{subsec:target_map}
|
||||
|
||||
The following example shows how the \code{target} construct offloads a code
|
||||
region to a target device. The variables \plc{p}, \plc{v1} and \plc{v2} are explicitly mapped to the
|
||||
target device using the \code{map} clause. The variable \plc{N} is implicitly mapped to
|
||||
the target device.
|
||||
|
||||
\cexample{target}{2}
|
||||
|
||||
\ffreeexample{target}{2}
|
||||
|
||||
\subsection{\code{map} Clause with \code{to}/\code{from} map-types}
|
||||
\label{subsec:target_map_tofrom}
|
||||
|
||||
The following example shows how the \code{target} construct offloads a code region
|
||||
to a target device. In the \code{map} clause, the \code{to} and \code{from}
|
||||
map-types define the mapping between the original (host) data and the target (device)
|
||||
data. The \code{to} map-type specifies that the data will only be read on the
|
||||
device, and the \code{from} map-type specifies that the data will only be written
|
||||
to on the device. By specifying a guaranteed access on the device, data transfers
|
||||
can be reduced for the \code{target} region.
|
||||
|
||||
The \code{to} map-type indicates that at the start of the \code{target} region
|
||||
the variables \plc{v1} and \plc{v2} are initialized with the values of the corresponding variables
|
||||
on the host device, and at the end of the \code{target} region the variables
|
||||
\plc{v1} and \plc{v2} are not assigned to their corresponding variables on the host device.
|
||||
|
||||
The \code{from} map-type indicates that at the start of the \code{target} region
|
||||
the variable \plc{p} is not initialized with the value of the corresponding variable
|
||||
on the host device, and at the end of the \code{target} region the variable \plc{p}
|
||||
is assigned to the corresponding variable on the host device.
|
||||
|
||||
\cexample{target}{3}
|
||||
|
||||
The \code{to} and \code{from} map-types allow programmers to optimize data
|
||||
motion. Since data for the \plc{v} arrays are not returned, and data for the \plc{p} array
|
||||
are not transferred to the device, only one-half of the data is moved, compared
|
||||
to the default behavior of an implicit mapping.
|
||||
|
||||
\ffreeexample{target}{3}
|
||||
|
||||
\subsection{\code{map} Clause with Array Sections}
|
||||
\label{subsec:target_array_section}
|
||||
|
||||
The following example shows how the \code{target} construct offloads a code region
|
||||
to a target device. In the \code{map} clause, map-types are used to optimize
|
||||
the mapping of variables to the target device. Because variables \plc{p}, \plc{v1} and \plc{v2} are
|
||||
pointers, array section notation must be used to map the arrays. The notation \code{:N}
|
||||
is equivalent to \code{0:N}.
|
||||
|
||||
\cexample{target}{4}
|
||||
|
||||
In C, the length of the pointed-to array must be specified. In Fortran the extent
|
||||
of the array is known and the length need not be specified. A section of the array
|
||||
can be specified with the usual Fortran syntax, as shown in the following example.
|
||||
The value 1 is assumed for the lower bound for array section \plc{v2(:N)}.
|
||||
|
||||
\ffreeexample{target}{4}
|
||||
|
||||
A more realistic situation in which an assumed-size array is passed to \code{vec\_mult}
|
||||
requires that the length of the arrays be specified, because the compiler does
|
||||
not know the size of the storage. A section of the array must be specified with
|
||||
the usual Fortran syntax, as shown in the following example. The value 1 is assumed
|
||||
for the lower bound for array section \plc{v2(:N)}.
|
||||
|
||||
\ffreeexample{target}{4b}
|
||||
|
||||
\subsection{\code{target} Construct with \code{if} Clause}
|
||||
\label{subsec:target_if}
|
||||
|
||||
The following example shows how the \code{target} construct offloads a code region
|
||||
to a target device.
|
||||
|
||||
The \code{if} clause on the \code{target} construct indicates that if the variable
|
||||
\plc{N} is smaller than a given threshold, then the \code{target} region will be executed
|
||||
by the host device.
|
||||
|
||||
The \code{if} clause on the \code{parallel} construct indicates that if the
|
||||
variable \plc{N} is smaller than a second threshold then the \code{parallel} region
|
||||
is inactive.
|
||||
|
||||
\cexample{target}{5}
|
||||
|
||||
\ffreeexample{target}{5}
|
||||
|
||||
The following example is a modification of the above \plc{target.5} code to show the combined \code{target}
|
||||
and parallel loop directives. It uses the \plc{directive-name} modifier in multiple \code{if}
|
||||
clauses to specify the component directive to which it applies.
|
||||
|
||||
The \code{if} clause with the \code{target} modifier applies to the \code{target} component of the
|
||||
combined directive, and the \code{if} clause with the \code{parallel} modifier applies
|
||||
to the \code{parallel} component of the combined directive.
|
||||
|
||||
\cexample{target}{6}
|
||||
|
||||
\ffreeexample{target}{6}
|
@ -1,178 +0,0 @@
|
||||
\pagebreak
|
||||
\section{\code{target} \code{data} Construct}
|
||||
\label{sec:target_data}
|
||||
|
||||
\subsection{Simple \code{target} \code{data} Construct}
|
||||
\label{subsec:target_data_simple}
|
||||
|
||||
This example shows how the \code{target} \code{data} construct maps variables
|
||||
to a device data environment. The \code{target} \code{data} construct creates
|
||||
a new device data environment and maps the variables \plc{v1}, \plc{v2}, and \plc{p} to the new device
|
||||
data environment. The \code{target} construct enclosed in the \code{target}
|
||||
\code{data} region creates a new device data environment, which inherits the
|
||||
variables \plc{v1}, \plc{v2}, and \plc{p} from the enclosing device data environment. The variable
|
||||
\plc{N} is mapped into the new device data environment from the encountering task's data
|
||||
environment.
|
||||
|
||||
\cexample{target_data}{1}
|
||||
|
||||
The Fortran code passes a reference and specifies the extent of the arrays in the
|
||||
declaration. No length information is necessary in the map clause, as is required
|
||||
with C/C++ pointers.
|
||||
|
||||
\ffreeexample{target_data}{1}
|
||||
|
||||
\subsection{\code{target} \code{data} Region Enclosing Multiple \code{target} Regions}
|
||||
\label{subsec:target_data_multiregion}
|
||||
|
||||
The following examples show how the \code{target} \code{data} construct maps
|
||||
variables to a device data environment of a \code{target} region. The \code{target}
|
||||
\code{data} construct creates a device data environment and encloses \code{target}
|
||||
regions, which have their own device data environments. The device data environment
|
||||
of the \code{target} \code{data} region is inherited by the device data environment
|
||||
of an enclosed \code{target} region. The \code{target} \code{data} construct
|
||||
is used to create variables that will persist throughout the \code{target} \code{data}
|
||||
region.
|
||||
|
||||
In the following example the variables \plc{v1} and \plc{v2} are mapped at each \code{target}
|
||||
construct. Instead of mapping the variable \plc{p} twice, once at each \code{target}
|
||||
construct, \plc{p} is mapped once by the \code{target} \code{data} construct.
|
||||
|
||||
\cexample{target_data}{2}
|
||||
|
||||
|
||||
The Fortran code passes a reference and specifies the extent of the \plc{p}, \plc{v1} and \plc{v2} arrays.
|
||||
No length information is necessary in the \code{map} clause, as is required with
|
||||
C/C++ pointers. The arrays \plc{v1} and \plc{v2} are mapped at each \code{target} construct.
|
||||
Instead of mapping the array \plc{p} twice, once at each target construct, \plc{p} is mapped
|
||||
once by the \code{target} \code{data} construct.
|
||||
|
||||
\ffreeexample{target_data}{2}
|
||||
|
||||
In the following example, the variable \plc{tmp} defaults to \code{tofrom} map-type
|
||||
and is mapped at each \code{target} construct. The array \plc{Q} is mapped once at
|
||||
the enclosing \code{target} \code{data} region instead of at each \code{target}
|
||||
construct.
|
||||
|
||||
\cexample{target_data}{3}
|
||||
|
||||
In the following example the arrays \plc{v1} and \plc{v2} are mapped at each \code{target}
|
||||
construct. Instead of mapping the array \plc{Q} twice, once at each \code{target} construct,
|
||||
\plc{Q} is mapped once by the \code{target} \code{data} construct. Note, the \plc{tmp}
|
||||
variable is implicitly remapped for each \code{target} region, mapping the value
|
||||
from the device to the host at the end of the first \code{target} region, and
|
||||
from the host to the device for the second \code{target} region.
|
||||
|
||||
\ffreeexample{target_data}{3}
|
||||
|
||||
\subsection{\code{target} \code{data} Construct with Orphaned Call}
|
||||
|
||||
The following two examples show how the \code{target} \code{data} construct
|
||||
maps variables to a device data environment. The \code{target} \code{data}
|
||||
construct's device data environment encloses the \code{target} construct's device
|
||||
data environment in the function \code{vec\_mult()}.
|
||||
|
||||
When the type of the variable appearing in an array section is pointer, the pointer
|
||||
variable and the storage location of the corresponding array section are mapped
|
||||
to the device data environment. The pointer variable is treated as if it had appeared
|
||||
in a \code{map} clause with a map-type of \code{alloc}. The array section's
|
||||
storage location is mapped according to the map-type in the \code{map} clause
|
||||
(the default map-type is \code{tofrom}).
|
||||
|
||||
The \code{target} construct's device data environment inherits the storage locations
|
||||
of the array sections \plc{v1[0:N]}, \plc{v2[:N]}, and \plc{p0[0:N]} from the enclosing \code{target} \code{data}
|
||||
construct's device data environment. Neither initialization nor assignment is performed
|
||||
for the array sections in the new device data environment.
|
||||
|
||||
The pointer variables \plc{p1}, \plc{v3}, and \plc{v4} are mapped into the target construct's device
|
||||
data environment with an implicit map-type of \code{alloc} and they are assigned the address
|
||||
of the storage location associated with their corresponding array sections. Note
|
||||
that the following pairs of array section storage locations are equivalent (\plc{p0[:N]},
|
||||
\plc{p1[:N]}), (\plc{v1[:N]},\plc{v3[:N]}), and (\plc{v2[:N]},\plc{v4[:N]}).
|
||||
|
||||
\cexample{target_data}{4}
|
||||
|
||||
The Fortran code maps the pointers and storage in an identical manner (same extent,
|
||||
but uses indices from 1 to \plc{N}).
|
||||
|
||||
The \code{target} construct's device data environment inherits the storage locations
|
||||
of the arrays \plc{v1}, \plc{v2} and \plc{p0} from the enclosing \code{target} \code{data} construct's
|
||||
device data environment. However, in Fortran the associated data of the pointer
|
||||
is known, and the shape is not required.
|
||||
|
||||
The pointer variables \plc{p1}, \plc{v3}, and \plc{v4} are mapped into the \code{target} construct's
|
||||
device data environment with an implicit map-type of \code{alloc} and they are
|
||||
assigned the address of the storage location associated with their corresponding
|
||||
array sections. Note that the following pairs of array storage locations are equivalent
|
||||
(\plc{p0},\plc{p1}), (\plc{v1},\plc{v3}), and (\plc{v2},\plc{v4}).
|
||||
|
||||
\ffreeexample{target_data}{4}
|
||||
|
||||
|
||||
In the following example, the variables \plc{p1}, \plc{v3}, and \plc{v4} are references to the pointer
|
||||
variables \plc{p0}, \plc{v1} and \plc{v2} respectively. The \code{target} construct's device data
|
||||
environment inherits the pointer variables \plc{p0}, \plc{v1}, and \plc{v2} from the enclosing \code{target}
|
||||
\code{data} construct's device data environment. Thus, \plc{p1}, \plc{v3}, and \plc{v4} are already
|
||||
present in the device data environment.
|
||||
|
||||
\cppexample{target_data}{5}
|
||||
|
||||
In the following example, the usual Fortran approach is used for dynamic memory.
|
||||
The \plc{p0}, \plc{v1}, and \plc{v2} arrays are allocated in the main program and passed as references
|
||||
from one routine to another. In \code{vec\_mult}, \plc{p1}, \plc{v3} and \plc{v4} are references to the
|
||||
\plc{p0}, \plc{v1}, and \plc{v2} arrays, respectively. The \code{target} construct's device data
|
||||
environment inherits the arrays \plc{p0}, \plc{v1}, and \plc{v2} from the enclosing target data construct's
|
||||
device data environment. Thus, \plc{p1}, \plc{v3}, and \plc{v4} are already present in the device
|
||||
data environment.
|
||||
|
||||
\ffreeexample{target_data}{5}
|
||||
|
||||
\subsection{\code{target} \code{data} Construct with \code{if} Clause}
|
||||
\label{subsec:target_data_if}
|
||||
|
||||
The following two examples show how the \code{target} \code{data} construct
|
||||
maps variables to a device data environment.
|
||||
|
||||
In the following example, the \code{if} clause on the \code{target} \code{data} construct
|
||||
indicates that if the variable \plc{N} is smaller than a given threshold, then the \code{target}
|
||||
\code{data} construct will not create a device data environment.
|
||||
|
||||
The \code{target} constructs enclosed in the \code{target} \code{data} region
|
||||
must also use an \code{if} clause on the same condition, otherwise the pointer
|
||||
variable \plc{p} is implicitly mapped with a map-type of \code{tofrom}, but the storage
|
||||
location for the array section \plc{p[0:N]} will not be mapped in the device data environments
|
||||
of the \code{target} constructs.
|
||||
|
||||
\cexample{target_data}{6}
|
||||
|
||||
The \code{if} clauses work the same way for the following Fortran code. The \code{target}
|
||||
constructs enclosed in the \code{target} \code{data} region should also use
|
||||
an \code{if} clause with the same condition, so that the \code{target} \code{data}
|
||||
region and the \code{target} region are either both created for the device, or
|
||||
are both ignored.
|
||||
|
||||
\ffreeexample{target_data}{6}
|
||||
|
||||
In the following example, when the \code{if} clause conditional expression on
|
||||
the \code{target} construct evaluates to \plc{false}, the \code{target} region will
|
||||
execute on the host device. However, the \code{target} \code{data} construct
|
||||
created an enclosing device data environment that mapped \plc{p[0:N]} to a device data
|
||||
environment on the default device. At the end of the \code{target} \code{data}
|
||||
region the array section \plc{p[0:N]} will be assigned from the device data environment
|
||||
to the corresponding variable in the data environment of the task that encountered
|
||||
the \code{target} \code{data} construct, resulting in undefined values in \plc{p[0:N]}.
|
||||
|
||||
\cexample{target_data}{7}
|
||||
|
||||
The \code{if} clauses work the same way for the following Fortran code. When
|
||||
the \code{if} clause conditional expression on the \code{target} construct
|
||||
evaluates to \plc{false}, the \code{target} region will execute on the host
|
||||
device. However, the \code{target} \code{data} construct created an enclosing
|
||||
device data environment that mapped the \plc{p} array (and \plc{v1} and \plc{v2}) to a device data
|
||||
environment on the default target device. At the end of the \code{target} \code{data}
|
||||
region the \plc{p} array will be assigned from the device data environment to the corresponding
|
||||
variable in the data environment of the task that encountered the \code{target}
|
||||
\code{data} construct, resulting in undefined values in \plc{p}.
|
||||
|
||||
\ffreeexample{target_data}{7}
|
||||
|
@ -1,47 +0,0 @@
|
||||
%begin
|
||||
\pagebreak
|
||||
\section{\code{target} \code{enter} \code{data} and \code{target} \code{exit} \code{data} Constructs}
|
||||
\label{sec:target_enter_exit_data}
|
||||
%\section{Simple target enter data and target exit data Constructs}
|
||||
|
||||
The structured data construct (\code{target}~\code{data}) provides persistent data on a
|
||||
device for subsequent \code{target} constructs as shown in the
|
||||
\code{target}~\code{data} examples above. This is accomplished by creating a single
|
||||
\code{target}~\code{data} region containing \code{target} constructs.
|
||||
|
||||
The unstructured data constructs allow the creation and deletion of data on
|
||||
the device at any appropriate point within the host code, as shown below
|
||||
with the \code{target}~\code{enter}~\code{data} and \code{target}~\code{exit}~\code{data} constructs.
|
||||
|
||||
The following C++ code creates/deletes a vector in a constructor/destructor
|
||||
of a class. The constructor creates a vector with \code{target}~\code{enter}~\code{data}
|
||||
and uses an \code{alloc} modifier in the \code{map} clause to avoid copying values
|
||||
to the device. The destructor deletes the data (\code{target}~\code{exit}~\code{data})
|
||||
and uses the \code{delete} modifier in the \code{map} clause to avoid copying data
|
||||
back to the host. Note, the stand-alone \code{target}~\code{enter}~\code{data} occurs
|
||||
after the host vector is created, and the \code{target}~\code{exit}~\code{data}
|
||||
construct occurs before the host data is deleted.
|
||||
|
||||
\cppexample{target_unstructured_data}{1}
|
||||
|
||||
The following C code allocates and frees the data member of a Matrix structure.
|
||||
The \code{init\_matrix} function allocates the memory used in the structure and
|
||||
uses the \code{target}~\code{enter}~\code{data} directive to map it to the target device. The
|
||||
\code{free\_matrix} function removes the mapped array from the target device
|
||||
and then frees the memory on the host. Note, the stand-alone
|
||||
\code{target}~\code{enter}~\code{data} occurs after the host memory is allocated, and the
|
||||
\code{target}~\code{exit}~\code{data} construct occurs before the host data is freed.
|
||||
|
||||
\cexample{target_unstructured_data}{1}
|
||||
|
||||
The following Fortran code allocates and deallocates a module array. The
|
||||
\code{initialize} subroutine allocates the module array and uses the
|
||||
\code{target}~\code{enter}~\code{data} directive to map it to the target device. The
|
||||
\code{finalize} subroutine removes the mapped array from the target device and
|
||||
then deallocates the array on the host. Note, the stand-alone
|
||||
\code{target}~\code{enter}~\code{data} occurs after the host memory is allocated, and the
|
||||
\code{target}~\code{exit}~\code{data} construct occurs before the host data is deallocated.
|
||||
|
||||
\ffreeexample{target_unstructured_data}{1}
|
||||
%end
|
||||
|
@ -1,55 +0,0 @@
|
||||
\pagebreak
|
||||
\section{\code{target} \code{update} Construct}
|
||||
\label{sec:target_update}
|
||||
|
||||
\subsection{Simple \code{target} \code{data} and \code{target} \code{update} Constructs}
|
||||
\label{subsec:target_data_and_update}
|
||||
|
||||
The following example shows how the \code{target} \code{update} construct updates
|
||||
variables in a device data environment.
|
||||
|
||||
The \code{target} \code{data} construct maps array sections \plc{v1[:N]} and \plc{v2[:N]}
|
||||
(arrays \plc{v1} and \plc{v2} in the Fortran code) into a device data environment.
|
||||
|
||||
The task executing on the host device encounters the first \code{target} region
|
||||
and waits for the completion of the region.
|
||||
|
||||
After the execution of the first \code{target} region, the task executing on
|
||||
the host device then assigns new values to \plc{v1[:N]} and \plc{v2[:N]} (\plc{v1} and \plc{v2} arrays
|
||||
in Fortran code) in the task's data environment by calling the function \code{init\_again()}.
|
||||
|
||||
The \code{target} \code{update} construct assigns the new values of \plc{v1} and
|
||||
\plc{v2} from the task's data environment to the corresponding mapped array sections
|
||||
in the device data environment of the \code{target} \code{data} construct.
|
||||
|
||||
The task executing on the host device then encounters the second \code{target}
|
||||
region and waits for the completion of the region.
|
||||
|
||||
The second \code{target} region uses the updated values of \plc{v1[:N]} and \plc{v2[:N]}.
|
||||
|
||||
\cexample{target_update}{1}
|
||||
|
||||
\ffreeexample{target_update}{1}
|
||||
|
||||
\subsection{\code{target} \code{update} Construct with \code{if} Clause}
|
||||
\label{subsec:target_update_if}
|
||||
|
||||
The following example shows how the \code{target} \code{update} construct updates
|
||||
variables in a device data environment.
|
||||
|
||||
The \code{target} \code{data} construct maps array sections \plc{v1[:N]} and \plc{v2[:N]}
|
||||
(arrays \plc{v1} and \plc{v2} in the Fortran code) into a device data environment. In between
|
||||
the two \code{target} regions, the task executing on the host device conditionally
|
||||
assigns new values to \plc{v1} and \plc{v2} in the task's data environment. The function \code{maybe\_init\_again()}
|
||||
returns \plc{true} if new data is written.
|
||||
|
||||
When the conditional expression (the return value of \code{maybe\_init\_again()}) in the
|
||||
\code{if} clause is \plc{true}, the \code{target} \code{update} construct
|
||||
assigns the new values of \plc{v1} and \plc{v2} from the task's data environment to the corresponding
|
||||
mapped array sections in the \code{target} \code{data} construct's device data
|
||||
environment.
|
||||
|
||||
\cexample{target_update}{2}
|
||||
|
||||
\ffreeexample{target_update}{2}
|
||||
|
@ -1,77 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Task Dependences}
|
||||
\label{sec:task_depend}
|
||||
|
||||
\subsection{Flow Dependence}
|
||||
\label{subsec:task_flow_depend}
|
||||
|
||||
In this example we show a simple flow dependence expressed using the \code{depend}
|
||||
clause on the \code{task} construct.
|
||||
|
||||
\cexample{task_dep}{1}
|
||||
|
||||
\ffreeexample{task_dep}{1}
|
||||
|
||||
The program will always print \texttt{"}x = 2\texttt{"}, because the \code{depend}
|
||||
clauses enforce the ordering of the tasks. If the \code{depend} clauses had been
|
||||
omitted, then the tasks could execute in any order and the program
|
||||
would have a race condition.
|
||||
|
||||
\subsection{Anti-dependence}
|
||||
\label{subsec:task_anti_depend}
|
||||
|
||||
In this example we show an anti-dependence expressed using the \code{depend}
|
||||
clause on the \code{task} construct.
|
||||
|
||||
\cexample{task_dep}{2}
|
||||
|
||||
\ffreeexample{task_dep}{2}
|
||||
|
||||
The program will always print \texttt{"}x = 1\texttt{"}, because the \code{depend}
|
||||
clauses enforce the ordering of the tasks. If the \code{depend} clauses had been
|
||||
omitted, then the tasks could execute in any order and the program would have a
|
||||
race condition.
|
||||
|
||||
\subsection{Output Dependence}
|
||||
\label{subsec:task_out_depend}
|
||||
|
||||
In this example we show an output dependence expressed using the \code{depend}
|
||||
clause on the \code{task} construct.
|
||||
|
||||
\cexample{task_dep}{3}
|
||||
|
||||
\ffreeexample{task_dep}{3}
|
||||
|
||||
The program will always print \texttt{"}x = 2\texttt{"}, because the \code{depend}
|
||||
clauses enforce the ordering of the tasks. If the \code{depend} clauses had been
|
||||
omitted, then the tasks could execute in any order and the program would have a
|
||||
race condition.
|
||||
|
||||
\subsection{Concurrent Execution with Dependences}
|
||||
\label{subsec:task_concurrent_depend}
|
||||
|
||||
In this example we show potentially concurrent execution of tasks using multiple
|
||||
flow dependences expressed using the \code{depend} clause on the \code{task}
|
||||
construct.
|
||||
|
||||
\cexample{task_dep}{4}
|
||||
|
||||
\ffreeexample{task_dep}{4}
|
||||
|
||||
The last two tasks are dependent on the first task. However, there is no dependence
|
||||
between the last two tasks, which may execute in any order (or concurrently if
|
||||
more than one thread is available). Thus, the possible outputs are \texttt{"}x
|
||||
+ 1 = 3. x + 2 = 4. \texttt{"} and \texttt{"}x + 2 = 4. x + 1 = 3. \texttt{"}.
|
||||
If the \code{depend} clauses had been omitted, then all of the tasks could execute
|
||||
in any order and the program would have a race condition.
|
||||
|
||||
\subsection{Matrix multiplication}
|
||||
\label{subsec:task_matrix_mult}
|
||||
|
||||
This example shows a task-based blocked matrix multiplication. Matrices are of
|
||||
NxN elements, and the multiplication is implemented using blocks of BSxBS elements.
|
||||
|
||||
\cexample{task_dep}{5}
|
||||
|
||||
\ffreeexample{task_dep}{5}
|
||||
|
@ -1,22 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Task Priority}
|
||||
\label{sec:task_priority}
|
||||
|
||||
|
||||
|
||||
%\subsection{Task Priority}
|
||||
%\label{subsec:task_priority}
|
||||
|
||||
In this example we compute arrays in a matrix through a \plc{compute\_array} routine.
|
||||
Each task has a priority value equal to the value of the loop variable \plc{i} at the
|
||||
moment of its creation. A higher priority on a task means that a task is a candidate
|
||||
to run sooner.
|
||||
|
||||
The creation of tasks occurs in ascending order (according to the iteration space of
|
||||
the loop) but a hint, by means of the \code{priority} clause, is provided to reverse
|
||||
the execution order.
|
||||
|
||||
\cexample{task_priority}{1}
|
||||
|
||||
\ffreeexample{task_priority}{1}
|
||||
|
@ -1,20 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{taskgroup} Construct}
|
||||
\label{sec:taskgroup}
|
||||
|
||||
In this example, tasks are grouped and synchronized using the \code{taskgroup}
|
||||
construct.
|
||||
|
||||
Initially, one task (the task executing the \code{start\_background\_work()}
|
||||
call) is created in the \code{parallel} region, and later a parallel tree traversal
|
||||
is started (the task executing the root of the recursive \code{compute\_tree()}
|
||||
calls). While synchronizing tasks at the end of each tree traversal, using the
|
||||
\code{taskgroup} construct ensures that the formerly started background task
|
||||
does not participate in the synchronization, and is left free to execute in parallel.
|
||||
This is opposed to the behaviour of the \code{taskwait} construct, which would
|
||||
include the background tasks in the synchronization.
|
||||
|
||||
\cexample{taskgroup}{1}
|
||||
|
||||
\ffreeexample{taskgroup}{1}
|
||||
|
@ -1,190 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{task} and \code{taskwait} Constructs}
|
||||
\label{sec:task_taskwait}
|
||||
|
||||
The following example shows how to traverse a tree-like structure using explicit
|
||||
tasks. Note that the \code{traverse} function should be called from within a
|
||||
parallel region for the different specified tasks to be executed in parallel. Also
|
||||
note that the tasks will be executed in no specified order because there are no
|
||||
synchronization directives. Thus, assuming that the traversal will be done in post
|
||||
order, as in the sequential code, is wrong.
|
||||
|
||||
\cexample{tasking}{1}
|
||||
|
||||
\ffreeexample{tasking}{1}
|
||||
|
||||
In the next example, we force a postorder traversal of the tree by adding a \code{taskwait}
|
||||
directive. Now, we can safely assume that the left and right sons have been executed
|
||||
before we process the current node.
|
||||
|
||||
\cexample{tasking}{2}
|
||||
|
||||
\ffreeexample{tasking}{2}
|
||||
|
||||
The following example demonstrates how to use the \code{task} construct to process
|
||||
elements of a linked list in parallel. The thread executing the \code{single}
|
||||
region generates all of the explicit tasks, which are then executed by the threads
|
||||
in the current team. The pointer \plc{p} is \code{firstprivate} by default
|
||||
on the \code{task} construct so it is not necessary to specify it in a \code{firstprivate}
|
||||
clause.
|
||||
|
||||
\cexample{tasking}{3}
|
||||
|
||||
\ffreeexample{tasking}{3}
|
||||
|
||||
The \code{fib()} function should be called from within a \code{parallel} region
|
||||
for the different specified tasks to be executed in parallel. Also, only one thread
|
||||
of the \code{parallel} region should call \code{fib()} unless multiple concurrent
|
||||
Fibonacci computations are desired.
|
||||
|
||||
\cexample{tasking}{4}
|
||||
|
||||
\fexample{tasking}{4}
|
||||
|
||||
Note: There are more efficient algorithms for computing Fibonacci numbers. This
|
||||
classic recursion algorithm is for illustrative purposes.
|
||||
|
||||
The following example demonstrates a way to generate a large number of tasks with
|
||||
one thread and execute them with the threads in the team. While generating these
|
||||
tasks, the implementation may reach its limit on unassigned tasks. If it does,
|
||||
the implementation is allowed to cause the thread executing the task generating
|
||||
loop to suspend its task at the task scheduling point in the \code{task} directive,
|
||||
and start executing unassigned tasks. Once the number of unassigned tasks is sufficiently
|
||||
low, the thread may resume execution of the task generating loop.
|
||||
|
||||
\cexample{tasking}{5}
|
||||
\pagebreak
|
||||
\fexample{tasking}{5}
|
||||
|
||||
The following example is the same as the previous one, except that the tasks are
|
||||
generated in an untied task. While generating the tasks, the implementation may
|
||||
reach its limit on unassigned tasks. If it does, the implementation is allowed
|
||||
to cause the thread executing the task generating loop to suspend its task at the
|
||||
task scheduling point in the \code{task} directive, and start executing unassigned
|
||||
tasks. If that thread begins execution of a task that takes a long time to complete,
|
||||
the other threads may complete all the other tasks before it is finished.
|
||||
|
||||
In this case, since the loop is in an untied task, any other thread is eligible
|
||||
to resume the task generating loop. In the previous examples, the other threads
|
||||
would be forced to idle until the generating thread finishes its long task, since
|
||||
the task generating loop was in a tied task.
|
||||
|
||||
\cexample{tasking}{6}
|
||||
|
||||
\fexample{tasking}{6}
|
||||
|
||||
The following two examples demonstrate how the scheduling rules illustrated in
|
||||
Section 2.11.3 of the OpenMP 4.0 specification affect the usage of
|
||||
\code{threadprivate} variables in tasks. A \code{threadprivate}
|
||||
variable can be modified by another task that is executed by the same thread. Thus,
|
||||
the value of a \code{threadprivate} variable cannot be assumed to be unchanged
|
||||
across a task scheduling point. In untied tasks, task scheduling points may be
|
||||
added in any place by the implementation.
|
||||
|
||||
A task switch may occur at a task scheduling point. A single thread may execute
|
||||
both of the task regions that modify \code{tp}. The parts of these task regions
|
||||
in which \code{tp} is modified may be executed in any order so the resulting
|
||||
value of \code{var} can be either 1 or 2.
|
||||
|
||||
\cexample{tasking}{7}
|
||||
|
||||
|
||||
\fexample{tasking}{7}
|
||||
|
||||
In this example, scheduling constraints prohibit a thread in the team from executing
|
||||
a new task that modifies \code{tp} while another such task region tied to the
|
||||
same thread is suspended. Therefore, the value written will persist across the
|
||||
task scheduling point.
|
||||
|
||||
\cexample{tasking}{8}
|
||||
|
||||
|
||||
\fexample{tasking}{8}
|
||||
|
||||
The following two examples demonstrate how the scheduling rules illustrated in
|
||||
Section 2.11.3 of the OpenMP 4.0 specification affect the usage of locks
|
||||
and critical sections in tasks. If a lock is held
|
||||
across a task scheduling point, no attempt should be made to acquire the same lock
|
||||
in any code that may be interleaved. Otherwise, a deadlock is possible.
|
||||
|
||||
In the example below, suppose the thread executing task 1 defers task 2. When
|
||||
it encounters the task scheduling point at task 3, it could suspend task 1 and
|
||||
begin task 2 which will result in a deadlock when it tries to enter critical region
|
||||
1.
|
||||
|
||||
\cexample{tasking}{9}
|
||||
|
||||
|
||||
\fexample{tasking}{9}
|
||||
|
||||
In the following example, \code{lock} is held across a task scheduling point.
|
||||
However, according to the scheduling restrictions, the executing thread can't
|
||||
begin executing one of the non-descendant tasks that also acquires \code{lock} before
|
||||
the task region is complete. Therefore, no deadlock is possible.
|
||||
|
||||
\cexample{tasking}{10}
|
||||
|
||||
|
||||
\ffreeexample{tasking}{10}
|
||||
|
||||
The following examples illustrate the use of the \code{mergeable} clause in the
|
||||
\code{task} construct. In this first example, the \code{task} construct has
|
||||
been annotated with the \code{mergeable} clause. The addition of this clause
|
||||
allows the implementation to reuse the data environment (including the ICVs) of
|
||||
the parent task for the task inside \code{foo} if the task is included or undeferred.
|
||||
Thus, the result of the execution may differ depending on whether the task is merged
|
||||
or not. Therefore, the \code{mergeable} clause needs to be used with caution. In this example,
|
||||
the use of the \code{mergeable} clause is safe. As \code{x} is a shared variable, the
|
||||
outcome does not depend on whether or not the task is merged (that is, the task
|
||||
will always increment the same variable and will always compute the same value
|
||||
for \code{x}).
|
||||
|
||||
\cexample{tasking}{11}
|
||||
|
||||
\ffreeexample{tasking}{11}
|
||||
|
||||
This second example shows an incorrect use of the \code{mergeable} clause. In
|
||||
this example, the created task will access different instances of the variable
|
||||
\code{x} if the task is not merged, as \code{x} is \code{firstprivate}, but
|
||||
it will access the same variable \code{x} if the task is merged. As a result,
|
||||
the behavior of the program is unspecified and it can print two different values
|
||||
for \code{x} depending on the decisions taken by the implementation.
|
||||
|
||||
\cexample{tasking}{12}
|
||||
|
||||
\ffreeexample{tasking}{12}
|
||||
|
||||
The following example shows the use of the \code{final} clause and the \code{omp\_in\_final}
|
||||
API call in a recursive binary search program. To reduce overhead, once a certain
|
||||
depth of recursion is reached the program uses the \code{final} clause to create
|
||||
only included tasks, which allow additional optimizations.
|
||||
|
||||
The use of the \code{omp\_in\_final} API call allows programmers to optimize
|
||||
their code by specifying which parts of the program are not necessary when a task
|
||||
can create only included tasks (that is, the code is inside a \code{final} task).
|
||||
In this example, the use of a different state variable is not necessary so once
|
||||
the program reaches the part of the computation that is finalized, copying from
|
||||
the parent state to the new state is eliminated. The allocation of \code{new\_state}
|
||||
in the stack could also be avoided but it would make this example less clear. The
|
||||
\code{final} clause is most effective when used in conjunction with the \code{mergeable}
|
||||
clause since all tasks created in a \code{final} task region are included tasks
|
||||
that can be merged if the \code{mergeable} clause is present.
|
||||
|
||||
\cexample{tasking}{13}
|
||||
|
||||
\ffreeexample{tasking}{13}
|
||||
|
||||
The following example illustrates the difference between the \code{if} and the
|
||||
\code{final} clauses. The \code{if} clause has a local effect. In the first
|
||||
nest of tasks, the one that has the \code{if} clause will be undeferred but
|
||||
the task nested inside that task will not be affected by the \code{if} clause
|
||||
and will be created as usual. Alternatively, the \code{final} clause affects
|
||||
all \code{task} constructs in the \code{final} task region but not the \code{final}
|
||||
task itself. In the second nest of tasks, the nested tasks will be created as included
|
||||
tasks. Note also that the conditions for the \code{if} and \code{final} clauses
|
||||
are usually the opposite.
|
||||
|
||||
\cexample{tasking}{14}
|
||||
|
||||
\ffreeexample{tasking}{14}
|
||||
|
@ -1,14 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{taskloop} Construct}
|
||||
\label{sec:taskloop}
|
||||
|
||||
The following example illustrates how to execute a long-running task concurrently with tasks created
|
||||
with a \code{taskloop} directive for a loop having unbalanced amounts of work for its iterations.
|
||||
|
||||
The \code{grainsize} clause specifies that each task is to execute at least 500 iterations of the loop.
|
||||
|
||||
The \code{nogroup} clause removes the implicit taskgroup of the \code{taskloop} construct; the explicit \code{taskgroup} construct in the example ensures that the function is not exited before the long-running task and the loops have finished execution.
|
||||
|
||||
\cexample{taskloop}{1}
|
||||
|
||||
\ffreeexample{taskloop}{1}
|
@ -1,14 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{taskyield} Construct}
|
||||
\label{sec:taskyield}
|
||||
|
||||
The following example illustrates the use of the \code{taskyield} directive.
|
||||
The tasks in the example compute something useful and then do some computation
|
||||
that must be done in a critical region. By using \code{taskyield} when a task
|
||||
cannot get access to the \code{critical} region, the implementation can suspend
|
||||
the current task and schedule some other task that can do something useful.
|
||||
|
||||
\cexample{taskyield}{1}
|
||||
|
||||
\ffreeexample{taskyield}{1}
|
||||
|
@ -1,124 +0,0 @@
|
||||
\pagebreak
|
||||
\section{\code{teams} Constructs}
|
||||
\label{sec:teams}
|
||||
|
||||
\subsection{\code{target} and \code{teams} Constructs with \code{omp\_get\_num\_teams}\\
|
||||
and \code{omp\_get\_team\_num} Routines}
|
||||
\label{subsec:teams_api}
|
||||
|
||||
The following example shows how the \code{target} and \code{teams} constructs
|
||||
are used to create a league of thread teams that execute a region. The \code{teams}
|
||||
construct creates a league of at most two teams where the master thread of each
|
||||
team executes the \code{teams} region.
|
||||
|
||||
The \code{omp\_get\_num\_teams} routine returns the number of teams executing in a \code{teams}
|
||||
region. The \code{omp\_get\_team\_num} routine returns the team number, which is an integer
|
||||
between 0 and one less than the value returned by \code{omp\_get\_num\_teams}. The following
|
||||
example manually distributes a loop across two teams.
|
||||
|
||||
\cexample{teams}{1}
|
||||
|
||||
\ffreeexample{teams}{1}
|
||||
|
||||
\subsection{\code{target}, \code{teams}, and \code{distribute} Constructs}
|
||||
\label{subsec:teams_distribute}
|
||||
|
||||
The following example shows how the \code{target}, \code{teams}, and \code{distribute}
|
||||
constructs are used to execute a loop nest in a \code{target} region. The \code{teams}
|
||||
construct creates a league and the master thread of each team executes the \code{teams}
|
||||
region. The \code{distribute} construct schedules the subsequent loop iterations
|
||||
across the master threads of each team.
|
||||
|
||||
The number of teams in the league is less than or equal to the variable \plc{num\_blocks}.
|
||||
Each team in the league has a number of threads less than or equal to the variable
|
||||
\plc{block\_threads}. The iterations in the outer loop are distributed among the master
|
||||
threads of each team.
|
||||
|
||||
When a team's master thread encounters the parallel loop construct before the inner
|
||||
loop, the other threads in its team are activated. The team executes the \code{parallel}
|
||||
region and then workshares the execution of the loop.
|
||||
|
||||
Each master thread executing the \code{teams} region has a private copy of the
|
||||
variable \plc{sum} that is created by the \code{reduction} clause on the \code{teams} construct.
|
||||
The master thread and all threads in its team have a private copy of the variable
|
||||
\plc{sum} that is created by the \code{reduction} clause on the parallel loop construct.
|
||||
The second private \plc{sum} is reduced into the master thread's private copy of \plc{sum}
|
||||
created by the \code{teams} construct. At the end of the \code{teams} region,
|
||||
each master thread's private copy of \plc{sum} is reduced into the final \plc{sum} that is
|
||||
implicitly mapped into the \code{target} region.
|
||||
|
||||
\cexample{teams}{2}
|
||||
|
||||
\ffreeexample{teams}{2}
|
||||
|
||||
\subsection{\code{target} \code{teams} and Distribute Parallel Loop Constructs}
|
||||
\label{subsec:teams_distribute_parallel}
|
||||
|
||||
The following example shows how the \code{target} \code{teams} and distribute
|
||||
parallel loop constructs are used to execute a \code{target} region. The \code{target}
|
||||
\code{teams} construct creates a league of teams where the master thread of each
|
||||
team executes the \code{teams} region.
|
||||
|
||||
The distribute parallel loop construct schedules the loop iterations across the
|
||||
master threads of each team and then across the threads of each team.
|
||||
|
||||
\cexample{teams}{3}
|
||||
|
||||
\ffreeexample{teams}{3}
|
||||
|
||||
\subsection{\code{target} \code{teams} and Distribute Parallel Loop
|
||||
Constructs with Scheduling Clauses}
|
||||
\label{subsec:teams_distribute_parallel_schedule}
|
||||
|
||||
The following example shows how the \code{target} \code{teams} and distribute
|
||||
parallel loop constructs are used to execute a \code{target} region. The \code{teams}
|
||||
construct creates a league of at most eight teams where the master thread of each
|
||||
team executes the \code{teams} region. The number of threads in each team is
|
||||
less than or equal to 16.
|
||||
|
||||
The \code{distribute} parallel loop construct schedules the subsequent loop iterations
|
||||
across the master threads of each team and then across the threads of each team.
|
||||
|
||||
The \code{dist\_schedule} clause on the distribute parallel loop construct indicates
|
||||
that loop iterations are distributed to the master thread of each team in chunks
|
||||
of 1024 iterations.
|
||||
|
||||
The \code{schedule} clause indicates that the 1024 iterations distributed to
|
||||
a master thread are then assigned to the threads in its associated team in chunks
|
||||
of 64 iterations.
|
||||
|
||||
\cexample{teams}{4}
|
||||
|
||||
\ffreeexample{teams}{4}
|
||||
|
||||
\subsection{\code{target} \code{teams} and \code{distribute} \code{simd} Constructs}
|
||||
\label{subsec:teams_distribute_simd}
|
||||
|
||||
The following example shows how the \code{target} \code{teams} and \code{distribute}
|
||||
\code{simd} constructs are used to execute a loop in a \code{target} region.
|
||||
The \code{target} \code{teams} construct creates a league of teams where the
|
||||
master thread of each team executes the \code{teams} region.
|
||||
|
||||
The \code{distribute} \code{simd} construct schedules the loop iterations across
|
||||
the master thread of each team and then uses SIMD parallelism to execute the iterations.
|
||||
|
||||
\cexample{teams}{5}
|
||||
|
||||
\ffreeexample{teams}{5}
|
||||
|
||||
\subsection{\code{target} \code{teams} and Distribute Parallel Loop SIMD Constructs}
|
||||
\label{subsec:teams_distribute_parallel_simd}
|
||||
|
||||
The following example shows how the \code{target} \code{teams} and the distribute
|
||||
parallel loop SIMD constructs are used to execute a loop in a \code{target} \code{teams}
|
||||
region. The \code{target} \code{teams} construct creates a league of teams
|
||||
where the master thread of each team executes the \code{teams} region.
|
||||
|
||||
The distribute parallel loop SIMD construct schedules the loop iterations across
|
||||
the master thread of each team and then across the threads of each team where each
|
||||
thread uses SIMD parallelism.
|
||||
|
||||
\cexample{teams}{6}
|
||||
|
||||
\ffreeexample{teams}{6}
|
||||
|
@ -1,106 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{threadprivate} Directive}
|
||||
\label{sec:threadprivate}
|
||||
|
||||
The following examples demonstrate how to use the \code{threadprivate} directive
|
||||
to give each thread a separate counter.
|
||||
|
||||
\cexample{threadprivate}{1}
|
||||
|
||||
\fexample{threadprivate}{1}
|
||||
|
||||
\ccppspecificstart
|
||||
The following example uses \code{threadprivate} on a static variable:
|
||||
|
||||
\cnexample{threadprivate}{2}
|
||||
|
||||
The following example demonstrates unspecified behavior for the initialization
|
||||
of a \code{threadprivate} variable. A \code{threadprivate} variable is initialized
|
||||
once at an unspecified point before its first reference. Because \code{a} is
|
||||
constructed using the value of \code{x} (which is modified by the statement
|
||||
\code{x++}), the value of \code{a.val} at the start of the \code{parallel}
|
||||
region could be either 1 or 2. This problem is avoided for \code{b}, which uses
|
||||
an auxiliary \code{const} variable and a copy-constructor.
|
||||
|
||||
\cppnexample{threadprivate}{3}
|
||||
\ccppspecificend
|
||||
|
||||
The following examples show non-conforming uses and correct uses of the \code{threadprivate}
|
||||
directive.
|
||||
|
||||
\fortranspecificstart
|
||||
The following example is non-conforming because the common block is not declared
|
||||
local to the subroutine that refers to it:
|
||||
|
||||
\fnexample{threadprivate}{2}
|
||||
|
||||
The following example is also non-conforming because the common block is not declared
|
||||
local to the subroutine that refers to it:
|
||||
|
||||
\fnexample{threadprivate}{3}
|
||||
|
||||
The following example is a correct rewrite of the previous example:
|
||||
|
||||
\fnexample{threadprivate}{4}
|
||||
|
||||
The following is an example of the use of \code{threadprivate} for local variables:
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
|
||||
\fnexample{threadprivate}{5}
|
||||
|
||||
The above program, if executed by two threads, will print one of the following
|
||||
two sets of output:
|
||||
|
||||
\code{a = 11 12 13}
|
||||
\\
|
||||
\code{ptr = 4}
|
||||
\\
|
||||
\code{i = 15}
|
||||
|
||||
\code{A is not allocated}
|
||||
\\
|
||||
\code{ptr = 4}
|
||||
\\
|
||||
\code{i = 5}
|
||||
|
||||
or
|
||||
|
||||
\code{A is not allocated}
|
||||
\\
|
||||
\code{ptr = 4}
|
||||
\\
|
||||
\code{i = 15}
|
||||
|
||||
\code{a = 1 2 3}
|
||||
\\
|
||||
\code{ptr = 4}
|
||||
\\
|
||||
\code{i = 5}
|
||||
|
||||
The following is an example of the use of \code{threadprivate} for module variables:
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
|
||||
\fnexample{threadprivate}{6}
|
||||
\fortranspecificend
|
||||
|
||||
\cppspecificstart
|
||||
The following example illustrates initialization of \code{threadprivate} variables
|
||||
for class-type \code{T}. \code{t1} is default constructed, \code{t2} is constructed
|
||||
taking a constructor accepting one argument of integer type, \code{t3} is copy
|
||||
constructed with argument \code{f()}:
|
||||
|
||||
\cppnexample{threadprivate}{4}
|
||||
|
||||
The following example illustrates the use of \code{threadprivate} for static
|
||||
class members. The \code{threadprivate} directive for a static class member must
|
||||
be placed inside the class definition.
|
||||
|
||||
\cppnexample{threadprivate}{5}
|
||||
\cppspecificend
|
||||
|
@ -1,76 +0,0 @@
|
||||
\pagebreak
|
||||
\section{The \code{workshare} Construct}
|
||||
\fortranspecificstart
|
||||
\label{sec:workshare}
|
||||
|
||||
The following are examples of the \code{workshare} construct.
|
||||
|
||||
In the following example, \code{workshare} spreads work across the threads executing
|
||||
the \code{parallel} region, and there is a barrier after the last statement.
|
||||
Implementations must enforce Fortran execution rules inside of the \code{workshare}
|
||||
block.
|
||||
|
||||
\fnexample{workshare}{1}
|
||||
|
||||
In the following example, the barrier at the end of the first \code{workshare}
|
||||
region is eliminated with a \code{nowait} clause. Threads doing \code{CC =
|
||||
DD} immediately begin work on \code{EE = FF} when they are done with \code{CC
|
||||
= DD}.
|
||||
|
||||
\fnexample{workshare}{2}
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
|
||||
The following example shows the use of an \code{atomic} directive inside a \code{workshare}
|
||||
construct. The computation of \code{SUM(AA)} is workshared, but the update to
|
||||
\code{R} is atomic.
|
||||
|
||||
\fnexample{workshare}{3}
|
||||
|
||||
Fortran \code{WHERE} and \code{FORALL} statements are \emph{compound statements},
|
||||
made up of a \emph{control} part and a \emph{statement} part. When \code{workshare}
|
||||
is applied to one of these compound statements, both the control and the statement
|
||||
parts are workshared. The following example shows the use of a \code{WHERE} statement
|
||||
in a \code{workshare} construct.
|
||||
|
||||
Each task gets worked on in order by the threads:
|
||||
|
||||
\code{AA = BB} then
|
||||
\\
|
||||
\code{CC = DD} then
|
||||
\\
|
||||
\code{EE .ne. 0} then
|
||||
\\
|
||||
\code{FF = 1 / EE} then
|
||||
\\
|
||||
\code{GG = HH}
|
||||
|
||||
\fnexample{workshare}{4}
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
|
||||
In the following example, an assignment to a shared scalar variable is performed
|
||||
by one thread in a \code{workshare} while all other threads in the team wait.
|
||||
|
||||
\fnexample{workshare}{5}
|
||||
|
||||
The following example contains an assignment to a private scalar variable, which
|
||||
is performed by one thread in a \code{workshare} while all other threads wait.
|
||||
It is non-conforming because the private scalar variable is undefined after the
|
||||
assignment statement.
|
||||
|
||||
\fnexample{workshare}{6}
|
||||
|
||||
Fortran execution rules must be enforced inside a \code{workshare} construct.
|
||||
The same result is produced by the following program
|
||||
fragment regardless of whether the code is executed sequentially or inside an OpenMP
|
||||
program with multiple threads:
|
||||
|
||||
\fnexample{workshare}{7}
|
||||
\fortranspecificend
|
||||
|
||||
|
@ -1,18 +0,0 @@
|
||||
\pagebreak
|
||||
\section{Worksharing Constructs Inside a \code{critical} Construct}
|
||||
\label{sec:worksharing_critical}
|
||||
|
||||
The following example demonstrates using a worksharing construct inside a \code{critical}
|
||||
construct. This example is conforming because the worksharing \code{single}
|
||||
region is not closely nested inside the \code{critical} region. A single thread
|
||||
executes the one and only section in the \code{sections} region, and executes
|
||||
the \code{critical} region. The same thread encounters the nested \code{parallel}
|
||||
region, creates a new team of threads, and becomes the master of the new team.
|
||||
One of the threads in the new team enters the \code{single} region and increments
|
||||
\code{i} by \code{1}. At the end of this example \code{i} is equal to \code{2}.
|
||||
|
||||
\cexample{worksharing_critical}{1}
|
||||
|
||||
\fexample{worksharing_critical}{1}
|
||||
|
||||
|
48
Foreword_Chapt.tex
Normal file
48
Foreword_Chapt.tex
Normal file
@ -0,0 +1,48 @@
|
||||
\chapter*{Foreword}
|
||||
\label{chap:foreword}
|
||||
|
||||
The OpenMP Examples document has been updated with new features
|
||||
found in the OpenMP \SVER\ Specification.
|
||||
In order to provide users with new feature examples concurrently
|
||||
with the release of the OpenMP 6.0 Specification,
|
||||
the 6.0 Examples document is being released early
|
||||
with a caveat that some of the 6.0 features
|
||||
(such as the \kcode{workdistribute} construct, \kcode{taskgraph} construct,
|
||||
\kcode{threadset} clause and free-agent threads) will be covered
|
||||
in the next release of the document.
|
||||
For a list of the new examples and updates in this release,
|
||||
please refer to the Document Revision History of the Appendix on page~\pageref{chap:history}.
|
||||
|
||||
Text describing an example with a \SVER\ feature specifically states
|
||||
that the feature support begins in the OpenMP \SVER\ Specification. Also,
|
||||
an \kcode{\small{}omp_\SVER} keyword is included in the metadata of the source code.
|
||||
These distinctions are presented to remind readers that a \SVER\ compliant
|
||||
OpenMP implementation is necessary to use these features in codes.
|
||||
|
||||
%Examples for most of the \SVER\ features are included in this document,
|
||||
%and
|
||||
Incremental releases will become available as more feature examples
|
||||
and updates are submitted and approved by the OpenMP Examples Subcommittee.
|
||||
Examples are accepted for this document after discussions, revisions and reviews
|
||||
in the Examples Subcommittee, and two reviews/discussions and two votes
|
||||
in the OpenMP Language Committee.
|
||||
Draft examples are often derived from case studies for new features in the language,
|
||||
and are revised to illustrate the basic application of the features with code comments,
|
||||
and a text description. We are grateful to the numerous members of the Language Committee
|
||||
who took the time to prepare codes and descriptions, and shepherd them through
|
||||
the acceptance process. We sincerely appreciate the Examples Subcommittee members, who
|
||||
actively participated and contributed in weekly meetings over the years.
|
||||
|
||||
\bigskip
|
||||
Examples Subcommittee Co-chairs: \smallskip\linebreak
|
||||
Henry Jin (\textsc{NASA} Ames Research Center) \linebreak
|
||||
Swaroop Pophale (Oak Ridge National Laboratory)
|
||||
|
||||
\bigskip
|
||||
\bigskip
|
||||
Past Examples Subcommittee Co-chairs:
|
||||
\begin{itemize}
|
||||
\item Kent Milfeld (2014 - 2022)
|
||||
\end{itemize}
|
||||
|
||||
|
502
History.tex
502
History.tex
@ -1,39 +1,463 @@
|
||||
\chapter{Document Revision History}
|
||||
\cchapter{Document Revision History}{history}
|
||||
\label{chap:history}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.2.2 to 6.0}
|
||||
\label{sec:history_522_to_60}
|
||||
|
||||
\begin{itemize}
|
||||
\item General changes:
|
||||
\begin{itemize}
|
||||
\item Added a set of structured LaTeX environments for specifying
|
||||
language-dependent text. This allows extracting language-specific
|
||||
content of the Examples document. Refer to the content of
|
||||
\examplesblob{v6.0/Contributions.md} for details.
|
||||
\end{itemize}
|
||||
|
||||
\item Added the following examples for the 6.0 features:
|
||||
\begin{itemize}
|
||||
\item \kcode{omp::decl} attribute for declarative directives in C/C++
|
||||
(\specref{sec:attributes})
|
||||
\item \kcode{transparent} clause on the \kcode{task} construct to enable dependences
|
||||
between non-sibling tasks (\specref{subsec:depend_trans_task})
|
||||
\item Task dependences for \kcode{taskloop} construct
|
||||
(\specref{sec:taskloop_depend})
|
||||
\item \kcode{num_threads} clause that appears inside \kcode{target} region
|
||||
(\specref{subsec:target_teams_num_teams})
|
||||
\item \kcode{nowait} clause with argument on the \kcode{target} construct to control deferment
|
||||
of target task (\specref{subsec:async_target_nowait_arg})
|
||||
\item Traits for specifying devices (\specref{sec:device_env_traits})
|
||||
\item \kcode{apply} clause with modifier argument to
|
||||
support selective loop transformations
|
||||
(\specref{sec:apply_clause})
|
||||
\item Reduction on private variables in a \kcode{parallel} region
|
||||
(\specref{subsec:priv_reduction})
|
||||
\item \kcode{induction} clause (\specref{subsec:induction})
|
||||
and user-defined induction (\specref{subsec:user-defined-induction})
|
||||
\item \kcode{init_complete} clause for \kcode{scan} directive to
|
||||
support initialization phase in scan operation
|
||||
(\specref{sec:scan})
|
||||
\item \kcode{assume} construct with \kcode{no_openmp} and \kcode{no_parallelism} clauses (\specref{sec:assumption})
|
||||
\item \kcode{num_threads} clause with a list
|
||||
(\specref{subsec:icv_nthreads})
|
||||
\item \kcode{dispatch} construct to control variant substitution
|
||||
for a procedure call (\specref{sec:dispatch})
|
||||
\end{itemize}
|
||||
|
||||
\item Other changes:
|
||||
\begin{itemize}
|
||||
\item Changed attribute specifier as a directive form from C++ only to C/C++
|
||||
(\specref{chap:directive_syntax})
|
||||
\item Added missing \bcode{include <omp.h>} in Example \example{atomic.4.c}
|
||||
and \bcode{use omp_lib} in Example \example{atomic.4.f90}
|
||||
(\specref{sec:atomic_hint})
|
||||
\item Fixed the function declaration order for variant functions in
|
||||
Examples \example{selector_scoring.[12].c} and Fortran pointer
|
||||
initialization in Example \example{selector_scoring.2.f90}
|
||||
(\specref{subsec:context_selector_scoring})
|
||||
\item Replaced the deprecated use of \plc{combiner-exp}
|
||||
in \kcode{declare reduction} directive with \kcode{combiner} clause
|
||||
(\specref{subsec:UDR} and \specref{sec:Updated Examples})
|
||||
\item Fixed the initialization of Fortran pointers
|
||||
in Example \example{cancellation.2.f90} and changed to
|
||||
use \kcode{atomic write} for performing atomic writes
|
||||
(\specref{sec:cancellation})
|
||||
\item Added missing \kcode{declare target} directive for external procedure
|
||||
called inside \kcode{target} region in Example
|
||||
\example{requires.1.f90} (\specref{sec:requires})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.2.1 to 5.2.2}
|
||||
\label{sec:history_521_to_522}
|
||||
|
||||
\begin{itemize}
|
||||
\item To improve the style of the document, a set of macros was introduced
|
||||
and consistently used for language keywords, names, concepts, and user codes
|
||||
in the text description of the document. Refer to the content of
|
||||
\examplesblob{v5.2.2/Contributions.md}
|
||||
for details.
|
||||
|
||||
\item Added the following examples:
|
||||
\begin{itemize}
|
||||
\item Orphaned and nested \kcode{loop} constructs (\specref{sec:loop})
|
||||
\item \kcode{all} variable category for the \kcode{defaultmap} clause
|
||||
(\specref{sec:defaultmap})
|
||||
\item \kcode{target update} construct using a custom mapper
|
||||
(\specref{subsec:target_update_mapper})
|
||||
\item \kcode{indirect} clause for indirect procedure calls in a
|
||||
\kcode{target} region (\specref{subsec:indirect})
|
||||
\item \kcode{omp_target_memcpy_async} routine with depend object
|
||||
(\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item Synchronization hint for atomic operation (\specref{sec:atomic_hint})
|
||||
\item Implication of passing a shared variable to a procedure
|
||||
in Fortran (\specref{sec:fort_shared_var})
|
||||
\item Assumption directives for providing additional information
|
||||
about program properties (\specref{sec:assumption})
|
||||
\item Mapping behavior of scalars, pointers, references (C++) and associate names
|
||||
(Fortran) when unified shared memory is required
|
||||
(\specref{sec:requires})
|
||||
\item \kcode{begin declare variant} paired with \kcode{end declare variant}
|
||||
example to show use of nested declare variant
|
||||
directives (\specref{subsec:declare_variant})
|
||||
\item Explicit scoring in context selectors
|
||||
(\specref{subsec:context_selector_scoring})
|
||||
\end{itemize}
|
||||
|
||||
\item Miscellaneous changes:
|
||||
\begin{itemize}
|
||||
\item Included a general statement in Introduction about the number of
|
||||
threads used throughout the examples document (\specref{sec:examples})
|
||||
\item Clarified the mapping of virtual functions in \kcode{target} regions
|
||||
(\specref{sec:virtual_functions})
|
||||
\item Added missing \kcode{declare target} directive for procedures
|
||||
called inside \kcode{target} region in \example{Examples}
|
||||
\example{declare_mapper.1.f90} (\specref{sec:declare_mapper}),
|
||||
\example{target_reduction.*.f90} (\specref{subsec:target_reduction}),
|
||||
and \example{target_task_reduction.*.f90}
|
||||
(\specref{subsec:target_task_reduction})
|
||||
\item Added missing \kcode{end target} directive in
|
||||
\example{Example declare_mapper.3.f90}
|
||||
(\specref{sec:declare_mapper})
|
||||
\item Removed example for \kcode{flush} without a list from Synchronization
|
||||
since the example is confusing and the use of \kcode{flush} is already
|
||||
covered in other examples
|
||||
(\specref{chap:synchronization})
|
||||
\item \docref{declare variant Directive} and \docref{Metadirective} sections were moved to
|
||||
subsections in the new \docref{Context-based Variant Selection} section,
|
||||
with a section introduction on context selectors
|
||||
(\specref{sec:context_based_variants})
|
||||
\item Fixed a typo (`\kcode{for}' $\rightarrow$ `\kcode{do}') in
|
||||
\example{Example metadirective.4.f90}
|
||||
(\specref{subsec:metadirective})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.2 to 5.2.1}
|
||||
\label{sec:history_52_to_521}
|
||||
|
||||
\begin{itemize}
|
||||
\item General changes:
|
||||
\begin{itemize}
|
||||
\item Updated source metadata tags for all examples to use an improved form
|
||||
(see \examplesblob{v5.2.1/Contributions.md})
|
||||
\item Explicitly included the version tag \verlabel[pre\_]{3.0} in those
|
||||
examples that did not contain a version tag previously
|
||||
\end{itemize}
|
||||
|
||||
\item Added the following examples for the 5.2 features:
|
||||
\begin{itemize}
|
||||
\item \kcode{uses_allocators} clause for the use of allocators in
|
||||
\kcode{target} regions (\specref{sec:allocators})
|
||||
\end{itemize}
|
||||
\item Added the following examples for the 5.1 features:
|
||||
\begin{itemize}
|
||||
\item The \kcode{inoutset} dependence type (\specref{subsec:task_concurrent_depend})
|
||||
\item Atomic compare and capture (\specref{sec:cas})
|
||||
\end{itemize}
|
||||
\item Added the following examples for the 5.0 features:
|
||||
\begin{itemize}
|
||||
\item \kcode{declare target} directive with \kcode{device_type(nohost)}
|
||||
clause (\specref{subsec:declare_target_device_type})
|
||||
\item \kcode{omp_pause_resource} and \kcode{omp_pause_resource_all}
|
||||
routines (\specref{sec:pause_resource})
|
||||
\end{itemize}
|
||||
|
||||
\item Miscellaneous fixes:
|
||||
\begin{itemize}
|
||||
\item Cast to implementation-defined enum type \kcode{omp_event_handle_t}
|
||||
now uses \bcode{uintptr_t} (not \bcode{void *}) in
|
||||
\example{Example task_detach.2.c}
|
||||
(\specref{sec:task_detachment})
|
||||
\item Moved Fortran \kcode{requires} directive into program main (\ucode{rev_off}),
|
||||
the program unit, in \example{Example target_reverse_offload.7.f90}
|
||||
(\specref{subsec:target_reverse_offload})
|
||||
\item Fixed an inconsistent use of mapper in \example{Example target_mapper.3.f90}
|
||||
(\specref{sec:declare_mapper})
|
||||
\item Added a missing semicolon at end of \ucode{XOR1} class definition in
|
||||
\example{Example declare_target.2a.cpp}
|
||||
(\specref{subsec:declare_target_class})
|
||||
\item Fixed the placement of \kcode{declare simd} directive in
|
||||
\example{Examples linear_modifier.*.f90} (\specref{sec:linear_modifier})
|
||||
and added a general statement about where a Fortran declarative
|
||||
directive can appear (\specref{chap:directive_syntax})
|
||||
\item Fixed mismatched argument list in \example{Example fort_sa_private.5.f}
|
||||
(\specref{sec:fort_sa_private})
|
||||
\item Moved the placement of \kcode{declare target enter}
|
||||
directive after function declaration
|
||||
(\specref{subsec:target_task_reduction})
|
||||
\item Fixed an incorrect use of \kcode{omp_in_parallel} routine in
|
||||
\example{Example metadirective.4}
|
||||
(\specref{subsec:metadirective})
|
||||
\item Fixed an incorrect value for \kcode{at} clause
|
||||
(\specref{subsec:error})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.1 to 5.2}
|
||||
\label{sec:history_51_to_52}
|
||||
|
||||
\begin{itemize}
|
||||
\item General changes:
|
||||
\begin{itemize}
|
||||
\item Included a description of the semantics for OpenMP directive syntax
|
||||
(see \specref{chap:directive_syntax})
|
||||
\item Reorganized the Introduction Chapter and moved the Feature
|
||||
Deprecation Chapter to Appendix~\ref{chap:deprecated_features}
|
||||
\item Included a list of examples that were updated for feature deprecation
|
||||
and replacement in each version (see Appendix~\ref{sec:Updated Examples})
|
||||
\item Added Index entries
|
||||
\end{itemize}
|
||||
|
||||
\item Updated the examples for feature deprecation and replacement in OpenMP 5.2.
|
||||
See Table~\ref{tab:Deprecated Features} and
|
||||
Table~\ref{tab:Updated Examples 5.2} for details.
|
||||
|
||||
\item Added the following examples for the 5.2 features:
|
||||
\begin{itemize}
|
||||
\item Mapping class objects with virtual functions
|
||||
(\specref{sec:virtual_functions})
|
||||
\item \kcode{allocators} construct for Fortran \bcode{allocate} statement
|
||||
(\specref{sec:allocators})
|
||||
\item Behavior of reallocation of variables through OpenMP allocator in
|
||||
Fortran (\specref{sec:allocators})
|
||||
\end{itemize}
|
||||
|
||||
\item Added the following examples for the 5.1 features:
|
||||
\begin{itemize}
|
||||
\item Clarification of optional \kcode{end} directive for strictly structured
|
||||
block in Fortran (\specref{sec:fortran_free_format_comments})
|
||||
\item \kcode{filter} clause on \kcode{masked} construct (\specref{sec:masked})
|
||||
\item \kcode{omp_all_memory} reserved locator for specifying task dependences
|
||||
(\specref{subsec:depend_undefer_task})
|
||||
\item Behavior of Fortran allocatable variables in \kcode{target} regions
|
||||
(\specref{sec:fort_allocatable_array_mapping})
|
||||
\item Device memory routines in Fortran
|
||||
(\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item Partial tiles from \kcode{tile} construct
|
||||
(\specref{sec:incomplete_tiles})
|
||||
\item Fortran associate names and selectors in \kcode{target} region
|
||||
(\specref{sec:associate_target})
|
||||
\item \kcode{allocate} directive for variable declarations and
|
||||
\kcode{allocate} clause on \kcode{task} constructs
|
||||
(\specref{sec:allocators})
|
||||
\item Controlling concurrency and reproducibility with \kcode{order} clause
|
||||
(\specref{sec:reproducible_modifier})
|
||||
\end{itemize}
|
||||
|
||||
\item Added other examples:
|
||||
\begin{itemize}
|
||||
\item Using lambda expressions with \kcode{target} constructs
|
||||
(\specref{sec:lambda_expressions})
|
||||
\item Target memory and device pointer routines
|
||||
(\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item Examples to illustrate the ordering properties of
|
||||
the \plc{flush} operation (\specref{sec:mem_model})
|
||||
\item User selector in the \kcode{metadirective} directive
|
||||
(\specref{subsec:metadirective})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.0.1 to 5.1}
|
||||
\label{sec:history_501_to_51}
|
||||
|
||||
\begin{itemize}
|
||||
\item General changes:
|
||||
\begin{itemize}
|
||||
\item Replaced \kcode{master} construct example with equivalent \kcode{masked} construct example (\specref{sec:masked})
|
||||
\item Primary thread is now used to describe thread number 0 in the current team
|
||||
\item \kcode{primary} thread affinity policy is now used to specify that every
|
||||
thread in the team is assigned to the same place as the primary thread (\specref{subsec:affinity_primary})
|
||||
\item The \kcode{omp_lock_hint_*} constants have been renamed \kcode{omp_sync_hint_*} (\specref{sec:critical}, \specref{sec:locks})
|
||||
\end{itemize}
|
||||
|
||||
\item Added the following new chapters:
|
||||
\begin{itemize}
|
||||
\item Deprecated Features (on page~\pageref{chap:deprecated_features})
|
||||
\item Directive Syntax (\specref{chap:directive_syntax})
|
||||
\item Loop Transformations (\specref{chap:loop_transformations})
|
||||
\item OMPT Interface (\specref{chap:ompt_interface})
|
||||
\end{itemize}
|
||||
|
||||
\item Added the following examples for the 5.1 features:
|
||||
\begin{itemize}
|
||||
\item OpenMP directives in C++ \plc{attribute} specifiers
|
||||
(\specref{sec:attributes})
|
||||
\item Directive syntax adjustment to allow Fortran \bcode{BLOCK} ...
|
||||
\bcode{END BLOCK} as a structured block
|
||||
(\specref{sec:fortran_free_format_comments})
|
||||
\item \kcode{omp_target_is_accessible} API routine
|
||||
(\specref{sec:pointer_mapping})
|
||||
\item Fortran allocatable array mapping in \kcode{target} regions (\specref{sec:fort_allocatable_array_mapping})
|
||||
\item \kcode{begin declare target} (with
|
||||
\kcode{end declare target}) directive
|
||||
(\specref{subsec:declare_target_class})
|
||||
\item \kcode{tile} construct (\specref{sec:tile})
|
||||
\item \kcode{unroll} construct (\specref{sec:unroll})
|
||||
\item Reduction with the \kcode{scope} construct
|
||||
(\specref{subsec:reduction_scope})
|
||||
\item \kcode{metadirective} directive with dynamic \kcode{condition} selector
|
||||
(\specref{subsec:metadirective})
|
||||
\item \kcode{interop} construct (\specref{sec:interop})
|
||||
\item Environment display with the \kcode{omp_display_env} routine
|
||||
(\specref{subsec:display_env})
|
||||
\item \kcode{error} directive (\specref{subsec:error})
|
||||
\end{itemize}
|
||||
|
||||
\item Included additional examples for the 5.0 features:
|
||||
\begin{itemize}
|
||||
\item \kcode{collapse} clause for non-rectangular loop nest
|
||||
(\specref{sec:collapse})
|
||||
\item \kcode{detach} clause for tasks (\specref{sec:task_detachment})
|
||||
\item Pointer attachment for a structure member (\specref{sec:structure_mapping})
|
||||
\item Host and device pointer association with the \kcode{omp_target_associate_ptr} routine (\specref{sec:target_associate_ptr})
|
||||
|
||||
\item Sample code on activating the tool interface
|
||||
(\specref{sec:ompt_start})
|
||||
\end{itemize}
|
||||
|
||||
\item Added other examples:
|
||||
\begin{itemize}
|
||||
\item The \kcode{omp_get_wtime} routine (\specref{subsec:get_wtime})
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.0.0 to 5.0.1}
|
||||
\label{sec:history_50_to_501}
|
||||
|
||||
\begin{itemize}
|
||||
\item Added version tags \verlabel{\plc{x.y}} in example labels
|
||||
and the corresponding source codes for all examples that feature
|
||||
OpenMP 3.0 and later.
|
||||
|
||||
\item Included additional examples for the 5.0 features:
|
||||
|
||||
\begin{itemize}
|
||||
\item Extension to the \kcode{defaultmap} clause
|
||||
(\specref{sec:defaultmap})
|
||||
\item Transferring noncontiguous data with the \kcode{target update} directive in Fortran (\specref{sec:array-shaping})
|
||||
\item \kcode{conditional} modifier for the \kcode{lastprivate} clause (\specref{sec:lastprivate})
|
||||
\item \kcode{task} modifier for the \kcode{reduction} clause (\specref{subsec:task_reduction})
|
||||
\item Reduction on combined target constructs (\specref{subsec:target_reduction})
|
||||
\item Task reduction with \kcode{target} constructs
|
||||
(\specref{subsec:target_task_reduction})
|
||||
\item \kcode{scan} directive for returning the \emph{prefix sum} of a reduction (\specref{sec:scan})
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\item Included additional examples for the 4.x features:
|
||||
|
||||
\begin{itemize}
|
||||
\item Dependence for undeferred tasks
|
||||
(\specref{subsec:depend_undefer_task})
|
||||
\item \kcode{ref}, \kcode{val}, \kcode{uval} modifiers for \kcode{linear} clause (\specref{sec:linear_modifier})
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\item Clarified the description of pointer mapping and pointer attachment in
|
||||
\specref{sec:pointer_mapping}.
|
||||
\item Clarified the description of memory model examples
|
||||
in \specref{sec:mem_model}.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\section{Changes from 4.5.0 to 5.0.0}
|
||||
\label{sec:history_45_to_50}
|
||||
|
||||
\begin{itemize}
|
||||
\item Added the following examples for the 5.0 features:
|
||||
|
||||
\begin{itemize}
|
||||
\item Extended \kcode{teams} construct for host execution (\specref{sec:host_teams})
|
||||
\item \kcode{loop} and \kcode{teams loop} constructs specify loop iterations that can execute concurrently
|
||||
(\specref{sec:loop})
|
||||
\item Task data affinity is indicated by \kcode{affinity} clause of \kcode{task} construct
|
||||
(\specref{sec: task_affinity})
|
||||
\item Display thread affinity with \kcode{OMP_DISPLAY_AFFINITY} environment variable or \kcode{omp_display_affinity()} API routine
|
||||
(\specref{sec:affinity_display})
|
||||
\item \kcode{taskwait} with dependences (\specref{subsec:taskwait_depend})
|
||||
\item \kcode{mutexinoutset} task dependences (\specref{subsec:task_dep_mutexinoutset})
|
||||
\item Multidependence Iterators (in \kcode{depend} clauses) (\specref{subsec:depend_iterator})
|
||||
\item Combined constructs: \kcode{parallel master taskloop} and \kcode{parallel master taskloop simd}
|
||||
(\specref{sec:parallel_masked_taskloop})
|
||||
\item Reverse Offload through \kcode{ancestor} modifier of \kcode{device} clause (\specref{subsec:target_reverse_offload})
|
||||
\item Pointer Mapping - behavior of mapped pointers (\specref{sec:pointer_mapping}) %Example_target_ptr_map*
|
||||
\item Structure Mapping - behavior of mapped structures (\specref{sec:structure_mapping}) %Examples_target_structure_mapping.tex target_struct_map*
|
||||
\item Array Shaping with the \plc{shape-operator} (\specref{sec:array-shaping})
|
||||
\item The \kcode{declare mapper} directive (\specref{sec:declare_mapper})
|
||||
\item Acquire and Release Semantics Synchronization: Memory ordering
|
||||
clauses \kcode{acquire}, \kcode{release}, and \kcode{acq_rel} were added
|
||||
to flush and atomic constructs
|
||||
(\specref{sec:acquire_and_release_semantics})
|
||||
\item \kcode{depobj} construct provides dependence objects for subsequent use in \kcode{depend} clauses
|
||||
(\specref{sec:depobj})
|
||||
\item \kcode{reduction} clause for \kcode{task} construct (\specref{subsec:task_reduction})
|
||||
\item \kcode{reduction} clause for \kcode{taskloop} construct (\specref{subsec:taskloop_reduction})
|
||||
\item \kcode{reduction} clause for \kcode{taskloop simd} construct (\specref{subsec:taskloop_reduction})
|
||||
\item Memory Allocators for making OpenMP memory requests with traits (\specref{sec:allocators})
|
||||
\item \kcode{requires} directive specifies required features of implementation (\specref{sec:requires})
|
||||
\item \kcode{declare variant} directive - for function variants
|
||||
(\specref{subsec:declare_variant})
|
||||
\item \kcode{metadirective} directive - for directive variants
|
||||
(\specref{subsec:metadirective})
|
||||
\item \kcode{OMP_TARGET_OFFLOAD} Environment Variable - controls offload behavior (\specref{sec:target_offload})
|
||||
\end{itemize}
|
||||
|
||||
\item Included the following additional examples for the 4.x features:
|
||||
\begin{itemize}
|
||||
\item more taskloop examples (\specref{sec:taskloop})
|
||||
\item user-defined reduction (UDR) (\specref{subsec:UDR})
|
||||
%NEW 5.0
|
||||
%\item \code{target} \code{enter} and \code{exit} \code{data} unstructured data constructs (\specref{sec:target_enter_exit_data}) %Example_target_unstructured_data.* ?
|
||||
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
|
||||
\section{Changes from 4.0.2 to 4.5.0}
|
||||
\begin{itemize}
|
||||
\item Reorganized into chapters of major topics
|
||||
\item Included file extensions in example labels to indicate source type
|
||||
\item Applied the explicit \code{map(tofrom)} for scalar variables
|
||||
in a number of examples to comply with
|
||||
the change of the default behavior for scalar variables from
|
||||
\code{map(tofrom)} to \code{firstprivate} in the 4.5 specification
|
||||
\item Applied the explicit \kcode{map(tofrom)} for scalar variables
|
||||
in a number of examples to comply with
|
||||
the change of the default behavior for scalar variables from
|
||||
\kcode{map(tofrom)} to \kcode{firstprivate} in the 4.5 specification
|
||||
\item Added the following new examples:
|
||||
|
||||
\begin{itemize}
|
||||
\item \code{linear} clause in loop constructs (\specref{sec:linear_in_loop})
|
||||
\item task priority (\specref{sec:task_priority})
|
||||
\item \code{taskloop} construct (\specref{sec:taskloop})
|
||||
\item \plc{directive-name} modifier in multiple \code{if} clauses on
|
||||
a combined construct (\specref{subsec:target_if})
|
||||
\item unstructured data mapping (\specref{sec:target_enter_exit_data})
|
||||
\item \code{link} clause for \code{declare}~\code{target} directive
|
||||
(\specref{subsec:declare_target_link})
|
||||
\item asynchronous target execution with \code{nowait} clause (\specref{sec:async_target_exec_depend})
|
||||
\item device memory routines and device pointers
|
||||
(\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item doacross loop nest (\specref{sec:doacross})
|
||||
\item locks with hints (\specref{sec:locks})
|
||||
\item C/C++ array reduction (\specref{sec:reduction})
|
||||
\item \kcode{linear} clause in loop constructs (\specref{sec:linear_in_loop})
|
||||
\item \kcode{priority} clause for \kcode{task} construct (\specref{sec:task_priority})
|
||||
\item \kcode{taskloop} construct (\specref{sec:taskloop})
|
||||
\item \plc{directive-name} modifier in multiple \kcode{if} clauses on
|
||||
a combined construct (\specref{subsec:target_if})
|
||||
\item unstructured data mapping (\specref{sec:target_enter_exit_data})
|
||||
\item \kcode{link} clause for \kcode{declare target} directive
|
||||
(\specref{subsec:declare_target_link})
|
||||
\item asynchronous target execution with \kcode{nowait} clause (\specref{sec:async_target_exec_depend})
|
||||
\item device memory routines and device pointers (\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item doacross loop nest (\specref{sec:doacross})
|
||||
\item locks with hints (\specref{sec:locks})
|
||||
\item C/C++ array reduction (\specref{subsec:reduction})
|
||||
\item C++ reference types in data sharing clauses (\specref{sec:cpp_reference})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\section{Changes from 4.0.1 to 4.0.2}
|
||||
|
||||
\begin{itemize}
|
||||
\item Names of examples were changed from numbers to mnemonics
|
||||
\item Added SIMD examples (\specref{sec:SIMD})
|
||||
\item Added SIMD examples (\specref{sec:SIMD})
|
||||
\item Applied miscellaneous fixes in several source codes
|
||||
\item Added the revision history
|
||||
\end{itemize}
|
||||
@ -42,27 +466,29 @@ a combined construct (\specref{subsec:target_if})
|
||||
|
||||
Added the following new examples:
|
||||
\begin{itemize}
|
||||
\item the \code{proc\_bind} clause (\specref{sec:affinity})
|
||||
\item the \code{taskgroup} construct (\specref{sec:taskgroup})
|
||||
\item the \kcode{proc_bind} clause (\specref{sec:affinity})
|
||||
\item the \kcode{taskgroup} construct (\specref{sec:taskgroup})
|
||||
\end{itemize}
|
||||
|
||||
\section{Changes from 3.1 to 4.0}
|
||||
|
||||
Beginning with OpenMP 4.0, examples were placed in a separate document
|
||||
from the specification document.
|
||||
|
||||
Version 4.0 added the following new examples:
|
||||
\begin{itemize}
|
||||
\item task dependences (\specref{sec:task_depend})
|
||||
\item \code{target} construct (\specref{sec:target})
|
||||
\item \code{target} \code{data} construct (\specref{sec:target_data})
|
||||
\item \code{target} \code{update} construct (\specref{sec:target_update})
|
||||
\item \code{declare} \code{target} construct (\specref{sec:declare_target})
|
||||
\item \code{teams} constructs (\specref{sec:teams})
|
||||
\item asynchronous execution of a \code{target} region using tasks
|
||||
(\specref{subsec:async_target_with_tasks})
|
||||
\item array sections in device constructs (\specref{sec:array_sections})
|
||||
\item device runtime routines (\specref{sec:device})
|
||||
\item Fortran ASSOCIATE construct (\specref{sec:associate})
|
||||
\item cancellation constructs (\specref{sec:cancellation})
|
||||
\item Beginning with OpenMP 4.0, examples were placed in a separate document
|
||||
from the specification document.
|
||||
\item Version 4.0 added the following new examples:
|
||||
|
||||
\begin{itemize}
|
||||
\item task dependences (\specref{sec:task_depend})
|
||||
\item \kcode{target} construct (\specref{sec:target})
|
||||
\item array sections in device constructs (\specref{sec:array_sections})
|
||||
\item \kcode{target data} construct (\specref{sec:target_data})
|
||||
\item \kcode{target update} construct (\specref{sec:target_update})
|
||||
\item \kcode{declare target} directive (\specref{sec:declare_target})
|
||||
\item \kcode{teams} constructs (\specref{sec:teams})
|
||||
\item asynchronous execution of a \kcode{target} region using tasks (\specref{subsec:async_target_with_tasks})
|
||||
\item device runtime routines (\specref{sec:device})
|
||||
\item Fortran ASSOCIATE construct (\specref{sec:associate})
|
||||
\item cancellation constructs (\specref{sec:cancellation})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
122
Makefile
122
Makefile
@ -1,22 +1,41 @@
|
||||
# Makefile for the OpenMP Examples document in LaTeX format.
|
||||
# For more information, see the master document, openmp-examples.tex.
|
||||
# For more information, see the main document, openmp-examples.tex.
|
||||
SHELL=bash
|
||||
|
||||
include versioninfo
|
||||
|
||||
version=4.5.0
|
||||
default: openmp-examples.pdf
|
||||
diff: clean openmp-diff-abridged.pdf
|
||||
|
||||
release: VERSIONSTR="$(version_date)"
|
||||
release: clean openmp-examples.pdf
|
||||
|
||||
book: BOOK_BUILD="\\def\\bookbuild{1}"
|
||||
book: clean release
|
||||
mv openmp-examples-${version}.pdf openmp-examples-${version}-book.pdf
|
||||
|
||||
ccpp-only: LANG_OPT="\\ccpptrue\\fortranfalse"
|
||||
ccpp-only: clean release
|
||||
|
||||
fortran-only: LANG_OPT="\\ccppfalse\\fortrantrue"
|
||||
fortran-only: clean release
|
||||
|
||||
CHAPTERS=Title_Page.tex \
|
||||
Introduction_Chapt.tex \
|
||||
Examples_*.tex \
|
||||
History.tex
|
||||
Foreword_Chapt.tex \
|
||||
Chap_*.tex \
|
||||
Deprecated_Features.tex \
|
||||
History.tex \
|
||||
*/*.tex
|
||||
|
||||
SOURCES=sources/*.c \
|
||||
sources/*.cpp \
|
||||
sources/*.f90 \
|
||||
sources/*.f
|
||||
SOURCES=*/sources/*.c \
|
||||
*/sources/*.cpp \
|
||||
*/sources/*.f90 \
|
||||
*/sources/*.f
|
||||
|
||||
INTERMEDIATE_FILES=openmp-examples.pdf \
|
||||
openmp-examples.toc \
|
||||
openmp-examples.lof \
|
||||
openmp-examples.lot \
|
||||
openmp-examples.idx \
|
||||
openmp-examples.aux \
|
||||
openmp-examples.ilg \
|
||||
@ -24,13 +43,90 @@ INTERMEDIATE_FILES=openmp-examples.pdf \
|
||||
openmp-examples.out \
|
||||
openmp-examples.log
|
||||
|
||||
openmp-examples.pdf: $(CHAPTERS) $(SOURCES) openmp.sty openmp-examples.tex openmp-logo.png
|
||||
LATEXCMD=pdflatex -interaction=batchmode -file-line-error
|
||||
LATEXDCMD=$(LATEXCMD) -draftmode
|
||||
|
||||
# check for branches names with "name_XXX"
|
||||
DIFF_TICKET_ID=$(shell git rev-parse --abbrev-ref HEAD)
|
||||
GITREV=$(shell git rev-parse --short HEAD || echo "??")
|
||||
VERSIONSTR="GIT rev $(GITREV)"
|
||||
LANG_OPT="\\ccpptrue\\fortrantrue"
|
||||
|
||||
openmp-examples.pdf: $(CHAPTERS) $(SOURCES) openmp.sty openmp-examples.tex openmp-logo.png generated-include.tex
|
||||
rm -f $(INTERMEDIATE_FILES)
|
||||
pdflatex -interaction=batchmode -file-line-error openmp-examples.tex
|
||||
pdflatex -interaction=batchmode -file-line-error openmp-examples.tex
|
||||
pdflatex -interaction=batchmode -file-line-error openmp-examples.tex
|
||||
touch generated-include.tex
|
||||
$(LATEXDCMD) openmp-examples.tex
|
||||
makeindex -s openmp-index.ist openmp-examples.idx
|
||||
$(LATEXDCMD) openmp-examples.tex
|
||||
$(LATEXCMD) openmp-examples.tex
|
||||
cp openmp-examples.pdf openmp-examples-${version}.pdf
|
||||
|
||||
check:
|
||||
sources/check_tags
|
||||
|
||||
clean:
|
||||
rm -f $(INTERMEDIATE_FILES)
|
||||
rm -f generated-include.tex
|
||||
rm -f openmp-diff-full.pdf openmp-diff-abridged.pdf
|
||||
rm -rf *.tmpdir
|
||||
cd util; make clean
|
||||
rm -f chk_tags.log sources/*.log
|
||||
|
||||
realclean: clean
|
||||
rm -f openmp-examples-${version}.pdf openmp-examples-${version}-book.pdf
|
||||
|
||||
ifdef DIFF_TO
|
||||
VC_DIFF_TO := -r ${DIFF_TO}
|
||||
else
|
||||
VC_DIFF_TO :=
|
||||
endif
|
||||
ifdef DIFF_FROM
|
||||
VC_DIFF_FROM := -r ${DIFF_FROM}
|
||||
else
|
||||
VC_DIFF_FROM := -r work_6.0
|
||||
endif
|
||||
|
||||
DIFF_TO:=HEAD
|
||||
DIFF_FROM:=work_6.0
|
||||
DIFF_TYPE:=UNDERLINE
|
||||
|
||||
COMMON_DIFF_OPTS:=--math-markup=whole \
|
||||
--append-safecmd=plc,code,kcode,scode,ucode,vcode,splc,bcode,pvar,pout,example \
|
||||
--append-textcmd=subsubsubsection
|
||||
|
||||
VC_DIFF_OPTS:=${COMMON_DIFF_OPTS} --force -c latexdiff.cfg --flatten --type="${DIFF_TYPE}" --git --pdf ${VC_DIFF_FROM} ${VC_DIFF_TO} --subtype=ZLABEL --graphics-markup=none
|
||||
|
||||
VC_DIFF_MINIMAL_OPTS:= --only-changes --force
|
||||
|
||||
generated-include.tex:
|
||||
echo "$(BOOK_BUILD)" > $@
|
||||
echo "\\def\\VER{${version}}" >> $@
|
||||
echo "\\def\\SVER{${version_spec}}" >> $@
|
||||
echo "\\def\\VERDATE{${VERSIONSTR}}" >> $@
|
||||
@echo "\\newif\\ifccpp\\newif\\iffortran" >> $@
|
||||
echo "$(LANG_OPT)" >> $@
|
||||
util/list_tags -vtag */sources/* >> $@
|
||||
|
||||
%.tmpdir: $(wildcard *.sty) $(wildcard *.png) $(wildcard *.aux) openmp-examples.pdf
|
||||
mkdir -p $@/sources
|
||||
for i in affinity devices loop_transformations parallel_execution SIMD tasking \
|
||||
data_environment memory_model program_control synchronization \
|
||||
directives ompt_interface; do \
|
||||
mkdir -p $@/$$i; ln -sf "$$PWD"/$$i/sources $@/$$i/sources; done
|
||||
mkdir -p $@/figs
|
||||
cp -f $^ "$@/"
|
||||
cp -f sources/* "$@/sources"
|
||||
cp -f figs/* "$@/figs"
|
||||
|
||||
openmp-diff-abridged.pdf: diff-fast-minimal.tmpdir openmp-examples.pdf
|
||||
env PATH="$(shell pwd)/util/latexdiff:$(PATH)" latexdiff-vc ${VC_DIFF_MINIMAL_OPTS} --fast -d $< ${VC_DIFF_OPTS} openmp-examples.tex
|
||||
cp $</openmp-examples.pdf $@
|
||||
if [ "x$(DIFF_TICKET_ID)" != "x" ]; then cp $@ ${@:.pdf=-$(DIFF_TICKET_ID).pdf}; fi
|
||||
|
||||
# Slow but portable diffs
|
||||
openmp-diff-minimal.pdf: diffs-slow-minimal.tmpdir
|
||||
env PATH="$(shell pwd)/util/latexdiff:$(PATH)" latexdiff-vc ${VC_DIFF_MINIMAL_OPTS} -d $< ${VC_DIFF_OPTS} openmp-examples.tex
|
||||
cp $</openmp-examples.pdf $@
|
||||
if [ "x$(DIFF_TICKET_ID)" != "x" ]; then cp $@ ${@:.pdf=-$(DIFF_TICKET_ID).pdf}; fi
|
||||
|
||||
.PHONY: diff default book clean realclean
|
||||
|
64
README
64
README
@ -1,64 +0,0 @@
|
||||
This is the OpenMP Examples document in LaTeX format.
|
||||
Please see the master file, openmp-examples.tex, for more information.
|
||||
|
||||
For a brief revision history, please see Changes.log.
|
||||
|
||||
For copyright information, please see omp_copyright.txt.
|
||||
|
||||
|
||||
1) Process for adding an example
|
||||
|
||||
- Prepare source code and text description
|
||||
- Give a high level description in a trac ticket
|
||||
- Determine a name (ename) for the example
|
||||
- Propose a new name if creating a new chapter
|
||||
- Use the existing name if adding to an existing chapter
|
||||
- Number the example within the chapter (seq-no)
|
||||
- Create files for the source code with proper tags in
|
||||
sources/Example_<ename>.<seq-no>c.c
|
||||
sources/Example_<ename>.<seq-no>f.f
|
||||
- Create or update the description text in the chapter file
|
||||
Examples_<ename>.tex
|
||||
- If needed, add the new chapter file name in
|
||||
Makefile
|
||||
openmp-examples.tex
|
||||
- Commit the changes in git and push to the GitHub repo
|
||||
- Discuss and vote in committee
|
||||
|
||||
2) Tags (meta data) for example sources
|
||||
|
||||
@@name: <ename>.<seq-no>[c|f]
|
||||
@@type: C|C++|F-fixed|F-free
|
||||
@@compilable: yes|no|maybe
|
||||
@@linkable: yes|no|maybe
|
||||
@@expect: success|failure|nothing|rt-error
|
||||
|
||||
"name" is the name of an example
|
||||
"type" is the source code type, which can be translated into or from
|
||||
proper file extension (c,cpp,f,f90)
|
||||
"compilable" indicates whether the source code is compilable
|
||||
"linkable" indicates whether the source code is linkable
|
||||
"expect" indicates some expected result for testing purpose
|
||||
"success|failure|nothing" applies to the result of code compilation
|
||||
"rt-error" is for a case where compilation may be successful,
|
||||
but the code contains potential runtime issues (such as a race condition).
|
||||
Alternative would be to just use "conforming" or "non-conforming".
|
||||
|
||||
3) LaTeX macros for examples
|
||||
|
||||
- Source code with language h-rules
|
||||
\cexample{<ename>}{<seq-no>c}
|
||||
\fexample{<ename>}{<seq-no>f}
|
||||
|
||||
- Source code without language h-rules
|
||||
\cnexample{<ename>}{<seq-no>c}
|
||||
\fnexample{<ename>}{<seq-no>f}
|
||||
|
||||
- Language h-rules
|
||||
\cspecificstart, \cspecificend
|
||||
\cppspecificstart, \cppspecificend
|
||||
\ccppspecificstart, \ccppspecificend
|
||||
\fortranspecificstart, \fortranspecificend
|
||||
|
||||
- See openmp.sty for more information
|
||||
|
12
README.md
12
README.md
@ -1,2 +1,10 @@
|
||||
# Examples
|
||||
LaTeX Examples Document Source
|
||||
# OpenMP Examples Document
|
||||
|
||||
This is the OpenMP Examples document in LaTeX format.
|
||||
|
||||
Please see [Contributions.md](Contributions.md) on how to make contributions to adding new examples.
|
||||
|
||||
For a brief revision history, please see [Changes.log](Changes.log).
|
||||
|
||||
For copyright information, please see [omp_copyright.txt](omp_copyright.txt).
|
||||
|
||||
|
150
SIMD/SIMD.tex
Normal file
150
SIMD/SIMD.tex
Normal file
@ -0,0 +1,150 @@
|
||||
%\pagebreak
|
||||
\section{\kcode{simd} and \kcode{declare simd} Directives}
|
||||
\label{sec:SIMD}
|
||||
|
||||
\index{constructs!simd@\kcode{simd}}
|
||||
\index{simd construct@\kcode{simd} construct}
|
||||
The following example illustrates the basic use of the \kcode{simd} construct
|
||||
to assure the compiler that the loop can be vectorized.
|
||||
|
||||
\cexample[4.0]{SIMD}{1}
|
||||
|
||||
\ffreeexample[4.0]{SIMD}{1}
|
||||
|
||||
|
||||
\index{directives!declare simd@\kcode{declare simd}}
|
||||
\index{declare simd directive@\kcode{declare simd} directive}
|
||||
\index{clauses!uniform@\kcode{uniform}}
|
||||
\index{uniform clause@\kcode{uniform} clause}
|
||||
\index{clauses!linear@\kcode{linear}}
|
||||
\index{linear clause@\kcode{linear} clause}
|
||||
When a function can be inlined within a loop the compiler has an opportunity to
|
||||
vectorize the loop. By guaranteeing SIMD behavior of a function's operations,
|
||||
characterizing the arguments of the function and privatizing temporary
|
||||
variables of the loop, the compiler can often create faster, vector code for
|
||||
the loop. In the examples below the \kcode{declare simd} directive is
|
||||
used on the \ucode{add1} and \ucode{add2} functions to enable creation of their
|
||||
corresponding SIMD function versions for execution within the associated SIMD
|
||||
loop. The functions characterize two different approaches of accessing data
|
||||
within the function: by a single variable and as an element in a data array,
|
||||
respectively. The \ucode{add3} C function uses dereferencing.
|
||||
|
||||
The \kcode{declare simd} directives also illustrate the use of
|
||||
\kcode{uniform} and \kcode{linear} clauses. The \kcode{uniform(\ucode{fact})} clause
|
||||
indicates that the variable \ucode{fact} is invariant across the SIMD lanes. In
|
||||
the \ucode{add2} function \ucode{a} and \ucode{b} are included in the \kcode{uniform}
|
||||
list because the C pointer and the Fortran array references are constant. The
|
||||
\ucode{i} index used in the \ucode{add2} function is included in a \kcode{linear}
|
||||
clause with a constant-linear-step of 1, to guarantee a unity increment of the
|
||||
associated loop. In the \kcode{declare simd} directive for the \ucode{add3}
|
||||
C function the \kcode{linear(\ucode{a,b:1})} clause instructs the compiler to generate
|
||||
unit-stride loads across the SIMD lanes; otherwise, costly \emph{gather}
|
||||
instructions would be generated for the unknown sequence of access of the
|
||||
pointer dereferences.
|
||||
|
||||
In the \kcode{simd} constructs for the loops the \kcode{private(\ucode{tmp})} clause is
|
||||
necessary to assure that each vector operation has its own \ucode{tmp}
|
||||
variable.
|
||||
|
||||
\cexample[4.0]{SIMD}{2}
|
||||
|
||||
\ffreeexample[4.0]{SIMD}{2}
|
||||
|
||||
%\pagebreak
|
||||
\index{clauses!private@\kcode{private}}
|
||||
\index{private clause@\kcode{private} clause}
|
||||
\index{clauses!reduction@\kcode{reduction}}
|
||||
\index{reduction clause@\kcode{reduction} clause}
|
||||
\index{reductions!reduction clause@\kcode{reduction} clause}
|
||||
A thread that encounters a SIMD construct executes vectorized code for the
|
||||
iterations. Similar to the concerns of a worksharing loop a loop vectorized
|
||||
with a SIMD construct must assure that temporary and reduction variables are
|
||||
privatized and declared as reductions with clauses. The example below
|
||||
illustrates the use of \kcode{private} and \kcode{reduction} clauses in a SIMD
|
||||
construct.
|
||||
|
||||
\cexample[4.0]{SIMD}{3}
|
||||
|
||||
\ffreeexample[4.0]{SIMD}{3}
|
||||
|
||||
|
||||
%\pagebreak
|
||||
\index{clauses!safelen@\kcode{safelen}}
|
||||
\index{safelen clause@\kcode{safelen} clause}
|
||||
A \kcode{safelen(\ucode{N})} clause in a \kcode{simd} construct assures the compiler that
|
||||
there are no loop-carried dependences for vectors of size \ucode{N} or below. If
|
||||
the \kcode{safelen} clause is not specified, then the default safelen value is
|
||||
the number of loop iterations.
|
||||
|
||||
The \kcode{safelen(\ucode{16})} clause in the example below guarantees that the vector
|
||||
code is safe for vectors up to and including size 16. In the loop, \ucode{m} can
|
||||
be 16 or greater, for correct code execution. If the value of \ucode{m} is less
|
||||
than 16, the behavior is undefined.
|
||||
|
||||
\cexample[4.0]{SIMD}{4}
|
||||
|
||||
\ffreeexample[4.0]{SIMD}{4}
|
||||
|
||||
%\pagebreak
|
||||
\index{clauses!collapse@\kcode{collapse}}
|
||||
\index{collapse clause@\kcode{collapse} clause}
|
||||
The following SIMD construct instructs the compiler to collapse the \ucode{i} and
|
||||
\ucode{j} loops into a single SIMD loop in which SIMD chunks are executed by
|
||||
threads of the team. Within the workshared loop chunks of a thread, the SIMD
|
||||
chunks are executed in the lanes of the vector units.
|
||||
|
||||
\cexample[4.0]{SIMD}{5}
|
||||
|
||||
\ffreeexample[4.0]{SIMD}{5}
|
||||
|
||||
|
||||
%%% section
|
||||
\section{\kcode{inbranch} and \kcode{notinbranch} Clauses}
|
||||
\label{sec:SIMD_branch}
|
||||
\index{clauses!inbranch@\kcode{inbranch}}
|
||||
\index{inbranch clause@\kcode{inbranch} clause}
|
||||
\index{clauses!notinbranch@\kcode{notinbranch}}
|
||||
\index{notinbranch clause@\kcode{notinbranch} clause}
|
||||
|
||||
The following examples illustrate the use of the \kcode{declare simd}
|
||||
directive with the \kcode{inbranch} and \kcode{notinbranch} clauses. The
|
||||
\kcode{notinbranch} clause informs the compiler that the function \ucode{foo} is
|
||||
never called conditionally in the SIMD loop of the function \ucode{myaddint}. On
|
||||
the other hand, the \kcode{inbranch} clause for the function \ucode{goo} indicates that
|
||||
the function is always called conditionally in the SIMD loop inside
|
||||
the function \ucode{myaddfloat}.
|
||||
|
||||
\cexample[4.0]{SIMD}{6}
|
||||
|
||||
\ffreeexample[4.0]{SIMD}{6}
|
||||
|
||||
|
||||
In the code below, the function \ucode{fib()} is called in the main program and
|
||||
also recursively called in the function \ucode{fib()} within an \bcode{if}
|
||||
condition. The compiler creates a masked vector version and a non-masked vector
|
||||
version for the function \ucode{fib()} while retaining the original scalar
|
||||
version of the \ucode{fib()} function.
|
||||
|
||||
\cexample[4.0]{SIMD}{7}
|
||||
|
||||
\ffreeexample[4.0]{SIMD}{7}
|
||||
|
||||
|
||||
|
||||
%%% section
|
||||
%\pagebreak
|
||||
\section{Loop-Carried Lexical Forward Dependence}
|
||||
\label{sec:SIMD_forward_dep}
|
||||
\index{dependences!loop-carried lexical forward}
|
||||
|
||||
|
||||
The following example tests the restriction on a SIMD loop with a loop-carried lexical forward dependence. This dependence must be preserved for the correct execution of SIMD loops.
|
||||
|
||||
A loop can be vectorized even though the iterations are not completely independent when it has loop-carried dependences that are forward lexical dependences, indicated in the code below by the read of \ucode{A[j+1]} and the write to \ucode{A[j]} in C/C++ code (or \ucode{A(j+1)} and \ucode{A(j)} in Fortran). That is, the ordering of the read of \ucode{A[j+1]} (or \ucode{A(j+1)} in Fortran) before the write to \ucode{A[j]} (or \ucode{A(j)} in Fortran) must be preserved for each iteration in \ucode{j} for valid SIMD code generation.
|
||||
|
||||
This test assures that the compiler preserves the loop-carried lexical forward dependence when generating correct SIMD code.
|
||||
|
||||
\cexample[4.0]{SIMD}{8}
|
||||
|
||||
\ffreeexample[4.0]{SIMD}{8}
|
||||
|
83
SIMD/linear_modifier.tex
Normal file
83
SIMD/linear_modifier.tex
Normal file
@ -0,0 +1,83 @@
|
||||
%%% section
|
||||
\section{\kcode{ref}, \kcode{val}, \kcode{uval} Modifiers for \kcode{linear} Clause}
|
||||
\label{sec:linear_modifier}
|
||||
\index{modifiers, linear@modifiers, \kcode{linear}!ref@\kcode{ref}}
|
||||
\index{modifiers, linear@modifiers, \kcode{linear}!val@\kcode{val}}
|
||||
\index{modifiers, linear@modifiers, \kcode{linear}!uval@\kcode{uval}}
|
||||
\index{clauses!linear@\kcode{linear}}
|
||||
\index{linear clause@\kcode{linear} clause}
|
||||
|
||||
When generating vector functions from \kcode{declare simd} directives,
|
||||
it is important for a compiler to know the proper types of function arguments in
|
||||
order to generate efficient code.
|
||||
This is especially true for C++ reference types and Fortran arguments.
|
||||
|
||||
In the following example, the function \ucode{add_one2} has a C++ reference
|
||||
parameter (or Fortran argument) \ucode{p}. Variable \ucode{p} gets incremented by 1 in the function.
|
||||
The caller loop \ucode{i} in the main program passes
|
||||
a variable \ucode{k} as a reference to the function \ucode{add_one2} call.
|
||||
The \kcode{ref} modifier for the \kcode{linear} clause on the
|
||||
\kcode{declare simd} directive specifies that the
|
||||
reference-type parameter \ucode{p} is to match the property of the variable
|
||||
\ucode{k} in the loop.
|
||||
This use of reference type is equivalent to the second call to
|
||||
\ucode{add_one2} with a direct passing of the array element \ucode{a[i]}.
|
||||
In the example, the preferred vector
|
||||
length 8 is specified for both the caller loop and the callee function.
|
||||
|
||||
When \kcode{linear(\ucode{p}: ref)} is applied to an argument passed by reference,
|
||||
it tells the compiler that the addresses in its vector argument are consecutive,
|
||||
and so the compiler can generate a single vector load or store instead of
|
||||
a gather or scatter. This allows more efficient SIMD code to be generated with
|
||||
fewer source changes.
|
||||
|
||||
\cppexample[5.2]{linear_modifier}{1}
|
||||
\ffreeexample[5.2]{linear_modifier}{1}
|
||||
%\clearpage
|
||||
|
||||
|
||||
The following example is a variant of the above example. The function \ucode{add_one2}
|
||||
in the C++ code includes an additional C++ reference parameter \ucode{i}.
|
||||
The loop index \ucode{i} of the caller loop \ucode{i} in the main program
|
||||
is passed as a reference to the function \ucode{add_one2} call.
|
||||
The loop index \ucode{i} has a uniform address with
|
||||
linear value of step 1 across SIMD lanes.
|
||||
Thus, the \kcode{uval} modifier is used for the \kcode{linear} clause
|
||||
to specify that the C++ reference-type parameter \ucode{i} is to match
|
||||
the property of loop index \ucode{i}.
|
||||
|
||||
In the corresponding Fortran code the arguments \ucode{p} and
|
||||
\ucode{i} in the routine \ucode{add_one2} are passed by reference.
|
||||
Similar modifiers are used for these variables in the \kcode{linear} clauses
|
||||
to match with the property at the caller loop in the main program.
|
||||
|
||||
When \kcode{linear(\ucode{i}: uval)} is applied to an argument passed by reference, it
|
||||
tells the compiler that its addresses in the vector argument are uniform
|
||||
so that the compiler can generate a scalar load or scalar store and create
|
||||
linear values. This allows more efficient SIMD code to be generated with
|
||||
fewer source changes.
|
||||
|
||||
\cppexample[5.2]{linear_modifier}{2}
|
||||
\ffreeexample[5.2]{linear_modifier}{2}
|
||||
|
||||
In the following example, the function \ucode{func} takes arrays \ucode{x} and \ucode{y}
|
||||
as arguments, and accesses the array elements referenced by the index \ucode{i}.
|
||||
The caller loop \ucode{i} in the main program passes a linear copy of
|
||||
the variable \ucode{k} to the function \ucode{func}.
|
||||
The \kcode{val} modifier is used for the \kcode{linear} clause
|
||||
in the \kcode{declare simd} directive for the function
|
||||
\ucode{func} to specify that the argument \ucode{i} is to match the property of
|
||||
the actual argument \ucode{k} passed in the SIMD loop.
|
||||
Arrays \ucode{x} and \ucode{y} have uniform addresses across SIMD lanes.
|
||||
|
||||
When \kcode{linear(\ucode{i}: val,step(\ucode{1}))} is applied to an argument,
|
||||
it tells the compiler that its addresses in the vector argument may not be
|
||||
consecutive; however, their values are linear (with stride 1 here). When the value of \ucode{i} is used
|
||||
in the subscript of an array reference (e.g., \ucode{x[i]}), the compiler can generate
|
||||
a vector load or store instead of a gather or scatter. This allows more
|
||||
efficient SIMD code to be generated with fewer source changes.
|
||||
|
||||
\cexample[5.2]{linear_modifier}{3}
|
||||
\ffreeexample[5.2]{linear_modifier}{3}
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: SIMD.1c
|
||||
* @@name: SIMD.1
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
void star( double *a, double *b, double *c, int n, int *ioff )
|
||||
{
|
@ -1,8 +1,8 @@
|
||||
! @@name: SIMD.1f
|
||||
! @@name: SIMD.1
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine star(a,b,c,n,ioff_ptr)
|
||||
implicit none
|
||||
double precision :: a(*),b(*),c(*)
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: SIMD.2c
|
||||
* @@name: SIMD.2
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: link
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
#include <stdio.h>
|
||||
|
@ -1,8 +1,8 @@
|
||||
! @@name: SIMD.2f
|
||||
! @@name: SIMD.2
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: link
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
program main
|
||||
implicit none
|
||||
integer, parameter :: N=32
|
||||
@ -19,15 +19,15 @@ program main
|
||||
end program
|
||||
|
||||
function add1(a,b,fact) result(c)
|
||||
!$omp declare simd(add1) uniform(fact)
|
||||
implicit none
|
||||
!$omp declare simd(add1) uniform(fact)
|
||||
double precision :: a,b,fact, c
|
||||
c = a + b + fact
|
||||
end function
|
||||
|
||||
function add2(a,b,i, fact) result(c)
|
||||
!$omp declare simd(add2) uniform(a,b,fact) linear(i:1)
|
||||
implicit none
|
||||
!$omp declare simd(add2) uniform(a,b,fact) linear(i:1)
|
||||
integer :: i
|
||||
double precision :: a(*),b(*),fact, c
|
||||
c = a(i) + b(i) + fact
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: SIMD.3c
|
||||
* @@name: SIMD.3
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
double work( double *a, double *b, int n )
|
||||
{
|
@ -1,8 +1,8 @@
|
||||
! @@name: SIMD.3f
|
||||
! @@name: SIMD.3
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine work( a, b, n, sum )
|
||||
implicit none
|
||||
integer :: i, n
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: SIMD.4c
|
||||
* @@name: SIMD.4
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
void work( float *b, int n, int m )
|
||||
{
|
@ -1,8 +1,8 @@
|
||||
! @@name: SIMD.4f
|
||||
! @@name: SIMD.4
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine work( b, n, m )
|
||||
implicit none
|
||||
real :: b(n)
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: SIMD.5c
|
||||
* @@name: SIMD.5
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
void work( double **a, double **b, double **c, int n )
|
||||
{
|
@ -1,8 +1,8 @@
|
||||
! @@name: SIMD.5f
|
||||
! @@name: SIMD.5
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine work( a, b, c, n )
|
||||
implicit none
|
||||
integer :: i,j,n
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: SIMD.6c
|
||||
* @@name: SIMD.6
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
#pragma omp declare simd linear(p:1) notinbranch
|
||||
int foo(int *p){
|
@ -1,11 +1,11 @@
|
||||
! @@name: SIMD.6f
|
||||
! @@name: SIMD.6
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
function foo(p) result(r)
|
||||
!$omp declare simd(foo) notinbranch
|
||||
implicit none
|
||||
!$omp declare simd(foo) notinbranch
|
||||
integer :: p, r
|
||||
p = p + 10
|
||||
r = p
|
||||
@ -26,8 +26,8 @@ function myaddint(a, b, n) result(r)
|
||||
end function myaddint
|
||||
|
||||
function goo(p) result(r)
|
||||
!$omp declare simd(goo) inbranch
|
||||
implicit none
|
||||
!$omp declare simd(goo) inbranch
|
||||
real :: p, r
|
||||
p = p + 18.5
|
||||
r = p
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: SIMD.7c
|
||||
* @@name: SIMD.7
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@ -32,6 +32,6 @@ int main(void)
|
||||
for (i=0; i < N; i++) {
|
||||
a[i] = fib(b[i]);
|
||||
}
|
||||
printf("Done a[%d] = %d\n", N-1, a[N-1]);
|
||||
printf("Done a[%d] = %d\n", N-1, a[N-1]); //Done a[44] = 701408733
|
||||
return 0;
|
||||
}
|
@ -1,8 +1,8 @@
|
||||
! @@name: SIMD.7f
|
||||
! @@name: SIMD.7
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
program fibonacci
|
||||
implicit none
|
||||
integer,parameter :: N=45
|
||||
@ -25,8 +25,8 @@ program fibonacci
|
||||
end program
|
||||
|
||||
recursive function fib(n) result(r)
|
||||
!$omp declare simd(fib) inbranch
|
||||
implicit none
|
||||
!$omp declare simd(fib) inbranch
|
||||
integer :: n, r
|
||||
|
||||
if (n <= 1) then
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: SIMD.8c
|
||||
* @@name: SIMD.8
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user