mirror of
https://github.com/OpenMP/Examples.git
synced 2025-04-11 00:42:12 +01:00
Compare commits
8 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
415024c369 | ||
![]() |
00bdf88b63 | ||
![]() |
3346a30ce2 | ||
![]() |
11f2efcccf | ||
![]() |
075683d574 | ||
![]() |
08859e6029 | ||
![]() |
03b9a00df9 | ||
![]() |
a5e3d8b3f2 |
@ -8,34 +8,34 @@ The addition of two vectors to form a third vector is a SIMD operation.
|
||||
Many processors have SIMD (vector) units that can perform simultaneously
|
||||
2, 4, 8 or more executions of the same operation (by a single SIMD unit).
|
||||
|
||||
Loops without loop-carried backward dependency (or with dependency preserved using
|
||||
ordered simd) are candidates for vectorization by the compiler for
|
||||
Loops without loop-carried backward dependences (or with dependences preserved using
|
||||
\kcode{ordered simd}) are candidates for vectorization by the compiler for
|
||||
execution with SIMD units. In addition, with state-of-the-art vectorization
|
||||
technology and \code{declare simd} directive extensions for function vectorization
|
||||
technology and \kcode{declare simd} directive extensions for function vectorization
|
||||
in the OpenMP 4.5 specification, loops with function calls can be vectorized as well.
|
||||
The basic idea is that a scalar function call in a loop can be replaced by a vector version
|
||||
of the function, and the loop can be vectorized simultaneously by combining a loop
|
||||
vectorization (\code{simd} directive on the loop) and a function
|
||||
vectorization (\code{declare simd} directive on the function).
|
||||
vectorization (\kcode{simd} directive on the loop) and a function
|
||||
vectorization (\kcode{declare simd} directive on the function).
|
||||
|
||||
A \code{simd} construct states that SIMD operations be performed on the
|
||||
A \kcode{simd} construct states that SIMD operations be performed on the
|
||||
data within the loop. A number of clauses are available to provide
|
||||
data-sharing attributes (\code{private}, \code{linear}, \code{reduction} and
|
||||
\code{lastprivate}). Other clauses provide vector length preference/restrictions
|
||||
(\code{simdlen} / \code{safelen}), loop fusion (\code{collapse}), and data
|
||||
alignment (\code{aligned}).
|
||||
data-sharing attributes (\kcode{private}, \kcode{linear}, \kcode{reduction} and
|
||||
\kcode{lastprivate}). Other clauses provide vector length preference/restrictions
|
||||
(\kcode{simdlen} / \kcode{safelen}), loop fusion (\kcode{collapse}), and data
|
||||
alignment (\kcode{aligned}).
|
||||
|
||||
The \code{declare simd} directive designates
|
||||
The \kcode{declare simd} directive designates
|
||||
that a vector version of the function should also be constructed for
|
||||
execution within loops that contain the function and have a \code{simd}
|
||||
directive. Clauses provide argument specifications (\code{linear},
|
||||
\code{uniform}, and \code{aligned}), a requested vector length
|
||||
(\code{simdlen}), and designate whether the function is always/never
|
||||
called conditionally in a loop (\code{branch}/\code{inbranch}).
|
||||
execution within loops that contain the function and have a \kcode{simd}
|
||||
directive. Clauses provide argument specifications (\kcode{linear},
|
||||
\kcode{uniform}, and \kcode{aligned}), a requested vector length
|
||||
(\kcode{simdlen}), and designate whether the function is always/never
|
||||
called conditionally in a loop (\kcode{inbranch}/\kcode{notinbranch}).
|
||||
The latter is for optimizing performance.
|
||||
|
||||
Also, the \code{simd} construct has been combined with the worksharing loop
|
||||
constructs (\code{for simd} and \code{do simd}) to enable simultaneous thread
|
||||
Also, the \kcode{simd} construct has been combined with the worksharing loop
|
||||
constructs (\kcode{for simd} and \kcode{do simd}) to enable simultaneous thread
|
||||
execution in different SIMD units.
|
||||
%Hence, the \code{simd} construct can be
|
||||
%used alone on a loop to direct vectorization (SIMD execution), or in
|
||||
|
@ -1,8 +1,8 @@
|
||||
\cchapter{OpenMP Affinity}{affinity}
|
||||
\label{chap:openmp_affinity}
|
||||
|
||||
OpenMP Affinity consists of a \code{proc\_bind} policy (thread affinity policy) and a specification of
|
||||
places (\texttt{"}location units\texttt{"} or \plc{processors} that may be cores, hardware
|
||||
OpenMP Affinity consists of a \kcode{proc_bind} policy (thread affinity policy) and a specification of
|
||||
places (``location units'' or \plc{processors} that may be cores, hardware
|
||||
threads, sockets, etc.).
|
||||
OpenMP Affinity enables users to bind computations to specific places.
|
||||
The placement will hold for the duration of the parallel region.
|
||||
@ -11,13 +11,13 @@ to different cores (hardware threads, sockets, etc.) prescribed within a given p
|
||||
if two or more cores (hardware threads, sockets, etc.) have been assigned to a given place.
|
||||
|
||||
Often the binding can be managed without resorting to explicitly setting places.
|
||||
Without the specification of places in the \code{OMP\_PLACES} variable,
|
||||
Without the specification of places in the \kcode{OMP_PLACES} variable,
|
||||
the OpenMP runtime will distribute and bind threads using the entire range of processors for
|
||||
the OpenMP program, according to the \code{OMP\_PROC\_BIND} environment variable
|
||||
or the \code{proc\_bind} clause. When places are specified, the OMP runtime
|
||||
the OpenMP program, according to the \kcode{OMP_PROC_BIND} environment variable
|
||||
or the \kcode{proc_bind} clause. When places are specified, the OMP runtime
|
||||
binds threads to the places according to a default distribution policy, or
|
||||
those specified in the \code{OMP\_PROC\_BIND} environment variable or the
|
||||
\code{proc\_bind} clause.
|
||||
those specified in the \kcode{OMP_PROC_BIND} environment variable or the
|
||||
\kcode{proc_bind} clause.
|
||||
|
||||
In the OpenMP Specifications document a processor refers to an execution unit that
|
||||
is enabled for an OpenMP thread to use. A processor is a core when there is
|
||||
@ -26,12 +26,12 @@ SMT is enabled, a processor is a hardware thread (HW-thread). (This is the
|
||||
usual case; but actually, the execution unit is implementation defined.) Processor
|
||||
numbers are numbered sequentially from 0 to the number of cores less one (without SMT), or
|
||||
0 to the number of HW-threads less one (with SMT). OpenMP places use the processor number to designate
|
||||
binding locations (unless an \texttt{"}abstract name\texttt{"} is used.)
|
||||
binding locations (unless an ``abstract name'' is used).
|
||||
|
||||
|
||||
The processors available to a process may be a subset of the system's
|
||||
processors. This restriction may be the result of a
|
||||
wrapper process controlling the execution (such as \code{numactl} on Linux systems),
|
||||
wrapper process controlling the execution (such as \plc{numactl} on Linux systems),
|
||||
compiler options, library-specific environment variables, or default
|
||||
kernel settings. For instance, the execution of multiple MPI processes,
|
||||
launched on a single compute node, will each have a subset of processors as
|
||||
@ -53,20 +53,20 @@ variables for the MPI library. %Forked threads within an MPI process
|
||||
|
||||
Threads of a team are positioned onto places in a compact manner, a
|
||||
scattered distribution, or onto the primary thread's place, by setting the
|
||||
\code{OMP\_PROC\_BIND} environment variable or the \code{proc\_bind} clause to
|
||||
\code{close}, \code{spread}, or \code{primary} (\code{master} has been deprecated), respectively. When
|
||||
\code{OMP\_PROC\_BIND} is set to FALSE no binding is enforced; and
|
||||
\kcode{OMP_PROC_BIND} environment variable or the \kcode{proc_bind} clause to
|
||||
\kcode{close}, \kcode{spread}, or \kcode{primary} (\kcode{master} has been deprecated), respectively. When
|
||||
\kcode{OMP_PROC_BIND} is set to FALSE no binding is enforced; and
|
||||
when the value is TRUE, the binding is implementation defined to
|
||||
a set of places in the \code{OMP\_PLACES} variable or to places
|
||||
defined by the implementation if the \code{OMP\_PLACES} variable
|
||||
a set of places in the \kcode{OMP_PLACES} variable or to places
|
||||
defined by the implementation if the \kcode{OMP_PLACES} variable
|
||||
is not set.
|
||||
|
||||
The \code{OMP\_PLACES} variable can also be set to an abstract name
|
||||
(\code{threads}, \code{cores}, \code{sockets}) to specify that a place is
|
||||
The \kcode{OMP_PLACES} variable can also be set to an abstract name
|
||||
(\kcode{threads}, \kcode{cores}, \kcode{sockets}) to specify that a place is
|
||||
either a single hardware thread, a core, or a socket, respectively.
|
||||
This description of the \code{OMP\_PLACES} is most useful when the
|
||||
This description of the \kcode{OMP_PLACES} is most useful when the
|
||||
number of threads is equal to the number of hardware threads, cores
|
||||
or sockets. It can also be used with a \code{close} or \code{spread}
|
||||
or sockets. It can also be used with a \kcode{close} or \kcode{spread}
|
||||
distribution policy when the equality doesn't hold.
|
||||
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
\cchapter{Data Environment}{data_environment}
|
||||
\label{chap:data_environment}
|
||||
The OpenMP \plc{data environment} contains data attributes of variables and
|
||||
objects. Many constructs (such as \code{parallel}, \code{simd}, \code{task})
|
||||
objects. Many constructs (such as \kcode{parallel}, \kcode{simd}, \kcode{task})
|
||||
accept clauses to control \plc{data-sharing} attributes
|
||||
of referenced variables in the construct, where \plc{data-sharing} applies to
|
||||
whether the attribute of the variable is \plc{shared},
|
||||
is \plc{private} storage, or has special operational characteristics
|
||||
(as found in the \code{firstprivate}, \code{lastprivate}, \code{linear}, or \code{reduction} clause).
|
||||
(as found in the \kcode{firstprivate}, \kcode{lastprivate}, \kcode{linear}, or \kcode{reduction} clause).
|
||||
|
||||
The data environment for a device (distinguished as a \plc{device data environment})
|
||||
is controlled on the host by \plc{data-mapping} attributes, which determine the
|
||||
@ -21,15 +21,15 @@ Data-sharing attributes of variables can be classified as being \plc{predetermin
|
||||
|
||||
Certain variables and objects have predetermined attributes.
|
||||
A commonly found case is the loop iteration variable in associated loops
|
||||
of a \code{for} or \code{do} construct. It has a private data-sharing attribute.
|
||||
Variables with predetermined data-sharing attributes can not be listed in a data-sharing clause; but there are some
|
||||
of a \kcode{for} or \kcode{do} construct. It has a private data-sharing attribute.
|
||||
Variables with predetermined data-sharing attributes cannot be listed in a data-sharing clause; but there are some
|
||||
exceptions (mainly concerning loop iteration variables).
|
||||
|
||||
Variables with explicitly determined data-sharing attributes are those that are
|
||||
referenced in a given construct and are listed in a data-sharing attribute
|
||||
clause on the construct. Some of the common data-sharing clauses are:
|
||||
\code{shared}, \code{private}, \code{firstprivate}, \code{lastprivate},
|
||||
\code{linear}, and \code{reduction}. % Are these all of them?
|
||||
\kcode{shared}, \kcode{private}, \kcode{firstprivate}, \kcode{lastprivate},
|
||||
\kcode{linear}, and \kcode{reduction}. % Are these all of them?
|
||||
|
||||
Variables with implicitly determined data-sharing attributes are those
|
||||
that are referenced in a given construct, do not have predetermined
|
||||
@ -37,41 +37,41 @@ data-sharing attributes, and are not listed in a data-sharing
|
||||
attribute clause of an enclosing construct.
|
||||
For a complete list of variables and objects with predetermined and
|
||||
implicitly determined attributes, please refer to the
|
||||
\plc{Data-sharing Attribute Rules for Variables Referenced in a Construct}
|
||||
\docref{Data-sharing Attribute Rules for Variables Referenced in a Construct}
|
||||
subsection of the OpenMP Specifications document.
|
||||
|
||||
\bigskip
|
||||
DATA-MAPPING ATTRIBUTES
|
||||
|
||||
The \code{map} clause on a device construct explicitly specifies how the list items in
|
||||
The \kcode{map} clause on a device construct explicitly specifies how the list items in
|
||||
the clause are mapped from the encountering task's data environment (on the host)
|
||||
to the corresponding item in the device data environment (on the device).
|
||||
The common \plc{list items} are arrays, array sections, scalars, pointers, and
|
||||
structure elements (members).
|
||||
|
||||
Procedures and global variables have predetermined data mapping if they appear
|
||||
within the list or block of a \code{declare target} directive. Also, a C/C++ pointer
|
||||
within the list or block of a \kcode{declare target} directive. Also, a C/C++ pointer
|
||||
is mapped as a zero-length array section, as is a C++ variable that is a reference to a pointer.
|
||||
% Waiting for response from Eric on this.
|
||||
|
||||
Without explicit mapping, non-scalar and non-pointer variables within the scope of the \code{target}
|
||||
construct are implicitly mapped with a \plc{map-type} of \code{tofrom}.
|
||||
Without explicit mapping, scalar variables within the scope of the \code{target}
|
||||
Without explicit mapping, non-scalar and non-pointer variables within the scope of the \kcode{target}
|
||||
construct are implicitly mapped with a \plc{map-type} of \kcode{tofrom}.
|
||||
Without explicit mapping, scalar variables within the scope of the \kcode{target}
|
||||
construct are not mapped, but have an implicit firstprivate data-sharing
|
||||
attribute. (That is, the value of the original variable is given to a private
|
||||
variable of the same name on the device.) This behavior can be changed with
|
||||
the \code{defaultmap} clause.
|
||||
the \kcode{defaultmap} clause.
|
||||
|
||||
The \code{map} clause can appear on \code{target}, \code{target data} and
|
||||
\code{target enter/exit data} constructs. The operations of creation and
|
||||
The \kcode{map} clause can appear on \kcode{target}, \kcode{target data} and
|
||||
\kcode{target enter/exit data} constructs. The operations of creation and
|
||||
removal of device storage as well as assignment of the original list item
|
||||
values to the corresponding list items may be complicated when the list
|
||||
item appears on multiple constructs or when the host and device storage
|
||||
is shared. In these cases the item's reference count, the number of times
|
||||
it has been referenced (+1 on entry and -1 on exited) in nested (structured)
|
||||
it has been referenced (incremented by 1 on entry and decremented by 1 on exit) in nested (structured)
|
||||
map regions and/or accumulative (unstructured) mappings, determines the operation.
|
||||
Details of the \code{map} clause and reference count operation are specified
|
||||
in the \plc{map Clause} subsection of the OpenMP Specifications document.
|
||||
Details of the \kcode{map} clause and reference count operation are specified
|
||||
in the \docref{\kcode{map} Clause} subsection of the OpenMP Specifications document.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
@ -81,10 +81,12 @@ in the \plc{map Clause} subsection of the OpenMP Specifications document.
|
||||
\input{data_environment/fort_loopvar}
|
||||
\input{data_environment/fort_sp_common}
|
||||
\input{data_environment/fort_sa_private}
|
||||
\input{data_environment/fort_shared_var}
|
||||
\input{data_environment/carrays_fpriv}
|
||||
\input{data_environment/lastprivate}
|
||||
\input{data_environment/reduction}
|
||||
\input{data_environment/udr}
|
||||
\input{data_environment/induction}
|
||||
\input{data_environment/scan}
|
||||
\input{data_environment/copyin}
|
||||
\input{data_environment/copyprivate}
|
||||
|
@ -1,9 +1,9 @@
|
||||
\cchapter{Devices}{devices}
|
||||
\label{chap:devices}
|
||||
|
||||
The \code{target} construct consists of a \code{target} directive
|
||||
and an execution region. The \code{target} region is executed on
|
||||
the default device or the device specified in the \code{device}
|
||||
The \kcode{target} construct consists of a \kcode{target} directive
|
||||
and an execution region. The \kcode{target} region is executed on
|
||||
the default device or the device specified in the \kcode{device}
|
||||
clause.
|
||||
|
||||
In OpenMP version 4.0, by default, all variables within the lexical
|
||||
@ -15,40 +15,40 @@ data to the device storage.
|
||||
|
||||
The constructs that explicitly
|
||||
create storage, transfer data, and free storage on the device
|
||||
are catagorized as structured and unstructured. The
|
||||
\code{target} \code{data} construct is structured. It creates
|
||||
a data region around \code{target} constructs, and is
|
||||
are categorized as structured and unstructured. The
|
||||
\kcode{target data} construct is structured. It creates
|
||||
a data region around \kcode{target} constructs, and is
|
||||
convenient for providing persistent data throughout multiple
|
||||
\code{target} regions. The \code{target} \code{enter} \code{data} and
|
||||
\code{target} \code{exit} \code{data} constructs are unstructured, because
|
||||
they can occur anywhere and do not support a "structure"
|
||||
(a region) for enclosing \code{target} constructs, as does the
|
||||
\code{target} \code{data} construct.
|
||||
\kcode{target} regions. The \kcode{target enter data} and
|
||||
\kcode{target exit data} constructs are unstructured, because
|
||||
they can occur anywhere and do not support a ``structure''
|
||||
(a region) for enclosing \kcode{target} constructs, as does the
|
||||
\kcode{target data} construct.
|
||||
|
||||
The \code{map} clause is used on \code{target}
|
||||
The \kcode{map} clause is used on \kcode{target}
|
||||
constructs and the data-type constructs to map host data. It
|
||||
specifies the device storage and data movement \code{to} and \code{from}
|
||||
specifies the device storage and data movement \plc{to} and \plc{from}
|
||||
the device, and controls the storage duration.
|
||||
|
||||
There is an important change in the OpenMP 4.5 specification
|
||||
that alters the data model for scalar variables and C/C++ pointer variables.
|
||||
The default behavior for scalar variables and C/C++ pointer variables
|
||||
in an 4.5 compliant code is \code{firstprivate}. Example
|
||||
in a 4.5 compliant code is \kcode{firstprivate}. Example
|
||||
codes that have been updated to reflect this new behavior are
|
||||
annotated with a description that describes changes required
|
||||
for correct execution. Often it is a simple matter of mapping
|
||||
the variable as \code{tofrom} to obtain the intended 4.0 behavior.
|
||||
the variable as \kcode{tofrom} to obtain the intended 4.0 behavior.
|
||||
|
||||
In OpenMP version 4.5 the mechanism for target
|
||||
execution is specified as occuring through a \plc{target task}.
|
||||
When the \code{target} construct is encountered a new
|
||||
\plc{target task} is generated. The \plc{target task}
|
||||
completes after the \code{target} region has executed and all data
|
||||
execution is specified as occurring through a \plc{target task}.
|
||||
When the \kcode{target} construct is encountered a new
|
||||
target task is generated. The target task
|
||||
completes after the \kcode{target} region has executed and all data
|
||||
transfers have finished.
|
||||
|
||||
This new specification does not affect the execution of
|
||||
pre-4.5 code; it is a necessary element for asynchronous
|
||||
execution of the \code{target} region when using the new \code{nowait}
|
||||
execution of the \kcode{target} region when using the new \kcode{nowait}
|
||||
clause introduced in OpenMP 4.5.
|
||||
|
||||
|
||||
@ -59,17 +59,21 @@ clause introduced in OpenMP 4.5.
|
||||
\input{devices/target_structure_mapping}
|
||||
\input{devices/target_fort_allocatable_array_mapping}
|
||||
\input{devices/array_sections}
|
||||
\input{devices/usm}
|
||||
\input{devices/C++_virtual_functions}
|
||||
\input{devices/array_shaping}
|
||||
\input{devices/target_mapper}
|
||||
\input{devices/target_data}
|
||||
\input{devices/target_unstructured_data}
|
||||
\input{devices/target_update}
|
||||
\input{devices/target_associate_ptr}
|
||||
\input{devices/declare_target}
|
||||
\input{devices/lambda_expressions}
|
||||
\input{devices/teams}
|
||||
\input{devices/async_target_depend}
|
||||
\input{devices/async_target_with_tasks}
|
||||
\input{devices/async_target_nowait}
|
||||
\input{devices/async_target_nowait_depend}
|
||||
\input{devices/async_target_nowait_arg}
|
||||
\input{devices/device}
|
||||
\input{devices/device_env_traits}
|
||||
|
||||
|
@ -1,11 +1,12 @@
|
||||
\cchapter{OpenMP Directive Syntax}{directives}
|
||||
\label{chap:directive_syntax}
|
||||
\index{directive syntax}
|
||||
|
||||
OpenMP \emph{directives} use base-language mechanisms to specify OpenMP program behavior.
|
||||
In C code, the directives are formed exclusively with pragmas, whereas in C++
|
||||
code, directives are formed from either pragmas or attributes.
|
||||
OpenMP \plc{directives} use base-language mechanisms to specify OpenMP program behavior.
|
||||
In C/C++ code, the directives are formed with
|
||||
either pragmas or attributes.
|
||||
Fortran directives are formed with comments in free form and fixed form sources (codes).
|
||||
All of these mechanism allow the compilation to ignore the OpenMP directives if
|
||||
All of these mechanisms allow the compilation to ignore the OpenMP directives if
|
||||
OpenMP is not supported or enabled.
|
||||
|
||||
|
||||
@ -19,21 +20,46 @@ The formats for combining a base-language mechanism and a \plc{directive-specifi
|
||||
|
||||
C/C++ pragmas
|
||||
\begin{indentedcodelist}
|
||||
\code{\#pragma omp} \plc{directive-specification}
|
||||
#pragma omp \plc{directive-specification}
|
||||
\end{indentedcodelist}
|
||||
|
||||
C++ attributes
|
||||
C/C++ attribute specifiers
|
||||
\begin{indentedcodelist}
|
||||
\code{[[omp :: directive(} \plc{directive-specification} \code{)]]}
|
||||
\code{[[using omp : directive(} \plc{directive-specification} \code{)]]}
|
||||
[[omp :: directive( \plc{directive-specification} )]]
|
||||
[[omp :: decl( \plc{directive-specification} )]]
|
||||
\end{indentedcodelist}
|
||||
|
||||
C++ attribute specifiers
|
||||
\begin{indentedcodelist}
|
||||
[[using omp : directive( \plc{directive-specification} )]]
|
||||
[[using omp : decl( \plc{directive-specification} )]]
|
||||
\end{indentedcodelist}
|
||||
|
||||
where the \kcode{decl} attribute may alternatively be used for declarative
|
||||
directives.
|
||||
|
||||
Fortran comments
|
||||
\begin{indentedcodelist}
|
||||
\code{!\$omp} \plc{directive-specification}
|
||||
!$omp \plc{directive-specification}
|
||||
\end{indentedcodelist}
|
||||
|
||||
where \code{c\$omp} and \code{*\$omp} may be used in Fortran fixed form sources.
|
||||
where \scode{c$omp} and \scode{*$omp} may be used in Fortran fixed form sources.
|
||||
|
||||
Most OpenMP directives accept clauses that alter the semantics of the directive in some way,
|
||||
and some directives also accept parenthesized arguments that follow the directive name.
|
||||
A clause may just be a keyword (e.g., \kcode{untied}) or it may also accept argument lists
|
||||
(e.g., \kcode{shared(\ucode{x,y,z})}) and/or optional modifiers (e.g., \kcode{tofrom} in
|
||||
\kcode{map(tofrom: \ucode{x,y,z})}).
|
||||
Clause modifiers may be ``simple'' or ``complex'' -- a complex modifier consists of a
|
||||
keyword followed by one or more parameters, bracketed by parentheses, while a simple
|
||||
modifier does not. An example of a complex modifier is the \kcode{iterator} modifier,
|
||||
as in \kcode{map(iterator(\ucode{i=0:n}), tofrom: \ucode{p[i]})}, or the \kcode{step} modifier, as in
|
||||
\kcode{linear(\ucode{x}: ref, step(\ucode{4}))}.
|
||||
In the preceding examples, \kcode{tofrom} and \kcode{ref} are simple modifiers.
|
||||
|
||||
For Fortran, a declarative directive (such as \kcode{declare reduction})
|
||||
must appear after any \bcode{USE}, \bcode{IMPORT}, and \bcode{IMPLICIT} statements
|
||||
in the specification part.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
|
@ -32,7 +32,7 @@
|
||||
% This is a \plc{var-name}.
|
||||
%
|
||||
|
||||
\bchapter{Introduction}
|
||||
\cchapter{Introduction}{introduction}
|
||||
\label{chap:introduction}
|
||||
|
||||
This collection of programming examples supplements the OpenMP API for Shared
|
||||
@ -46,26 +46,28 @@ numerous vendors support the OpenMP API.
|
||||
|
||||
The directives, library routines, and environment variables demonstrated in this
|
||||
document allow users to create and manage parallel programs while permitting
|
||||
portability. The directives extend the C, C++ and Fortran base languages with single
|
||||
program multiple data (SPMD) constructs, tasking constructs, device constructs,
|
||||
worksharing constructs, and synchronization constructs, and they provide support for
|
||||
portability. The directives extend the C, C++ and Fortran base languages with \plc{single
|
||||
program multiple data} (SPMD) constructs, \plc{tasking} constructs, \plc{device} constructs,
|
||||
\plc{worksharing} constructs, and \plc{synchronization} constructs, and they provide support for
|
||||
sharing and privatizing data. The functionality to control the runtime environment is
|
||||
provided by library routines and environment variables. Compilers that support the
|
||||
OpenMP API often include a command line option to the compiler that activates and
|
||||
allows interpretation of all OpenMP directives.
|
||||
|
||||
The latest source codes for OpenMP Examples can be downloaded from the \code{sources}
|
||||
directory at
|
||||
\href{https://github.com/OpenMP/Examples}{https://github.com/OpenMP/Examples}.
|
||||
The codes for this OpenMP \VER{} Examples document have the tag \plc{v\VER}.
|
||||
|
||||
%\href{https://github.com/OpenMP/Examples/tree/main/sources}{https://github.com/OpenMP/Examples/sources}.
|
||||
The documents and source codes for OpenMP Examples can be downloaded from
|
||||
\href{\examplesrepo}{\examplesrepo}.
|
||||
Each directory holds the contents of a chapter and has a \plc{sources} subdirectory of its codes.
|
||||
This OpenMP Examples \VER{} document and its codes are tagged as
|
||||
\examplestree{\VER}{\plc{v\VER}}.
|
||||
|
||||
Complete information about the OpenMP API and a list of the compilers that support
|
||||
the OpenMP API can be found at the OpenMP.org web site
|
||||
|
||||
\code{http://www.openmp.org}
|
||||
\scode{https://www.openmp.org}
|
||||
|
||||
\clearpage
|
||||
|
||||
\input{introduction/Examples}
|
||||
|
||||
% This is the end of introduction.tex of the OpenMP Examples document.
|
||||
|
@ -12,7 +12,7 @@ for transformation, rather than applying more time-consuming general compiler
|
||||
heuristics methods with compiler options that may not be able to discover
|
||||
optimal transformations.
|
||||
|
||||
Loop transformations can be augmented by preprocessor support or OpenMP \code{metadirective}
|
||||
Loop transformations can be augmented by preprocessor support or OpenMP \kcode{metadirective}
|
||||
directives, to select optimal dimension and size parameters for specific platforms,
|
||||
facilitating a single code base for multiple platforms.
|
||||
Moreover, directive-based transformations make experimenting easier:
|
||||
@ -21,5 +21,7 @@ whereby specific hot spots can be affected by transformation directives.
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{loop_transformations/tile}
|
||||
\input{loop_transformations/partial_tile}
|
||||
\input{loop_transformations/unroll}
|
||||
\input{loop_transformations/apply}
|
||||
|
||||
|
@ -4,10 +4,10 @@
|
||||
OpenMP provides a shared-memory model that allows all threads on a given
|
||||
device shared access to \emph{memory}. For a given OpenMP region that may be
|
||||
executed by more than one thread or SIMD lane, variables in memory may be
|
||||
\emph{shared} or \emph{private} with respect to those threads or SIMD lanes. A
|
||||
\plc{shared} or \plc{private} with respect to those threads or SIMD lanes. A
|
||||
variable's data-sharing attribute indicates whether it is shared (the
|
||||
\emph{shared} attribute) or private (the \emph{private}, \emph{firstprivate},
|
||||
\emph{lastprivate}, \emph{linear}, and \emph{reduction} attributes) in the data
|
||||
\plc{shared} attribute) or private (the \plc{private}, \plc{firstprivate},
|
||||
\plc{lastprivate}, \plc{linear}, and \plc{reduction} attributes) in the data
|
||||
environment of an OpenMP region. While private variables in an OpenMP region
|
||||
are new copies of the original variable (with same name) that may then be
|
||||
concurrently accessed or modified by their respective threads or SIMD lanes, a
|
||||
@ -21,27 +21,27 @@ a given variable in their respective temporary views. Threads may employ flush
|
||||
operations for the purposes of making their temporary view of a variable
|
||||
consistent with the value of the variable in memory. The effect of a given
|
||||
flush operation is characterized by its flush properties -- some combination of
|
||||
\emph{strong}, \emph{release}, and \emph{acquire} -- and, for \emph{strong}
|
||||
flushes, a \emph{flush-set}.
|
||||
\plc{strong}, \plc{release}, and \plc{acquire} -- and, for \plc{strong}
|
||||
flushes, a \plc{flush-set}.
|
||||
|
||||
A \emph{strong} flush will force consistency between the temporary view and the
|
||||
memory for all variables in its \emph{flush-set}. Furthermore all strong flushes in a
|
||||
A \plc{strong} flush will force consistency between the temporary view and the
|
||||
memory for all variables in its \plc{flush-set}. Furthermore, all strong flushes in a
|
||||
program that have intersecting flush-sets will execute in some total order, and
|
||||
within a thread strong flushes may not be reordered with respect to other
|
||||
memory operations on variables in its flush-set. \emph{Release} and
|
||||
\emph{acquire} flushes operate in pairs. A release flush may ``synchronize''
|
||||
memory operations on variables in its flush-set. \plc{Release} and
|
||||
\plc{acquire} flushes operate in pairs. A release flush may ``synchronize''
|
||||
with an acquire flush, and when it does so the local memory operations that
|
||||
precede the release flush will appear to have been completed before the local
|
||||
memory operations on the same variables that follow the acquire flush.
|
||||
|
||||
Flush operations arise from explicit \code{flush} directives, implicit
|
||||
\code{flush} directives, and also from the execution of \code{atomic}
|
||||
constructs. The \code{flush} directive forces a consistent view of local
|
||||
variables of the thread executing the \code{flush}. When a list is supplied on
|
||||
Flush operations arise from explicit \kcode{flush} directives, implicit
|
||||
\kcode{flush} directives, and also from the execution of \kcode{atomic}
|
||||
constructs. The \kcode{flush} directive forces a consistent view of local
|
||||
variables of the thread executing the \kcode{flush}. When a list is supplied on
|
||||
the directive, only the items (variables) in the list are guaranteed to be
|
||||
flushed. Implied flushes exist at prescribed locations of certain constructs.
|
||||
For the complete list of these locations and associated constructs, please
|
||||
refer to the \plc{flush Construct} section of the OpenMP Specifications
|
||||
refer to the \docref{\kcode{flush} Construct} section of the OpenMP Specifications
|
||||
document.
|
||||
|
||||
In this chapter, examples illustrate how race conditions may arise for accesses
|
||||
@ -53,7 +53,7 @@ do not have a well-defined \emph{completion order}. The existence of data
|
||||
races in OpenMP programs results in undefined behavior, and so they should
|
||||
generally be avoided for programs to be correct. The completion order of
|
||||
accesses to a shared variable is guaranteed in OpenMP through a set of memory
|
||||
consistency rules that are described in the \plc{OpenMP Memory Consitency}
|
||||
consistency rules that are described in the \docref{OpenMP Memory Consistency}
|
||||
section of the OpenMP Specifications document.
|
||||
|
||||
%This chapter also includes examples that exhibit non-sequentially consistent
|
||||
|
@ -5,39 +5,39 @@ A single thread, the \plc{initial thread}, begins sequential execution of
|
||||
an OpenMP enabled program, as if the whole program is in an implicit parallel
|
||||
region consisting of an implicit task executed by the \plc{initial thread}.
|
||||
|
||||
A \code{parallel} construct encloses code,
|
||||
forming a parallel region. An \plc{initial thread} encountering a \code{parallel}
|
||||
A \kcode{parallel} construct encloses code,
|
||||
forming a parallel region. An \plc{initial thread} encountering a \kcode{parallel}
|
||||
region forks (creates) a team of threads at the beginning of the
|
||||
\code{parallel} region, and joins them (removes from execution) at the
|
||||
\kcode{parallel} region, and joins them (removes from execution) at the
|
||||
end of the region. The initial thread becomes the primary thread of the team in a
|
||||
\code{parallel} region with a \plc{thread} number equal to zero, the other
|
||||
\kcode{parallel} region with a \plc{thread} number equal to zero, the other
|
||||
threads are numbered from 1 to the number of threads minus 1.
|
||||
A team may be comprised of just a single thread.
|
||||
|
||||
Each thread of a team is assigned an implicit task consisting of code within the
|
||||
parallel region. The task that creates a parallel region is suspended while the
|
||||
Each \plc{thread} of a team is assigned an implicit task consisting of code within the
|
||||
\kcode{parallel} region. The task that creates a \kcode{parallel} region is suspended while the
|
||||
tasks of the team are executed. A thread is tied to its task; that is,
|
||||
only the thread assigned to the task can execute that task. After completion
|
||||
of the \code{parallel} region, the primary thread resumes execution of the generating task.
|
||||
of the \kcode{parallel} region, the primary thread resumes execution of the generating task.
|
||||
|
||||
%After the \code{parallel} region the primary thread becomes the initial
|
||||
%thread again, and continues to execute the \plc{sequential part}.
|
||||
|
||||
Any task within a \code{parallel} region is allowed to encounter another
|
||||
\code{parallel} region to form a nested \code{parallel} region. The
|
||||
parallelism of a nested \code{parallel} region (whether it forks additional
|
||||
Any task within a \kcode{parallel} region is allowed to encounter another
|
||||
\kcode{parallel} region to form a nested \kcode{parallel} region. The
|
||||
parallelism of a nested \kcode{parallel} region (whether it forks additional
|
||||
threads, or is executed serially by the encountering task) can be controlled by the
|
||||
\code{OMP\_NESTED} environment variable or the \code{omp\_set\_nested()}
|
||||
\kcode{OMP_NESTED} environment variable or the \kcode{omp_set_nested()}
|
||||
API routine with arguments indicating true or false.
|
||||
|
||||
The number of threads of a \code{parallel} region can be set by the \code{OMP\_NUM\_THREADS}
|
||||
environment variable, the \code{omp\_set\_num\_threads()} routine, or on the \code{parallel}
|
||||
directive with the \code{num\_threads}
|
||||
The number of threads of a \kcode{parallel} region can be set by the \kcode{OMP_NUM_THREADS}
|
||||
environment variable, the \kcode{omp_set_num_threads()} routine, or on the \kcode{parallel}
|
||||
directive with the \kcode{num_threads}
|
||||
clause. The routine overrides the environment variable, and the clause overrides all.
|
||||
Use the \code{OMP\_DYNAMIC}
|
||||
or the \code{omp\_set\_dynamic()} function to specify that the OpenMP
|
||||
Use the \kcode{OMP_DYNAMIC}
|
||||
or the \kcode{omp_set_dynamic()} function to specify that the OpenMP
|
||||
implementation dynamically adjust the number of threads for
|
||||
\code{parallel} regions. The default setting for dynamic adjustment is implementation
|
||||
\kcode{parallel} regions. The default setting for dynamic adjustment is implementation
|
||||
defined. When dynamic adjustment is on and the number of threads is specified,
|
||||
the number of threads becomes an upper limit for the number of threads to be
|
||||
provided by the OpenMP runtime.
|
||||
@ -49,61 +49,63 @@ WORKSHARING CONSTRUCTS
|
||||
A worksharing construct distributes the execution of the associated region
|
||||
among the members of the team that encounter it. There is an
|
||||
implied barrier at the end of the worksharing region
|
||||
(there is no barrier at the beginning). The worksharing
|
||||
constructs are:
|
||||
(there is no barrier at the beginning).
|
||||
|
||||
\newpage
|
||||
The worksharing constructs are:
|
||||
|
||||
\begin{compactitem}
|
||||
|
||||
\item loop constructs: {\code{for} and \code{do} }
|
||||
\item \code{sections}
|
||||
\item \code{single}
|
||||
\item \code{workshare}
|
||||
\item loop constructs: {\kcode{for} and \kcode{do} }
|
||||
\item \kcode{sections}
|
||||
\item \kcode{single}
|
||||
\item \kcode{workshare}
|
||||
|
||||
\end{compactitem}
|
||||
|
||||
The \code{for} and \code{do} constructs (loop constructs) create a region
|
||||
The \kcode{for} and \kcode{do} constructs (loop constructs) create a region
|
||||
consisting of a loop. A loop controlled by a loop construct is called
|
||||
an \plc{associated} loop. Nested loops can form a single region when the
|
||||
\code{collapse} clause (with an integer argument) designates the number of
|
||||
\kcode{collapse} clause (with an integer argument) designates the number of
|
||||
\plc{associated} loops to be executed in parallel, by forming a
|
||||
"single iteration space" for the specified number of nested loops.
|
||||
The \code{ordered} clause can also control multiple associated loops.
|
||||
``single iteration space'' for the specified number of nested loops.
|
||||
The \kcode{ordered} clause can also control multiple associated loops.
|
||||
|
||||
An associated loop must adhere to a "canonical form" (specified in the
|
||||
\plc{Canonical Loop Form} of the OpenMP Specifications document) which allows the
|
||||
An associated loop must adhere to a ``canonical form'' (specified in the
|
||||
\docref{Canonical Loop Form} section of the OpenMP Specifications document) which allows the
|
||||
iteration count (of all associated loops) to be computed before the
|
||||
(outermost) loop is executed. %[58:27-29].
|
||||
Most common loops comply with the canonical form, including C++ iterators.
|
||||
|
||||
A \code{single} construct forms a region in which only one thread (any one
|
||||
A \kcode{single} construct forms a region in which only one thread (any one
|
||||
of the team) executes the region.
|
||||
The other threads wait at the implied
|
||||
barrier at the end, unless the \code{nowait} clause is specified.
|
||||
barrier at the end, unless the \kcode{nowait} clause is specified.
|
||||
|
||||
The \code{sections} construct forms a region that contains one or more
|
||||
structured blocks. Each block of a \code{sections} directive is
|
||||
constructed with a \code{section} construct, and executed once by
|
||||
The \kcode{sections} construct forms a region that contains one or more
|
||||
structured blocks. Each block of a \kcode{sections} directive is
|
||||
constructed with a \kcode{section} construct, and executed once by
|
||||
one of the threads (any one) in the team. (If only one block is
|
||||
formed in the region, the \code{section} construct, which is used to
|
||||
formed in the region, the \kcode{section} construct, which is used to
|
||||
separate blocks, is not required.)
|
||||
The other threads wait at the implied
|
||||
barrier at the end, unless the \code{nowait} clause is specified.
|
||||
barrier at the end, unless the \kcode{nowait} clause is specified.
|
||||
|
||||
|
||||
The \code{workshare} construct is a Fortran feature that consists of a
|
||||
The \kcode{workshare} construct is a Fortran feature that consists of a
|
||||
region with a single structured block (section of code). Statements in the
|
||||
\code{workshare} region are divided into units of work, and executed (once)
|
||||
\kcode{workshare} region are divided into units of work, and executed (once)
|
||||
by threads of the team.
|
||||
|
||||
\bigskip
|
||||
MASKED CONSTRUCT
|
||||
|
||||
The \code{masked} construct is not a worksharing construct. The \code{masked} region is
|
||||
The \kcode{masked} construct is not a worksharing construct. The \kcode{masked} region is
|
||||
executed only by the primary thread. There is no implicit barrier (and flush)
|
||||
at the end of the \code{masked} region; hence the other threads of the team continue
|
||||
execution beyond code statements beyond the \code{masked} region.
|
||||
The \code{master} contruct, which has been deprecated in OpenMP 5.1, has identical semantics
|
||||
to the \code{masked} contruct with no \code{filter} clause.
|
||||
at the end of the \kcode{masked} region; hence the other threads of the team continue
|
||||
execution of code statements beyond the \kcode{masked} region.
|
||||
The \kcode{master} construct, which has been deprecated in OpenMP 5.1, has identical semantics
|
||||
to the \kcode{masked} construct with no \kcode{filter} clause.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
|
@ -7,17 +7,17 @@ are provided in this introduction and illustrated in subsequent examples.
|
||||
\bigskip
|
||||
CONDITIONAL COMPILATION and EXECUTION
|
||||
|
||||
Conditional compilation can be performed with conventional \#ifdef directives
|
||||
in C, C++, and Fortran, and additionally with OpenMP sentinel (\code{!\$}) in Fortran.
|
||||
The \code{if} clause on some directives
|
||||
Conditional compilation can be performed with conventional \bcode{\#ifdef} directives
|
||||
in C, C++, and Fortran, and additionally with OpenMP sentinel (\scode{!$}) in Fortran.
|
||||
The \kcode{if} clause on some directives
|
||||
can direct the runtime to ignore or alter the behavior of the construct.
|
||||
Of course, the base-language \code{if} statements can be used to control the execution
|
||||
of stand-alone directives (such as \code{flush}, \code{barrier}, \code{taskwait},
|
||||
and \code{taskyield}).
|
||||
Of course, the base-language \bcode{if} statements can be used to control the execution
|
||||
of stand-alone directives (such as \kcode{flush}, \kcode{barrier}, \kcode{taskwait},
|
||||
and \kcode{taskyield}).
|
||||
However, the directives must appear in a block structure, and not as a substatement.
|
||||
The \code{metadirective} and \code{declare}~\code{variant} directives provide conditional
|
||||
The \kcode{metadirective} and \kcode{declare variant} directives provide conditional
|
||||
selection of directives and routines for compilation (and use), respectively.
|
||||
The \code{assume} and \code{requires} directives provide invariants
|
||||
The \kcode{assume} and \kcode{requires} directives provide invariants
|
||||
for optimizing compilation, and essential features for compilation
|
||||
and correct execution, respectively.
|
||||
|
||||
@ -26,29 +26,29 @@ and correct execution, respectively.
|
||||
CANCELLATION
|
||||
|
||||
Cancellation (termination) of the normal sequence of execution for the threads in an OpenMP region can
|
||||
be accomplished with the \code{cancel} construct. The construct uses a
|
||||
be accomplished with the \kcode{cancel} construct. The construct uses a
|
||||
\plc{construct-type-clause} to set the region-type to activate for the cancellation.
|
||||
That is, inclusion of one of the \plc{construct-type-clause} names \code{parallel}, \code{for},
|
||||
\code{do}, \code{sections} or \code{taskgroup} on the directive line
|
||||
That is, inclusion of one of the \plc{construct-type-clause} names \kcode{parallel}, \kcode{for},
|
||||
\kcode{do}, \kcode{sections} or \kcode{taskgroup} on the directive line
|
||||
activates the corresponding region.
|
||||
The \code{cancel} construct is activated by the first encountering thread, and it
|
||||
The \kcode{cancel} construct is activated by the first encountering thread, and it
|
||||
continues execution at the end of the named region.
|
||||
The \code{cancel} construct is also a cancellation point for any other thread of the team
|
||||
The \kcode{cancel} construct is also a cancellation point for any other thread of the team
|
||||
to continue execution at the end of the named region.
|
||||
|
||||
Also, once the specified region has been activated for cancellation any thread that encounnters
|
||||
a \code{cancellation}~\code{point} construct with the same named region (\plc{construct-type-clause}),
|
||||
Also, once the specified region has been activated for cancellation any thread that encounters
|
||||
a \kcode{cancellation point} construct with the same named region (\plc{construct-type-clause}),
|
||||
continues execution at the end of the region.
|
||||
|
||||
For an activated \code{cancel taskgroup} construct, the tasks that
|
||||
For an activated \kcode{cancel taskgroup} construct, the tasks that
|
||||
belong to the taskgroup set of the innermost enclosing taskgroup region will be canceled.
|
||||
|
||||
A task that encounters a \code{cancel}~\code{taskgroup} construct continues execution at the end of its
|
||||
A task that encounters a \kcode{cancel taskgroup} construct continues execution at the end of its
|
||||
task region. Any task of the taskgroup that has already begun execution will run to completion,
|
||||
unless it encounters a \code{cancellation}~\code{point}; tasks that have not begun execution may be
|
||||
unless it encounters a \kcode{cancellation point}; tasks that have not begun execution may be
|
||||
discarded as completed tasks.
|
||||
|
||||
\bigskip
|
||||
\pagebreak
|
||||
CONTROL VARIABLES
|
||||
|
||||
Internal control variables (ICV) are used by implementations to hold values which control the execution
|
||||
@ -56,7 +56,7 @@ CONTROL VARIABLES
|
||||
or set and adjusted through environment variables, clauses, and API functions.
|
||||
%Many of the ICV control values are accessible through API function calls.
|
||||
Initial ICV values are reported by the runtime
|
||||
if the \code{OMP\_DISPLAY\_ENV} environment variable has been set to \code{TRUE} or \code{VERBOSE}.
|
||||
if the \kcode{OMP_DISPLAY_ENV} environment variable has been set to \vcode{TRUE} or \vcode{VERBOSE}.
|
||||
|
||||
%As an example, the \plc{nthreads-var} is the ICV that holds the number of threads
|
||||
%to be used in a \code{parallel} region. It can be set with the \code{OMP\_NUM\_THREADS} environment variable,
|
||||
@ -71,7 +71,7 @@ NESTED CONSTRUCTS
|
||||
|
||||
Certain combinations of nested constructs are permitted, giving rise to \plc{combined} constructs
|
||||
consisting of two or more directives. These can be used when the two (or several) constructs would be used
|
||||
immediately in succession (closely nested). A \plc{combined} construct can use the clauses of the component
|
||||
immediately in succession (closely nested). A combined construct can use the clauses of the component
|
||||
constructs without restrictions.
|
||||
A \plc{composite} construct is a combined construct which has one or more clauses with (an often obviously)
|
||||
modified or restricted meaning, relative to when the constructs are uncombined. %%[appear separately (singly).
|
||||
@ -80,34 +80,37 @@ modified or restricted meaning, relative to when the constructs are uncombined.
|
||||
%construct with one of the loops constructs \code{do} or \code{for}. The
|
||||
%\code{parallel do SIMD} and \code{parallel for SIMD} constructs are composite constructs (composed from
|
||||
%the parallel loop constructs and the \code{SIMD} construct), because the \code{collapse} clause must
|
||||
%explicitly address the ordering of loop chunking \plc{and} SIMD "combined" execution.
|
||||
%explicitly address the ordering of loop chunking \plc{and} SIMD ``combined'' execution.
|
||||
|
||||
Certain nestings are forbidden, and often the reasoning is obvious. For example, worksharing constructs cannot be nested, and
|
||||
the \code{barrier} construct cannot be nested inside a worksharing construct, or a \code{critical} construct.
|
||||
Also, \code{target} constructs cannot be nested, unless the nested target is a reverse offload.
|
||||
the \kcode{barrier} construct cannot be nested inside a worksharing construct, or a \kcode{critical} construct.
|
||||
Also, \kcode{target} constructs cannot be nested, unless the nested target is a reverse offload.
|
||||
|
||||
The \code{parallel} construct can be nested, as well as the \code{task} construct.
|
||||
The parallel execution in the nested parallel construct(s) is controlled by the
|
||||
\code{OMP\_MAX\_ACTIVE\_LEVELS} environment variable, and the \code{omp\_set\_max\_active\_levels} routine.
|
||||
Use the \code{omp\_get\_max\_active\_levels} routine to determine the maximum levels provided by an implementation.
|
||||
As of OpenMP 5.0, use of the \code{OMP\_NESTED} environment variable and the \code{omp\_set\_nested} routine
|
||||
The \kcode{parallel} construct can be nested, as well as the \kcode{task} construct.
|
||||
The parallel execution in the nested \kcode{parallel} construct(s) is controlled by the
|
||||
\kcode{OMP_MAX_ACTIVE_LEVELS} environment variable, and the \kcode{omp_set_max_active_levels} routine.
|
||||
Use the \kcode{omp_get_max_active_levels} routine to determine the maximum levels provided by an implementation.
|
||||
As of OpenMP 5.0, use of the \kcode{OMP_NESTED} environment variable and the \kcode{omp_set_nested} routine
|
||||
has been deprecated.
|
||||
|
||||
More details on nesting can be found in the \plc{Nesting of Regions} of the \plc{Directives}
|
||||
More details on nesting can be found in the \docref{Nesting of Regions} section of the \docref{Directives}
|
||||
chapter in the OpenMP Specifications document.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
\input{program_control/assumption}
|
||||
\input{program_control/cond_comp}
|
||||
\input{program_control/icv}
|
||||
\input{program_control/standalone}
|
||||
\input{program_control/cancellation}
|
||||
\input{program_control/requires}
|
||||
\input{program_control/variant}
|
||||
\input{program_control/metadirective}
|
||||
\input{program_control/context_based_variants}
|
||||
\input{program_control/dispatch}
|
||||
\input{program_control/nested_loop}
|
||||
\input{program_control/nesting_restrict}
|
||||
\input{program_control/target_offload}
|
||||
\input{program_control/pause_resource}
|
||||
\input{program_control/reproducible}
|
||||
\input{program_control/interop}
|
||||
\input{program_control/utilities}
|
||||
|
||||
|
@ -1,47 +1,47 @@
|
||||
\cchapter{Synchronization}{synchronization}
|
||||
\label{chap:synchronization}
|
||||
|
||||
The \code{barrier} construct is a stand-alone directive that requires all threads
|
||||
The \kcode{barrier} construct is a stand-alone directive that requires all threads
|
||||
of a team (within a contention group) to execute the barrier and complete
|
||||
execution of all tasks within the region, before continuing past the barrier.
|
||||
|
||||
The \code{critical} construct is a directive that contains a structured block.
|
||||
The \kcode{critical} construct is a directive that contains a structured block.
|
||||
The construct allows only a single thread at a time to execute the structured block (region).
|
||||
Multiple critical regions may exist in a parallel region, and may
|
||||
act cooperatively (only one thread at a time in all \code{critical} regions),
|
||||
or separately (only one thread at a time in each \code{critical} regions when
|
||||
a unique name is supplied on each \code{critical} construct).
|
||||
An optional (lock) \code{hint} clause may be specified on a named \code{critical}
|
||||
Multiple \kcode{critical} regions may exist in a parallel region, and may
|
||||
act cooperatively (only one thread at a time in all \kcode{critical} regions),
|
||||
or separately (only one thread at a time in each \kcode{critical} region when
|
||||
a unique name is supplied on each \kcode{critical} construct).
|
||||
An optional (lock) \kcode{hint} clause may be specified on a named \kcode{critical}
|
||||
construct to provide the OpenMP runtime guidance in selecting a locking
|
||||
mechanism.
|
||||
|
||||
On a finer scale the \code{atomic} construct allows only a single thread at
|
||||
On a finer scale the \kcode{atomic} construct allows only a single thread at
|
||||
a time to have atomic access to a storage location involving a single read,
|
||||
write, update or capture statement, and a limited number of combinations
|
||||
when specifying the \code{capture} \plc{atomic-clause} clause. The
|
||||
when specifying the \kcode{capture} \plc{atomic-clause} clause. The
|
||||
\plc{atomic-clause} clause is required for some expression statements, but is
|
||||
not required for \code{update} statements. The \plc{memory-order} clause can be
|
||||
used to specify the degree of memory ordering enforced by an \code{atomic}
|
||||
construct. From weakest to strongest, they are \code{relaxed} (the default),
|
||||
acquire and/or release clauses (specified with \code{acquire}, \code{release},
|
||||
or \code{acq\_rel}), and \code{seq\_cst}. Please see the details in the
|
||||
\plc{atomic Construct} subsection of the \plc{Directives} chapter in the OpenMP
|
||||
not required for \kcode{update} statements. The \plc{memory-order} clause can be
|
||||
used to specify the degree of memory ordering enforced by an \kcode{atomic}
|
||||
construct. From weakest to strongest, they are \kcode{relaxed} (the default),
|
||||
\plc{acquire} and/or \plc{release} clauses (specified with \kcode{acquire}, \kcode{release},
|
||||
or \kcode{acq_rel}), and \kcode{seq_cst}. Please see the details in the
|
||||
\docref{atomic Construct} subsection of the \docref{Directives} chapter in the OpenMP
|
||||
Specifications document.
|
||||
|
||||
% The following three sentences were stolen from the spec.
|
||||
The \code{ordered} construct either specifies a structured block in a loop,
|
||||
The \kcode{ordered} construct either specifies a structured block in a loop,
|
||||
simd, or loop SIMD region that will be executed in the order of the loop
|
||||
iterations. The ordered construct sequentializes and orders the execution
|
||||
of ordered regions while allowing code outside the region to run in parallel.
|
||||
iterations. The \kcode{ordered} construct sequentializes and orders the execution
|
||||
of \kcode{ordered} regions while allowing code outside the region to run in parallel.
|
||||
|
||||
Since OpenMP 4.5 the \code{ordered} construct can also be a stand-alone
|
||||
directive that specifies cross-iteration dependences in a doacross loop nest.
|
||||
The \code{depend} clause uses a \code{sink} \plc{dependence-type}, along with a
|
||||
iteration vector argument (vec) to indicate the iteration that satisfies the
|
||||
dependence. The \code{depend} clause with a \code{source}
|
||||
Since OpenMP 4.5 the \kcode{ordered} construct can also be a stand-alone
|
||||
directive that specifies cross-iteration dependences in a \plc{doacross} loop nest.
|
||||
The \kcode{depend} clause uses a \kcode{sink} \plc{dependence-type}, along with an
|
||||
iteration vector argument (\plc{vec}) to indicate the iteration that satisfies the
|
||||
dependence. The \kcode{depend} clause with a \kcode{source}
|
||||
\plc{dependence-type} specifies dependence satisfaction.
|
||||
|
||||
The \code{flush} directive is a stand-alone construct for enforcing consistency
|
||||
The \kcode{flush} directive is a stand-alone construct for enforcing consistency
|
||||
between a thread's view of memory and the view of memory for other threads (see
|
||||
the Memory Model chapter of this document for more details). When the construct
|
||||
is used with an explicit variable list, a \plc{strong flush} that forces a
|
||||
@ -55,7 +55,7 @@ semantics. When an explicit variable list is not present and a
|
||||
release memory ordering semantics according to the \plc{memory-order} clause,
|
||||
but no strong flush is performed. A resulting strong flush that applies to a
|
||||
set of variables effectively ensures that no memory (load or store)
|
||||
operation for the affected variables may be reordered across the \code{flush}
|
||||
operation for the affected variables may be reordered across the \kcode{flush}
|
||||
directive.
|
||||
|
||||
General-purpose routines provide mutual exclusion semantics through locks,
|
||||
@ -69,14 +69,14 @@ types of locks, and the variable of a specific lock type cannot be used by the
|
||||
other lock type.
|
||||
|
||||
Any explicit task will observe the synchronization prescribed in a
|
||||
\code{barrier} construct and an implied barrier. Also, additional synchronizations
|
||||
are available for tasks. All children of a task will wait at a \code{taskwait} (for
|
||||
their siblings to complete). A \code{taskgroup} construct creates a region in which the
|
||||
\kcode{barrier} construct and an implied barrier. Also, additional synchronizations
|
||||
are available for tasks. All children of a task will wait at a \kcode{taskwait} (for
|
||||
their siblings to complete). A \kcode{taskgroup} construct creates a region in which the
|
||||
current task is suspended at the end of the region until all sibling tasks,
|
||||
and their descendants, have completed.
|
||||
Scheduling constraints on task execution can be prescribed by the \code{depend}
|
||||
Scheduling constraints on task execution can be prescribed by the \kcode{depend}
|
||||
clause to enforce dependence on previously generated tasks.
|
||||
More details on controlling task executions can be found in the \plc{Tasking} Chapter
|
||||
More details on controlling task executions can be found in the \docref{Tasking} chapter
|
||||
in the OpenMP Specifications document. %(DO REF. RIGHT.)
|
||||
|
||||
|
||||
@ -85,8 +85,9 @@ in the OpenMP Specifications document. %(DO REF. RIGHT.)
|
||||
\input{synchronization/worksharing_critical}
|
||||
\input{synchronization/barrier_regions}
|
||||
\input{synchronization/atomic}
|
||||
\input{synchronization/atomic_cas}
|
||||
\input{synchronization/atomic_restrict}
|
||||
\input{synchronization/flush_nolist}
|
||||
\input{synchronization/atomic_hint}
|
||||
\input{synchronization/acquire_release}
|
||||
\input{synchronization/ordered}
|
||||
\input{synchronization/depobj}
|
||||
|
@ -2,33 +2,33 @@
|
||||
\label{chap:tasking}
|
||||
|
||||
Tasking constructs provide units of work to a thread for execution.
|
||||
Worksharing constructs do this, too (e.g. \code{for}, \code{do},
|
||||
\code{sections}, and \code{singles} constructs);
|
||||
Worksharing constructs do this, too (e.g. \kcode{for}, \kcode{do},
|
||||
\kcode{sections}, and \kcode{single} constructs);
|
||||
but the work units are tightly controlled by an iteration limit and limited
|
||||
scheduling, or a limited number of \code{sections} or \code{single} regions.
|
||||
scheduling, or a limited number of \kcode{sections} or \kcode{single} regions.
|
||||
Worksharing was designed
|
||||
with \texttt{"}data parallel\texttt{"} computing in mind. Tasking was designed for
|
||||
\texttt{"}task parallel\texttt{"} computing and often involves non-locality or irregularity
|
||||
with ``data parallel'' computing in mind. Tasking was designed for
|
||||
``task parallel'' computing and often involves non-locality or irregularity
|
||||
in memory access.
|
||||
|
||||
The \code{task} construct can be used to execute work chunks: in a while loop;
|
||||
The \kcode{task} construct can be used to execute work chunks: in a while loop;
|
||||
while traversing nodes in a list; at nodes in a tree graph;
|
||||
or in a normal loop (with a \code{taskloop} construct).
|
||||
or in a normal loop (with a \kcode{taskloop} construct).
|
||||
Unlike the statically scheduled loop iterations of worksharing, a task is
|
||||
often enqueued, and then dequeued for execution by any of the threads of the
|
||||
team within a parallel region. The generation of tasks can be from a single
|
||||
generating thread (creating sibling tasks), or from multiple generators
|
||||
in recursive graph tree traversals.
|
||||
%(creating a parent-descendents hierarchy of tasks, see example 4 and 7 below).
|
||||
A \code{taskloop} construct
|
||||
A \kcode{taskloop} construct
|
||||
bundles iterations of an associated loop into tasks, and provides
|
||||
similar controls found in the \code{task} construct.
|
||||
similar controls found in the \kcode{task} construct.
|
||||
|
||||
Sibling tasks are synchronized by the \code{taskwait} construct, and tasks
|
||||
Sibling tasks are synchronized by the \kcode{taskwait} construct, and tasks
|
||||
and their descendent tasks can be synchronized by containing them in
|
||||
a \code{taskgroup} region. Ordered execution is accomplished by specifying
|
||||
dependences with a \code{depend} clause. Also, priorities can be
|
||||
specified as hints to the scheduler through a \code{priority} clause.
|
||||
a \kcode{taskgroup} region. Ordered execution is accomplished by specifying
|
||||
dependences with a \kcode{depend} clause. Also, priorities can be
|
||||
specified as hints to the scheduler through a \kcode{priority} clause.
|
||||
|
||||
Various clauses can be used to manage and optimize task generation,
|
||||
as well as reduce the overhead of execution and to relinquish
|
||||
@ -36,18 +36,18 @@ control of threads for work balance and forward progress.
|
||||
|
||||
Once a thread starts executing a task, it is the designated thread
|
||||
for executing the task to completion, even though it may leave the
|
||||
execution at a scheduling point and return later. The thread is tied
|
||||
to the task. Scheduling points can be introduced with the \code{taskyield}
|
||||
construct. With an \code{untied} clause any other thread is allowed to continue
|
||||
the task. An \code{if} clause with an expression that evaluates to \plc{false}
|
||||
results in an \emph{undeferred} task, which instructs the runtime to suspend
|
||||
execution at a scheduling point and return later. The thread is \plc{tied}
|
||||
to the task. Scheduling points can be introduced with the \kcode{taskyield}
|
||||
construct. With an \kcode{untied} clause any other thread is allowed to continue
|
||||
the task. An \kcode{if} clause with an expression that evaluates to \plc{false}
|
||||
results in an \plc{undeferred} task, which instructs the runtime to suspend
|
||||
the generating task until the undeferred task completes its execution.
|
||||
By including the data environment of the generating task into the generated task with the
|
||||
\code{mergeable} and \code{final} clauses, task generation overhead can be reduced.
|
||||
\kcode{mergeable} and \kcode{final} clauses, task generation overhead can be reduced.
|
||||
|
||||
A complete list of the tasking constructs and details of their clauses
|
||||
can be found in the \plc{Tasking Constructs} chapter of the OpenMP Specifications,
|
||||
in the \plc{OpenMP Application Programming Interface} section.
|
||||
can be found in the \docref{Tasking Constructs} chapter of the OpenMP Specifications.
|
||||
%in the \docref{OpenMP Application Programming Interface} section.
|
||||
|
||||
|
||||
%===== Examples Sections =====
|
||||
@ -59,4 +59,5 @@ in the \plc{OpenMP Application Programming Interface} section.
|
||||
\input{tasking/taskyield}
|
||||
\input{tasking/taskloop}
|
||||
\input{tasking/parallel_masked_taskloop}
|
||||
\input{tasking/taskloop_dep}
|
||||
|
||||
|
185
Contributions.md
185
Contributions.md
@ -3,7 +3,8 @@
|
||||
The usual process for adding new examples, making changes or adding corrections
|
||||
is to submit an issue for discussion and initial evaluation of changes or example additions.
|
||||
When there is a consensus at a meeting about the contribution,
|
||||
you will be asked to submit a pull request.
|
||||
the issue will be brought forward for voting at the OpenMP Language
|
||||
Committee meetings and you will be asked to submit a pull request.
|
||||
|
||||
Of course, if your contribution is an obvious correction, clarification, or note, you
|
||||
may want to submit a pull request directly.
|
||||
@ -13,7 +14,7 @@ may want to submit a pull request directly.
|
||||
## The OpenMP Examples document
|
||||
|
||||
The OpenMP Examples document is in LaTeX format.
|
||||
Please see the master LaTeX file, `openmp-examples.tex`, for more information.
|
||||
Please see the main LaTeX file, `openmp-examples.tex`, for more information.
|
||||
|
||||
## Maintainer
|
||||
|
||||
@ -22,22 +23,22 @@ For a brief revision history, see `Changes.log` in the repo.
|
||||
|
||||
## Git procedure
|
||||
|
||||
* Fork your own branch of the OpenMP [examples-internal repo](https:/github.com/openmp/examples-internal)
|
||||
* Fork your own branch of the OpenMP [examples-internal repo](https://github.com/OpenMP/examples-internal)
|
||||
* Clone your fork locally
|
||||
* If you are working on generic or old-version updates, create a branch off master.
|
||||
* If you are working on generic or old-version updates, create a branch off main.
|
||||
* If you are working on an example for a release candidate for version #.#, create a branch off work_#.#.
|
||||
1.) `git clone --branch <master|work_#.#> https://github.com/<my_account>/examples-internal`
|
||||
2.) `git checkout -b <branch_name>`
|
||||
3.) ... `add`, `commit`
|
||||
4.) `git push -u origin <branch_name>`
|
||||
5.) `make` or `make diff` will create a full-document pdf or just a pdf with differences (do this at any point).
|
||||
1) `git clone --branch <main|work_#.#> https://github.com/<my_account>/examples-internal`
|
||||
2) `git checkout -b <branch_name>`
|
||||
3) ... `add`, `commit`
|
||||
4) `git push -u origin <branch_name>`
|
||||
5) `make` or `make diff` will create a full-document pdf or just a pdf with differences (do this at any point).
|
||||
* `git status` and `git branch -a` are your friends
|
||||
* Submit an issue for your work (usually with a diff pdf), and then you will be asked to submit a pull request
|
||||
* Create an issue by selecting the (issue tab)[https://github.com/openmp/examples-internal/issues] and clicking on `new issue`.
|
||||
* Create an issue by selecting the [issue tab](https://github.com/OpenMP/examples-internal/issues) and clicking on `new issue`.
|
||||
* Use this Markdown Cheatsheet for [issue formatting](https://wordpress.com/support/markdown-quick-reference/)
|
||||
* More Markdown details are available [here](https://markdown-it.github.io)
|
||||
* You can cut and paste markdown formatted text in a [reader](https://dillinger.io) to see formatting effects.
|
||||
* Forced spaces are available in Markdown. On a Mac is is "option+space".
|
||||
* Forced spaces are available in Markdown. On a Mac it is "option+space".
|
||||
* Polling is available. Go to [gh-poll](https://app.gh-polls.com/). Type an option on each line, then click `copy markdown`, and paste the contents into the issue. (Use preview to check your poll, and then submit it.)
|
||||
* Create a pull request
|
||||
|
||||
@ -45,37 +46,53 @@ For a brief revision history, see `Changes.log` in the repo.
|
||||
## Processing source code
|
||||
|
||||
* Prepare source code (C/C++ and Fortran) and a text description (use similar styles found in recent examples)
|
||||
* Determine the *example* name `<ename>`, *sequence* number `<seq-no>` and *compiler* suffix `<csuffix>` for the example
|
||||
* The syntax is: `<ename>.<seq-no>.<csuffix>` (e.g. `affinity_display.1.f90`)
|
||||
* Determine the *example* name `<ename>`, *sequence* identifier `<seq-id>` and *compiler* suffix `<csuffix>` for the example
|
||||
* The syntax is: `<ename>.<seq-id>.<csuffix>` (e.g. `affinity_display.1.f90`)
|
||||
* The example name may be a Section name (e.g. affinity), or a Subsection name (affinity_display)
|
||||
* If you are creating a new Chapter, it may be the chapter name.
|
||||
* New examples are usually added at the end of a Section or Subsection. Number it as the next number in the sequence numbers for examples in that Section or Subsection.
|
||||
* The compiler suffix `<csuffix>` is `c`, `cpp`, `f`, and `f90` for C, C++ and Fortran codes.
|
||||
* The compiler suffix `<csuffix>` is `c`, `cpp`, `f`, and `f90` for C, C++ and Fortran (fixed/free form) codes.
|
||||
* Insert the code in the sources directory for each chapter, and include the following metadata:
|
||||
* Metadata Tags for example sources:
|
||||
```
|
||||
@@name: <ename>.<seq-no>[c|cpp|f|f90]
|
||||
@@name: <ename>.<seq-no>
|
||||
@@type: C|C++|F-fixed|F-free
|
||||
@@compilable: yes|no|maybe
|
||||
@@linkable: yes|no|maybe
|
||||
@@expect: success|failure|nothing|rt-error
|
||||
@@version: omp_<verno>
|
||||
@@operation: view|compile|link|run
|
||||
@@expect: success|ct-error|rt-error|unspecified
|
||||
@@version: [pre_]omp_<verno>
|
||||
@@env: <environment_variables>
|
||||
@@depend: <source_code_name>
|
||||
```
|
||||
* **name**
|
||||
is the name of an example
|
||||
* **type**
|
||||
is the source code type, which can be translated into or from proper file extension (c,cpp,f,f90)
|
||||
* **compilable**
|
||||
indicates whether the source code is compilable
|
||||
* **linkable**
|
||||
indicates whether the source code is linkable
|
||||
* **expect**
|
||||
indicates some expected result for testing purpose "`success|failure|nothing`" applies
|
||||
to the result of code compilation "`rt-error`" is for a case where compilation may be
|
||||
successful, but the code contains potential runtime issues (such as race condition).
|
||||
Alternative would be to just use "`conforming`" or "`non-conforming`".
|
||||
* **version**
|
||||
indicates features for a specific OpenMP version, such as "`omp_5.0`"
|
||||
* **name**
|
||||
- is the name of an example
|
||||
* **type**
|
||||
- is the source code type, which can be translated into or from proper file extension (C:c,C++:cpp,F-fixed:f,F-free:f90)
|
||||
* **operation**
|
||||
- indicates how the source code is treated. Possible values are:
|
||||
- `view` - code for illustration only, not compilable;
|
||||
- `compile` - incomplete program, such as function or subroutine;
|
||||
- `link` - complete program, but no verification value;
|
||||
- `run` - complete program with verification value.
|
||||
* **expect**
|
||||
- indicates some expected result for testing purpose.
|
||||
- `success` means no issue;
|
||||
- `ct-error` applies to the result of code compilation;
|
||||
- `rt-error` is for a case where compilation may be successful, but the code
|
||||
contains potential runtime issues (including race condition);
|
||||
- `unspecified` could result from non-conforming code or is for code
|
||||
that is viewable only.
|
||||
* **version**
|
||||
- indicates that the example uses features in a specific OpenMP version, such as "`omp_5.0`".
|
||||
The prefix `pre_` indicates that the example uses features prior to a specific version, such as "`pre_omp_3.0`".
|
||||
* **env**
|
||||
- specifies any environment variables needed to run the code.
|
||||
This tag is optional and can be repeated.
|
||||
* **depend**
|
||||
- specifies a source code file on which the current code depends.
|
||||
This tag is optional and can be repeated.
|
||||
* For **env** and **depend**, make sure to specify
|
||||
a proper skipping number `<s>` in the LaTeX macros described below
|
||||
to match with the number of `env` and `depend` tags.
|
||||
|
||||
|
||||
## Process for text
|
||||
@ -89,28 +106,36 @@ For a brief revision history, see `Changes.log` in the repo.
|
||||
* Shepherd your issue to acceptance (discussed at weekly Examples meeting and in issue comments)
|
||||
* When it is in a ready state, you should then submit a pull request.
|
||||
* It will be reviewed and voted on, and changes will be requested.
|
||||
* Once the last changes are made, it will be verified and merged into an appropriate branch (either the `master` branch or a working branch).
|
||||
* Once the last changes are made, it will be verified and merged into an appropriate branch (either the `main` branch or a working branch).
|
||||
|
||||
|
||||
|
||||
|
||||
# LaTeX macros for examples
|
||||
## LaTeX macros for examples
|
||||
|
||||
The following describes LaTeX macros defined specifically for examples.
|
||||
* Source code with language h-rules
|
||||
* Source code without language h-rules
|
||||
* Language h-rules
|
||||
* Macros for keywords in text description
|
||||
* Other macros
|
||||
* See `openmp.sty` for more information
|
||||
|
||||
### Source code with language h-rules
|
||||
```
|
||||
\cexample[<verno>]{<ename>}{<seq-no>} % for C/C++ examples
|
||||
\cppexample[<verno>]{<ename>}{<seq-no>} % for C++ examples
|
||||
\fexample[<verno>]{<ename>}{<seq-no>} % for fixed-form Fortran examples
|
||||
\ffreeexample[<verno>]{<ename>}{<seq-no>} % for free-form Fortran examples
|
||||
\cexample[<verno>]{<ename>}{<seq-no>}[<s>] % for C/C++ examples
|
||||
\cppexample[<verno>]{<ename>}{<seq-no>}[<s>] % for C++ examples
|
||||
\fexample[<verno>]{<ename>}{<seq-no>}[<s>] % for fixed-form Fortran examples
|
||||
\ffreeexample[<verno>]{<ename>}{<seq-no>}[<s>] % for free-form Fortran examples
|
||||
```
|
||||
|
||||
* Source code without language h-rules
|
||||
### Source code without language h-rules
|
||||
```
|
||||
\cnexample[<verno>]{<ename>}{<seq-no>}
|
||||
\cppnexample[<verno>]{<ename>}{<seq-no>}
|
||||
\fnexample[<verno>]{<ename>}{<seq-no>}
|
||||
\ffreenexample[<verno>]{<ename>}{<seq-no>}
|
||||
\srcnexample[<verno>]{<ename>}{<seq-no>}{<ext>}
|
||||
\cnexample[<verno>]{<ename>}{<seq-no>}[<s>]
|
||||
\cppnexample[<verno>]{<ename>}{<seq-no>}[<s>]
|
||||
\fnexample[<verno>]{<ename>}{<seq-no>}[<s>]
|
||||
\ffreenexample[<verno>]{<ename>}{<seq-no>}[<s>]
|
||||
\srcnexample[<verno>]{<ename>}{<seq-no>}{<ext>}[<s>]
|
||||
```
|
||||
|
||||
Optional `<verno>` can be supplied in a macro to include a specific OpenMP
|
||||
@ -120,20 +145,66 @@ For a brief revision history, see `Changes.log` in the repo.
|
||||
prefix `<verno>` with an underscore '\_' symbol in the macro.
|
||||
|
||||
The exception is macro `\srcnexample`, for which the corresponding
|
||||
source code should not contain any `@@` metadata tags. The `ext` argument
|
||||
source code might not contain any `@@` metadata tags. The `ext` argument
|
||||
to this macro is the file extension (such as `h`, `hpp`, `inc`).
|
||||
|
||||
* Language h-rules
|
||||
The `<s>` option to each macro allows finer control of any additional lines
|
||||
to be skipped due to addition of new `@@` tags, such as `@@env`.
|
||||
The default value for `<s>` is 0.
|
||||
|
||||
### Language h-rules
|
||||
```
|
||||
\cspecificstart, \cspecificend
|
||||
\cppspecificstart, \cppspecificend
|
||||
\ccppspecificstart, \ccppspecificend
|
||||
\fortranspecificstart, \fortranspecificend
|
||||
\begin{cspecific}[s] ... \end{cspecific}
|
||||
\begin{cppspecific}[s] ... \end{cppspecific}
|
||||
\begin{ccppspecific}[s] ... \end{ccppspecific}
|
||||
\begin{fortranspecific}[s] ... \end{fortranspecific}
|
||||
\topmarker{Lang}
|
||||
```
|
||||
|
||||
* Chapter and section macros
|
||||
Use of the structured `\begin{} .. \end{}` environments is the preferred
|
||||
way of specifying language-dependent text over the unstructured approach
|
||||
of using `\*specificstart` and `\*specificend`.
|
||||
The option `[s]` to each of the environments can specify a vertical shift
|
||||
for the beginning rule, such as when followed by a section header.
|
||||
|
||||
The macro `\topmarker` puts a dashed blue line floater at the top of a page for
|
||||
"Lang (cont.)" where `Lang` can be `C/C++`, `C++`, `Fortran`.
|
||||
|
||||
### Macros for keywords in text description
|
||||
A partial list:
|
||||
- `\kcode{}` - for OpenMP keywords, such as directives, clauses, environment variables, API routines. Supports direct use of '_' (underscore) and ' ' (space).
|
||||
- `\scode{}` - OpenMP specifier with special chars, such as '`$`' in "`!$omp`"
|
||||
- `\bcode{}` - base language keywords (such as `ASSOCIATE` in Fortran)
|
||||
- `\vcode{}` - values of a keyword, such as `TRUE`, `FALSE`, `VERBOSE`
|
||||
- `\plc{}` - OpenMP concept, such as ICV names; `\splc{}` - escape '_' (underscore)
|
||||
- `\example{}` - example names, such as `\example{taskloop_reduction.1}`
|
||||
- `\docref{}` - chapter or section name of a document, such as the spec
|
||||
- `\ucode{}` - program variables, procedure names, or expressions in example code. Supports direct use of '_' (underscore) and ' ' (space).
|
||||
- `\pout{}` - program outputs
|
||||
|
||||
Examples:
|
||||
- `\kcode{declare reduction}` for **declare reduction**
|
||||
- `\scode{!$omp}` sentinel, however, `\kcode{\#pragma omp}`
|
||||
- `\kcode{map(iterator(\ucode{i=0:n}), tofrom: \ucode{p[i]})}` for **map(iterator(**_i=0:n_**), tofrom:** _p[i]_**)**
|
||||
- Fortran `\bcode{IMPLICIT NONE}` statement
|
||||
- The `\vcode{VERBOSE}` value for `\kcode{OMP_DISPLAY_ENV}`
|
||||
- OpenMP `\plc{directives}`, the `\plc{num-threads}` ICV
|
||||
- This is an example name `\example{taskloop_reduction.1}`
|
||||
- `(\ucode{x,y,z})` argument for procedure `\ucode{a_proc_name}`
|
||||
- structure constructor `\ucode{point($\ldots$)}`
|
||||
- This is a code output `"\pout{x = 1}"`
|
||||
|
||||
### Other macros
|
||||
```
|
||||
\cchapter{<Chapter Name>}{<chap_directory>}
|
||||
\hexentry[ext1]{<example_name>}[ext2]{<earlier_tag>}
|
||||
\hexmentry[ext1]{<example_name>}[ext2]{<earlier_tag>}{<prior_name>}
|
||||
\examplesref{<verno>}
|
||||
\examplesblob{<verno/file>}
|
||||
```
|
||||
|
||||
The `\cchapter` macro is used for starting a chapter with proper page spacing.
|
||||
@ -146,8 +217,18 @@ A previously-defined macro `\sinput{<section_file>}` to import a section
|
||||
file from `<chap_directory>` is no longer supported. Please use
|
||||
`\input{<chap_directory>/<section_file>}` explicitly.
|
||||
|
||||
* See `openmp.sty` for more information
|
||||
The two macros `\hexentry` and `\hexmentry` are defined for simplifying
|
||||
entries in the feature deprecation and update tables. Option `[ext1]` is
|
||||
the file extension with a default value of `c` and option `[ext2]` is
|
||||
the file extension for the associated second file if present.
|
||||
`<earlier_tag>` is the version tag of the corresponding example
|
||||
in the earlier version. `\hexentry` assumes no name change for an example
|
||||
in different versions; `\hexmentry` can be used to specify a prior name
|
||||
if it is different.
|
||||
|
||||
### License
|
||||
The two macros `\examplesref` and `\examplesblob` are for referencing
|
||||
a specific version of or a file in the github Examples repository.
|
||||
|
||||
For copyright information, please see `omp_copyright.txt`.
|
||||
## License
|
||||
|
||||
For copyright information, please see [omp_copyright.txt](omp_copyright.txt).
|
||||
|
282
Deprecated_Features.tex
Normal file
282
Deprecated_Features.tex
Normal file
@ -0,0 +1,282 @@
|
||||
\cchapter{Feature Deprecations and Updates in Examples}{deprecated_features}
|
||||
\label{chap:deprecated_features}
|
||||
\label{sec:deprecated_features}
|
||||
\index{deprecated features}
|
||||
|
||||
\newcommand\tabpcont[1]{\multicolumn{2}{l}{\small\slshape table continued #1 page}}
|
||||
\newcommand\tabpheader{\textbf{Version} & \textbf{Deprecated Feature} &
|
||||
\textbf{Replacement}}
|
||||
\newcommand\tabuheader{\textbf{Example Name} & \textbf{Earlier Version} &
|
||||
\textbf{Feature Updated}}
|
||||
\newcommand\dpftable[1]{
|
||||
\renewcommand{\arraystretch}{1.0}
|
||||
\tablefirsthead{%
|
||||
\hline\\[-2ex]
|
||||
\tabuheader\\[2pt]
|
||||
\hline\\[-2ex]
|
||||
}
|
||||
\tablehead{%
|
||||
\tabpcont{from previous}\\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\tabuheader\\[2pt]
|
||||
\hline\\[-2ex]
|
||||
}
|
||||
\tabletail{%
|
||||
\hline\\[-2.5ex]
|
||||
\tabpcont{on next}\\
|
||||
}
|
||||
\tablelasttail{\hline\\[-1ex]}
|
||||
\tablecaption{Updated Examples for Features Deprecated in Version #1\label{tab:Updated Examples #1}}
|
||||
}
|
||||
|
||||
|
||||
Deprecation of features began in OpenMP 5.0.
|
||||
Examples that use a deprecated feature have been updated with an equivalent
|
||||
replacement feature.
|
||||
|
||||
Table~\ref{tab:Deprecated Features} summarizes deprecated features and
|
||||
their replacements in each version. Affected examples are updated
|
||||
accordingly and listed in Section~\ref{sec:Updated Examples}.
|
||||
|
||||
\nolinenumbers
|
||||
\renewcommand{\arraystretch}{1.4}
|
||||
\tablefirsthead{%
|
||||
\hline
|
||||
\tabpheader\\
|
||||
\hline\\[-3.5ex]
|
||||
}
|
||||
\tablehead{%
|
||||
\tabpcont{from previous}\\
|
||||
\hline
|
||||
\tabpheader\\
|
||||
\hline\\[-3ex]
|
||||
}
|
||||
\tabletail{%
|
||||
\hline\\[-4ex]
|
||||
\tabpcont{on next}\\
|
||||
}
|
||||
\tablelasttail{\hline\\[-2ex]}
|
||||
\tablecaption{Deprecated Features and Their Replacements\label{tab:Deprecated Features}}
|
||||
\begin{supertabular}{p{0.4in} p{2.3in} p{2.2in}}
|
||||
6.0 & \kcode{declare reduction(}\plc{reduction-id}: \plc{typename-list}: \plc{combiner}\kcode{)}
|
||||
& \kcode{declare reduction(}\plc{reduction-id}: \plc{typename-list}\kcode{)} \kcode{combiner(\plc{combiner-exp})} \\
|
||||
\hline
|
||||
5.2 & \kcode{default} clause on metadirectives
|
||||
& \kcode{otherwise} clause \\
|
||||
5.2 & delimited \kcode{declare target} directive for C/C++
|
||||
& \kcode{begin declare target} directive \\
|
||||
5.2 & \kcode{to} clause on \kcode{declare target} directive
|
||||
& \kcode{enter} clause \\
|
||||
5.2 & non-argument \kcode{destroy} clause on \kcode{depobj} construct
|
||||
& \kcode{destroy(\plc{argument})} \\
|
||||
5.2 & \kcode{allocate} directive for Fortran \bcode{ALLOCATE} statements
|
||||
& \kcode{allocators} directive \\
|
||||
5.2 & \kcode{depend} clause on \kcode{ordered} construct
|
||||
& \kcode{doacross} clause \\
|
||||
5.2 & \kcode{linear(\plc{modifier(list): linear-step})} clause
|
||||
& \kcode{linear(\plc{list:} step(\plc{linear-step})\plc{, modifier})} clause \\
|
||||
\hline
|
||||
5.1 & \kcode{master} construct
|
||||
& \kcode{masked} construct \\
|
||||
5.1 & \kcode{master} affinity policy
|
||||
& \kcode{primary} affinity policy \\
|
||||
\hline
|
||||
5.0 & \kcode{omp_lock_hint_*} constants
|
||||
& \kcode{omp_sync_hint_*} constants \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
These replacements appear in examples that otherwise illustrate earlier features.
|
||||
When using a compiler that is compliant with a version prior to
|
||||
the indicated version, the earlier form of an example for a previous
|
||||
version is listed as a reference.
|
||||
|
||||
\newpage
|
||||
\section{Updated Examples for Different Versions}
|
||||
\label{sec:Updated Examples}
|
||||
|
||||
The following tables list the updated examples for different versions as
|
||||
a result of feature deprecation. The \emph{Earlier Version} column of
|
||||
the tables shows the version tag of the earlier version. It also shows
|
||||
the prior name of an example when it has been renamed.
|
||||
|
||||
|
||||
Table~\ref{tab:Updated Examples 6.0} lists the updated examples for
|
||||
features deprecated in OpenMP 6.0
|
||||
in the Examples Document Version
|
||||
\href{https://github.com/OpenMP/Examples/tree/v6.0}{6.0}.
|
||||
The \emph{Earlier Version} column of the table lists the earlier version
|
||||
tags of the examples that can be found in
|
||||
the Examples Document Version
|
||||
\href{https://github.com/OpenMP/Examples/tree/v5.2}{5.2}.
|
||||
|
||||
\index{clauses!combiner@\kcode{combiner}}
|
||||
\index{combiner clause@\kcode{combiner} clause}
|
||||
|
||||
\nolinenumbers
|
||||
\dpftable{6.0}
|
||||
\begin{supertabular}{p{1.7in} p{1.1in} p{2.2in}}
|
||||
\hexentry{udr.1}[f90]{4.0} &
|
||||
\plc{combiner} expression in \kcode{declare} \\
|
||||
\hexentry{udr.2}[f90]{4.0} &
|
||||
\kcode{reduction} directive changed to use \\
|
||||
\hexentry{udr.3}[f90]{4.0} & \kcode{combiner} clause \\
|
||||
\hexentry[f90]{udr.4}{4.0} & \\
|
||||
\hexentry[cpp]{udr.5}{4.0} & \\
|
||||
\hexentry[cpp]{udr.6}{4.0} & \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
Table~\ref{tab:Updated Examples 5.2} lists the updated examples for
|
||||
features deprecated in OpenMP 5.2
|
||||
in the Examples Document Version \examplesref{5.2}.
|
||||
The \emph{Earlier Version} column of the table lists the earlier version
|
||||
tags of the examples that can be found in
|
||||
the Examples Document Version \examplesref{5.1}.
|
||||
|
||||
\index{clauses!default@\kcode{default}}
|
||||
\index{clauses!otherwise@\kcode{otherwise}}
|
||||
\index{clauses!to@\kcode{to}}
|
||||
\index{clauses!enter@\kcode{enter}}
|
||||
\index{clauses!depend@\kcode{depend}}
|
||||
\index{clauses!doacross@\kcode{doacross}}
|
||||
\index{clauses!linear@\kcode{linear}}
|
||||
\index{clauses!destroy@\kcode{destroy}}
|
||||
\index{default clause@\kcode{default} clause}
|
||||
\index{otherwise clause@\kcode{otherwise} clause}
|
||||
\index{to clause@\kcode{to} clause}
|
||||
\index{enter clause@\kcode{enter} clause}
|
||||
\index{depend clause@\kcode{depend} clause}
|
||||
\index{doacross clause@\kcode{doacross} clause}
|
||||
\index{linear clause@\kcode{linear} clause}
|
||||
\index{destroy clause@\kcode{destroy} clause}
|
||||
\index{directives!begin declare target@\kcode{begin declare target}}
|
||||
\index{begin declare target directive@\kcode{begin declare target} directive}
|
||||
\index{allocate directive@\kcode{allocate} directive}
|
||||
\index{allocators directive@\kcode{allocators} directive}
|
||||
|
||||
\nolinenumbers
|
||||
\dpftable{5.2}
|
||||
\begin{supertabular}{p{1.7in} p{1.2in} p{2.1in}}
|
||||
\hexentry{error.1}[f90]{5.1} &
|
||||
\kcode{default} clause on metadirectives \\
|
||||
\hexentry{metadirective.1}[f90]{5.0} &
|
||||
replaced with \kcode{otherwise} clause \\
|
||||
\hexentry{metadirective.2}[f90]{5.0} & \\
|
||||
\hexentry{metadirective.3}[f90]{5.0} & \\
|
||||
\hexentry{metadirective.4}[f90]{5.1} & \\
|
||||
\hexentry{target_ptr_map.4}{5.1} & \\
|
||||
\hexentry{target_ptr_map.5}[f90]{5.1} & \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry[f90]{array_shaping.1}{5.0} &
|
||||
\kcode{to} clause on \kcode{declare target} \\
|
||||
\hexentry{target_reverse_offload.7}{5.0} &
|
||||
directive replaced with \kcode{enter} clause \\
|
||||
\hexentry{target_task_reduction.1}[f90]{5.1} & \\
|
||||
\hexentry{target_task_reduction.2a}[f90]{5.0} & \\
|
||||
\hexentry{target_task_reduction.2b}[f90]{5.1} &\\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry{array_shaping.1}{5.0} &
|
||||
delimited \kcode{declare target} \\
|
||||
\hexentry{async_target.1}{4.0} &
|
||||
directive replaced with \\
|
||||
\hexentry{async_target.2}{4.0} &
|
||||
\kcode{begin declare target} \\
|
||||
\hexentry{declare_target.1}{4.0} &
|
||||
directive for C/C++ \\
|
||||
\hexentry[cpp]{declare_target.2c}{4.0} & \\
|
||||
\hexentry{declare_target.3}{4.0} & \\
|
||||
\hexentry{declare_target.4}{4.0} & \\
|
||||
\hexentry{declare_target.5}{4.0} & \\
|
||||
\hexentry{declare_target.6}{4.0} & \\
|
||||
\hexentry{declare_variant.1}{5.0} & \\
|
||||
\hexentry{device.1}{4.0} & \\
|
||||
\hexentry{metadirective.3}{5.0} & \\
|
||||
\hexentry{target_ptr_map.2}{5.0} & \\
|
||||
\hexentry{target_ptr_map.3a}{5.0} & \\
|
||||
\hexentry{target_ptr_map.3b}{5.0} & \\
|
||||
\hexentry{target_struct_map.1}{5.0} & \\
|
||||
\hexentry[cpp]{target_struct_map.2}{5.0} & \\
|
||||
\hexentry{target_struct_map.3}{5.0} & \\
|
||||
\hexentry{target_struct_map.4}{5.0} & \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry{doacross.1}[f90]{4.5} &
|
||||
\kcode{depend} clause on \kcode{ordered} \\
|
||||
\hexentry{doacross.2}[f90]{4.5} &
|
||||
construct replaced with \kcode{doacross} \\
|
||||
\hexentry{doacross.3}[f90]{4.5} &
|
||||
clause \\
|
||||
\hexentry{doacross.4}[f90]{4.5} & \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry[cpp]{linear_modifier.1}[f90]{4.5} &
|
||||
modifier syntax change for \kcode{linear} \\
|
||||
\hexentry[cpp]{linear_modifier.2}[f90]{4.5} &
|
||||
clause on \kcode{declare simd} directive \\
|
||||
\hexentry{linear_modifier.3}[f90]{4.5} & \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry[f90]{allocators.1}{5.0} &
|
||||
\kcode{allocate} directive replaced with \kcode{allocators} directive
|
||||
for Fortran \bcode{ALLOCATE} statements \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry{depobj.1}[f90]{5.0} &
|
||||
argument added to \kcode{destroy} clause on \kcode{depobj}
|
||||
construct \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
\newpage
|
||||
Table~\ref{tab:Updated Examples 5.1} lists the updated examples for
|
||||
features deprecated in OpenMP 5.1
|
||||
in the Examples Document Version \examplesref{5.1}.
|
||||
The \emph{Earlier Version} column of the table lists the earlier version
|
||||
tags and prior names of the examples that can be found in
|
||||
the Examples Document Version \examplesref{5.0.1}.
|
||||
|
||||
\index{affinity!master policy@\kcode{master} policy}
|
||||
\index{affinity!primary policy@\kcode{primary} policy}
|
||||
\index{constructs!master@\kcode{master}}
|
||||
\index{constructs!masked@\kcode{masked}}
|
||||
\index{master construct@\kcode{master} construct}
|
||||
\index{masked construct@\kcode{masked} construct}
|
||||
|
||||
\nolinenumbers
|
||||
\dpftable{5.1}
|
||||
\begin{supertabular}{p{1.8in} p{1.4in} p{1.8in}}
|
||||
\hexentry{affinity.5}[f]{4.0} &
|
||||
\kcode{master} affinity policy replaced with \kcode{primary} policy \\[2pt]
|
||||
\hline\\[-2ex]
|
||||
\hexentry{async_target.3}[f90]{5.0} &
|
||||
\kcode{master} construct replaced \\
|
||||
\hexentry{cancellation.2}[f90]{4.0} &
|
||||
with \kcode{masked} construct \\
|
||||
\hexentry{copyprivate.2}[f]{3.0} & \\
|
||||
\hexentry[f]{fort_sa_private.5}{3.0} & \\
|
||||
\hexentry{lock_owner.1}[f]{3.0} & \\
|
||||
\hexmentry{masked.1}[f]{3.0}{master.1} & \\
|
||||
\hexmentry{parallel_masked_taskloop.1}[f90]{5.0}{parallel_master_taskloop.1} &\\
|
||||
\hexentry{reduction.6}[f]{3.0} & \\
|
||||
\hexentry{target_task_reduction.1}[f90]{5.0} & \\
|
||||
\hexentry{target_task_reduction.2b}[f90]{5.0} & \\
|
||||
\hexentry{taskloop_simd_reduction.1}[f90]{5.0} & \\
|
||||
\hexentry{task_detach.1}[f90]{5.0} & \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
Table~\ref{tab:Updated Examples 5.0} lists the updated examples for
|
||||
features deprecated in OpenMP 5.0
|
||||
in the Examples Document Version \examplesref{5.1}.
|
||||
The \emph{Earlier Version} column of the table lists the earlier version
|
||||
tags of the examples that can be found in
|
||||
the Examples Document Version \examplesref{5.0.1}.
|
||||
|
||||
\nolinenumbers
|
||||
\dpftable{5.0}
|
||||
\begin{supertabular}{p{1.6in} p{1.3in} p{2.1in}}
|
||||
\hexentry{critical.2}[f]{4.5} &
|
||||
\kcode{omp_lock_hint_*} constants \\
|
||||
\hexentry[cpp]{init_lock_with_hint.1}[f]{4.5} &
|
||||
replaced with \kcode{omp_sync_hint_*} constants \\[2pt]
|
||||
\end{supertabular}
|
||||
|
||||
\linenumbers
|
||||
|
@ -1,21 +0,0 @@
|
||||
\bchapter{Deprecated Features}
|
||||
\label{chap:deprecated_features}
|
||||
|
||||
Deprecation of features began in OpenMP 5.0.
|
||||
Examples that use a deprecated feature have been updated with an equivalent replacement feature.
|
||||
|
||||
Deprecations affecting examples are the following:
|
||||
\begin{description}[labelindent=5mm,font=\normalfont]
|
||||
\item[5.1] -- \ \scode{masked} construct replaces \scode{master} construct.
|
||||
\item[5.1] -- \ \scode{primary} affinity policy replaces \scode{master} affinity policy.
|
||||
\item[5.0] -- \ \scode{omp_sync_hint_*} constants replace \scode{omp_lock_hint_*} constants.
|
||||
\end{description}
|
||||
|
||||
These replacements appear in examples that illustrate, otherwise, earlier features.
|
||||
When using a compiler that is compliant with a version prior to
|
||||
the indicated version, the earlier form of
|
||||
an example is restored by a C-style conditional compilation using the \scode{_OPENMP} macro.
|
||||
|
||||
Since Fortran compilers do not preprocess codes by default, a Fortran preprocessor
|
||||
flag will be required to compile Fortran examples with the C-style conditional
|
||||
compilation statements.
|
@ -1,24 +0,0 @@
|
||||
\bchapter{Examples}
|
||||
\label{chap:examples}
|
||||
|
||||
The following are examples of the OpenMP API directives, constructs, and routines.
|
||||
\ccppspecificstart
|
||||
A statement following a directive is compound only when necessary, and a
|
||||
non-compound statement is indented with respect to a directive preceding it.
|
||||
\ccppspecificend
|
||||
|
||||
Each example is labeled as \plc{ename.seqno.ext}, where \plc{ename} is
|
||||
the example name, \plc{seqno} is the sequence number in a section, and
|
||||
\plc{ext} is the source file extension to indicate the code type and
|
||||
source form. \plc{ext} is one of the following:
|
||||
\begin{description}[noitemsep,labelindent=5mm,widest=f90]
|
||||
\item[\plc{c}] -- \ C code,
|
||||
\item[\plc{cpp}] -- \ C++ code,
|
||||
\item[\plc{f}] -- \ Fortran code in fixed form, and
|
||||
\item[\plc{f90}] -- \ Fortran code in free form.
|
||||
\end{description}
|
||||
|
||||
Some of the example labels may include version information
|
||||
(\code{\small{}omp\_\plc{verno}}) to indicate features that are illustrated
|
||||
by an example for a specific OpenMP version, such as ``\plc{scan.1.c}
|
||||
\;(\code{\small{}omp\_5.0}).''
|
@ -1,23 +1,48 @@
|
||||
\bchapter{Foreword}
|
||||
\chapter*{Foreword}
|
||||
\label{chap:foreword}
|
||||
|
||||
The OpenMP Examples document has been updated with new features
|
||||
found in the OpenMP 5.1 Specification. The additional examples and updates
|
||||
are referenced in the Document Revision History of the Appendix on page~\pageref{chap:history}.
|
||||
found in the OpenMP \SVER\ Specification.
|
||||
In order to provide users with new feature examples concurrently
|
||||
with the release of the OpenMP 6.0 Specification,
|
||||
the 6.0 Examples document is being released early
|
||||
with a caveat that some of the 6.0 features
|
||||
(such as \kcode{workdistribute} construct, \kcode{taskgraph} construct,
|
||||
\kcode{threadset} clause and free-agent threads) will be covered
|
||||
in the next release of the document.
|
||||
For a list of the new examples and updates in this release,
|
||||
please refer to the Document Revision History of the Appendix on page~\pageref{chap:history}.
|
||||
|
||||
Text describing an example with a 5.1 feature specifically states
|
||||
that the feature support begins in the OpenMP 5.1 Specification. Also,
|
||||
an \code{\small omp\_5.1} keyword is included in the metadata of the source code.
|
||||
These distinctions are presented to remind readers that a 5.1 compliant
|
||||
Text describing an example with a \SVER\ feature specifically states
|
||||
that the feature support begins in the OpenMP \SVER\ Specification. Also,
|
||||
an \kcode{\small{}omp_\SVER} keyword is included in the metadata of the source code.
|
||||
These distinctions are presented to remind readers that a \SVER\ compliant
|
||||
OpenMP implementation is necessary to use these features in codes.
|
||||
|
||||
Examples for most of the 5.1 features are included in this document,
|
||||
and incremental releases will become available as more feature examples
|
||||
and updates are submitted, and approved by the OpenMP Examples Subcommittee.
|
||||
%Examples for most of the \SVER\ features are included in this document,
|
||||
%and
|
||||
Incremental releases will become available as more feature examples
|
||||
and updates are submitted and approved by the OpenMP Examples Subcommittee.
|
||||
Examples are accepted for this document after discussions, revisions and reviews
|
||||
in the Examples Subcommittee, and two reviews/discussions and two votes
|
||||
in the OpenMP Language Committee.
|
||||
Draft examples are often derived from case studies for new features in the language,
|
||||
and are revised to illustrate the basic application of the features with code comments
|
||||
and a text description. We are grateful to the numerous members of the Language Committee
|
||||
who took the time to prepare codes and descriptions, and shepherd them through
|
||||
the acceptance process. We sincerely appreciate the Examples Subcommittee members, who
|
||||
actively participated and contributed in weekly meetings over the years.
|
||||
|
||||
\bigskip
|
||||
Examples Subcommittee Co-chairs: \smallskip\linebreak
|
||||
Examples Subcommittee Co-chairs: \smallskip\linebreak
|
||||
Henry Jin (\textsc{NASA} Ames Research Center) \linebreak
|
||||
Kent Milfeld (\textsc{TACC}, Texas Advanced Computing Center)
|
||||
Swaroop Pophale (Oak Ridge National Laboratory)
|
||||
|
||||
\bigskip
|
||||
\bigskip
|
||||
Past Examples Subcommittee Co-chairs:
|
||||
\begin{itemize}
|
||||
\item Kent Milfeld (2014 - 2022)
|
||||
\end{itemize}
|
||||
|
||||
|
||||
|
394
History.tex
394
History.tex
@ -1,6 +1,270 @@
|
||||
\cchapter{Document Revision History}{history}
|
||||
\label{chap:history}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.2.2 to 6.0}
|
||||
\label{sec:history_522_to_60}
|
||||
|
||||
\begin{itemize}
|
||||
\item General changes:
|
||||
\begin{itemize}
|
||||
\item Added a set of structured LaTeX environments for specifying
|
||||
language-dependent text. This allows extracting language-specific
|
||||
content of the Examples document. Refer to the content of
|
||||
\examplesblob{v6.0/Contributions.md} for details.
|
||||
\end{itemize}
|
||||
|
||||
\item Added the following examples for the 6.0 features:
|
||||
\begin{itemize}
|
||||
\item \kcode{omp::decl} attribute for declarative directives in C/C++
|
||||
(\specref{sec:attributes})
|
||||
\item \kcode{transparent} clause on the \kcode{task} construct to enable dependences
|
||||
between non-sibling tasks (\specref{subsec:depend_trans_task})
|
||||
\item Task dependences for \kcode{taskloop} construct
|
||||
(\specref{sec:taskloop_depend})
|
||||
\item \kcode{num_threads} clause that appears inside \kcode{target} region
|
||||
(\specref{subsec:target_teams_num_teams})
|
||||
\item \kcode{nowait} clause with argument on the \kcode{target} construct to control deferment
|
||||
of target task (\specref{subsec:async_target_nowait_arg})
|
||||
\item Traits for specifying devices (\specref{sec:device_env_traits})
|
||||
\item \kcode{apply} clause with modifier argument to
|
||||
support selective loop transformations
|
||||
(\specref{sec:apply_clause})
|
||||
\item Reduction on private variables in a \kcode{parallel} region
|
||||
(\specref{subsec:priv_reduction})
|
||||
\item \kcode{induction} clause (\specref{subsec:induction})
|
||||
and user-defined induction (\specref{subsec:user-defined-induction})
|
||||
\item \kcode{init_complete} clause for \kcode{scan} directive to
|
||||
support initialization phase in scan operation
|
||||
(\specref{sec:scan})
|
||||
\item \kcode{assume} construct with \kcode{no_openmp} and \kcode{no_parallelism} clauses (\specref{sec:assumption})
|
||||
\item \kcode{num_threads} clause with a list
|
||||
(\specref{subsec:icv_nthreads})
|
||||
\item \kcode{dispatch} construct to control variant substitution
|
||||
for a procedure call (\specref{sec:dispatch})
|
||||
\end{itemize}
|
||||
|
||||
\item Other changes:
|
||||
\begin{itemize}
|
||||
\item Changed attribute specifier as a directive form from C++ only to C/C++
|
||||
(\specref{chap:directive_syntax})
|
||||
\item Added missing \bcode{include <omp.h>} in Example \example{atomic.4.c}
|
||||
and \bcode{use omp_lib} in Example \example{atomic.4.f90}
|
||||
(\specref{sec:atomic_hint})
|
||||
\item Fixed the function declaration order for variant functions in
|
||||
Examples \example{selector_scoring.[12].c} and Fortran pointer
|
||||
initialization in Example \example{selector_scoring.2.f90}
|
||||
(\specref{subsec:context_selector_scoring})
|
||||
\item Replaced the deprecated use of \plc{combiner-exp}
|
||||
in \kcode{declare reduction} directive with \kcode{combiner} clause
|
||||
(\specref{subsec:UDR} and \specref{sec:Updated Examples})
|
||||
\item Fixed the initialization of Fortran pointers
|
||||
in Example \example{cancellation.2.f90} and changed to
|
||||
use \kcode{atomic write} for performing atomic writes
|
||||
(\specref{sec:cancellation})
|
||||
\item Added missing \kcode{declare target} directive for external procedure
|
||||
called inside \kcode{target} region in Example
|
||||
\example{requires.1.f90} (\specref{sec:requires})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.2.1 to 5.2.2}
|
||||
\label{sec:history_521_to_522}
|
||||
|
||||
\begin{itemize}
|
||||
\item To improve the style of the document, a set of macros was introduced
|
||||
and consistently used for language keywords, names, concepts, and user codes
|
||||
in the text description of the document. Refer to the content of
|
||||
\examplesblob{v5.2.2/Contributions.md}
|
||||
for details.
|
||||
|
||||
\item Added the following examples:
|
||||
\begin{itemize}
|
||||
\item Orphaned and nested \kcode{loop} constructs (\specref{sec:loop})
|
||||
\item \kcode{all} variable category for the \kcode{defaultmap} clause
|
||||
(\specref{sec:defaultmap})
|
||||
\item \kcode{target update} construct using a custom mapper
|
||||
(\specref{subsec:target_update_mapper})
|
||||
\item \kcode{indirect} clause for indirect procedure calls in a
|
||||
\kcode{target} region (\specref{subsec:indirect})
|
||||
\item \kcode{omp_target_memcpy_async} routine with depend object
|
||||
(\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item Synchronization hint for atomic operation (\specref{sec:atomic_hint})
|
||||
\item Implication of passing shared variable to a procedure
|
||||
in Fortran (\specref{sec:fort_shared_var})
|
||||
\item Assumption directives for providing additional information
|
||||
about program properties (\specref{sec:assumption})
|
||||
\item Mapping behavior of scalars, pointers, references (C++) and associate names
|
||||
(Fortran) when unified shared memory is required
|
||||
(\specref{sec:requires})
|
||||
\item \kcode{begin declare variant} paired with \kcode{end declare variant}
|
||||
example to show use of nested declare variant
|
||||
directives (\specref{subsec:declare_variant})
|
||||
\item Explicit scoring in context selectors
|
||||
(\specref{subsec:context_selector_scoring})
|
||||
\end{itemize}
|
||||
|
||||
\item Miscellaneous changes:
|
||||
\begin{itemize}
|
||||
\item Included a general statement in Introduction about the number of
|
||||
threads used throughout the examples document (\specref{sec:examples})
|
||||
\item Clarified the mapping of virtual functions in \kcode{target} regions
|
||||
(\specref{sec:virtual_functions})
|
||||
\item Added missing \kcode{declare target} directive for procedures
|
||||
called inside \kcode{target} region in \example{Examples}
|
||||
\example{declare_mapper.1.f90} (\specref{sec:declare_mapper}),
|
||||
\example{target_reduction.*.f90} (\specref{subsec:target_reduction}),
|
||||
and \example{target_task_reduction.*.f90}
|
||||
(\specref{subsec:target_task_reduction})
|
||||
\item Added missing \kcode{end target} directive in
|
||||
\example{Example declare_mapper.3.f90}
|
||||
(\specref{sec:declare_mapper})
|
||||
\item Removed example for \kcode{flush} without a list from Synchronization
|
||||
since the example is confusing and the use of \kcode{flush} is already
|
||||
covered in other examples
|
||||
(\specref{chap:synchronization})
|
||||
\item \docref{declare variant Directive} and \docref{Metadirective} sections were moved to
|
||||
subsections in the new \docref{Context-based Variant Selection} section,
|
||||
with a section introduction on context selectors
|
||||
(\specref{sec:context_based_variants})
|
||||
\item Fixed a typo (`\kcode{for}' $\rightarrow$ `\kcode{do}') in
|
||||
\example{Example metadirective.4.f90}
|
||||
(\specref{subsec:metadirective})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.2 to 5.2.1}
|
||||
\label{sec:history_52_to_521}
|
||||
|
||||
\begin{itemize}
|
||||
\item General changes:
|
||||
\begin{itemize}
|
||||
\item Updated source metadata tags for all examples to use an improved form
|
||||
(see \examplesblob{v5.2.1/Contributions.md})
|
||||
\item Explicitly included the version tag \verlabel[pre\_]{3.0} in those
|
||||
examples that did not contain a version tag previously
|
||||
\end{itemize}
|
||||
|
||||
\item Added the following examples for the 5.2 features:
|
||||
\begin{itemize}
|
||||
\item \kcode{uses_allocators} clause for the use of allocators in
|
||||
\kcode{target} regions (\specref{sec:allocators})
|
||||
\end{itemize}
|
||||
\item Added the following examples for the 5.1 features:
|
||||
\begin{itemize}
|
||||
\item The \kcode{inoutset} dependence type (\specref{subsec:task_concurrent_depend})
|
||||
\item Atomic compare and capture (\specref{sec:cas})
|
||||
\end{itemize}
|
||||
\item Added the following examples for the 5.0 features:
|
||||
\begin{itemize}
|
||||
\item \kcode{declare target} directive with \kcode{device_type(nohost)}
|
||||
clause (\specref{subsec:declare_target_device_type})
|
||||
\item \kcode{omp_pause_resource} and \kcode{omp_pause_resource_all}
|
||||
routines (\specref{sec:pause_resource})
|
||||
\end{itemize}
|
||||
|
||||
\item Miscellaneous fixes:
|
||||
\begin{itemize}
|
||||
\item Cast to implementation-defined enum type \kcode{omp_event_handle_t}
|
||||
now uses \bcode{uintptr_t} (not \bcode{void *}) in
|
||||
\example{Example task_detach.2.c}
|
||||
(\specref{sec:task_detachment})
|
||||
\item Moved Fortran \kcode{requires} directive into program main (\ucode{rev_off}),
|
||||
the program unit, in \example{Example target_reverse_offload.7.f90}
|
||||
(\specref{subsec:target_reverse_offload})
|
||||
\item Fixed an inconsistent use of mapper in \example{Example target_mapper.3.f90}
|
||||
(\specref{sec:declare_mapper})
|
||||
\item Added a missing semicolon at end of \ucode{XOR1} class definition in
|
||||
\example{Example declare_target.2a.cpp}
|
||||
(\specref{subsec:declare_target_class})
|
||||
\item Fixed the placement of \kcode{declare simd} directive in
|
||||
\example{Examples linear_modifier.*.f90} (\specref{sec:linear_modifier})
|
||||
and added a general statement about where a Fortran declarative
|
||||
directive can appear (\specref{chap:directive_syntax})
|
||||
\item Fixed mismatched argument list in \example{Example fort_sa_private.5.f}
|
||||
(\specref{sec:fort_sa_private})
|
||||
\item Moved the placement of \kcode{declare target enter}
|
||||
directive after function declaration
|
||||
(\specref{subsec:target_task_reduction})
|
||||
\item Fixed an incorrect use of \kcode{omp_in_parallel} routine in
|
||||
\example{Example metadirective.4}
|
||||
(\specref{subsec:metadirective})
|
||||
\item Fixed an incorrect value for \kcode{at} clause
|
||||
(\specref{subsec:error})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.1 to 5.2}
|
||||
\label{sec:history_51_to_52}
|
||||
|
||||
\begin{itemize}
|
||||
\item General changes:
|
||||
\begin{itemize}
|
||||
\item Included a description of the semantics for OpenMP directive syntax
|
||||
(see \specref{chap:directive_syntax})
|
||||
\item Reorganized the Introduction Chapter and moved the Feature
|
||||
Deprecation Chapter to Appendix~\ref{chap:deprecated_features}
|
||||
\item Included a list of examples that were updated for feature deprecation
|
||||
and replacement in each version (see Appendix~\ref{sec:Updated Examples})
|
||||
\item Added Index entries
|
||||
\end{itemize}
|
||||
|
||||
\item Updated the examples for feature deprecation and replacement in OpenMP 5.2.
|
||||
See Table~\ref{tab:Deprecated Features} and
|
||||
Table~\ref{tab:Updated Examples 5.2} for details.
|
||||
|
||||
\item Added the following examples for the 5.2 features:
|
||||
\begin{itemize}
|
||||
\item Mapping class objects with virtual functions
|
||||
(\specref{sec:virtual_functions})
|
||||
\item \kcode{allocators} construct for Fortran \bcode{allocate} statement
|
||||
(\specref{sec:allocators})
|
||||
\item Behavior of reallocation of variables through OpenMP allocator in
|
||||
Fortran (\specref{sec:allocators})
|
||||
\end{itemize}
|
||||
|
||||
\item Added the following examples for the 5.1 features:
|
||||
\begin{itemize}
|
||||
\item Clarification of optional \kcode{end} directive for strictly structured
|
||||
block in Fortran (\specref{sec:fortran_free_format_comments})
|
||||
\item \kcode{filter} clause on \kcode{masked} construct (\specref{sec:masked})
|
||||
\item \kcode{omp_all_memory} reserved locator for specifying task dependences
|
||||
(\specref{subsec:depend_undefer_task})
|
||||
\item Behavior of Fortran allocatable variables in \kcode{target} regions
|
||||
(\specref{sec:fort_allocatable_array_mapping})
|
||||
\item Device memory routines in Fortran
|
||||
(\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item Partial tiles from \kcode{tile} construct
|
||||
(\specref{sec:incomplete_tiles})
|
||||
\item Fortran associate names and selectors in \kcode{target} region
|
||||
(\specref{sec:associate_target})
|
||||
\item \kcode{allocate} directive for variable declarations and
|
||||
\kcode{allocate} clause on \kcode{task} constructs
|
||||
(\specref{sec:allocators})
|
||||
\item Controlling concurrency and reproducibility with \kcode{order} clause
|
||||
(\specref{sec:reproducible_modifier})
|
||||
\end{itemize}
|
||||
|
||||
\item Added other examples:
|
||||
\begin{itemize}
|
||||
\item Using lambda expressions with \kcode{target} constructs
|
||||
(\specref{sec:lambda_expressions})
|
||||
\item Target memory and device pointer routines
|
||||
(\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item Examples to illustrate the ordering properties of
|
||||
the \plc{flush} operation (\specref{sec:mem_model})
|
||||
\item User selector in the \kcode{metadirective} directive
|
||||
(\specref{subsec:metadirective})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
%=====================================
|
||||
\section{Changes from 5.0.1 to 5.1}
|
||||
\label{sec:history_501_to_51}
|
||||
@ -8,11 +272,11 @@
|
||||
\begin{itemize}
|
||||
\item General changes:
|
||||
\begin{itemize}
|
||||
\item Replaced \code{master} construct example with equivalent \code{masked} construct example (\specref{sec:masked})
|
||||
\item Replaced \kcode{master} construct example with equivalent \kcode{masked} construct example (\specref{sec:masked})
|
||||
\item Primary thread is now used to describe thread number 0 in the current team
|
||||
\item \code{primary} thread affinity policy is now used to specify that every
|
||||
\item \kcode{primary} thread affinity policy is now used to specify that every
|
||||
thread in the team is assigned to the same place as the primary thread (\specref{subsec:affinity_primary})
|
||||
\item The \scode{omp_lock_hint_*} constants have been renamed \scode{omp_sync_hint_*} (\specref{sec:critical}, \specref{sec:locks})
|
||||
\item The \kcode{omp_lock_hint_*} constants have been renamed \kcode{omp_sync_hint_*} (\specref{sec:critical}, \specref{sec:locks})
|
||||
\end{itemize}
|
||||
|
||||
\item Added the following new chapters:
|
||||
@ -27,34 +291,34 @@
|
||||
\begin{itemize}
|
||||
\item OpenMP directives in C++ \plc{attribute} specifiers
|
||||
(\specref{sec:attributes})
|
||||
\item Directive syntax adjustment to allow Fortran \code{BLOCK} ...
|
||||
\code{END}~\code{BLOCK} as a structured block
|
||||
\item Directive syntax adjustment to allow Fortran \bcode{BLOCK} ...
|
||||
\bcode{END BLOCK} as a structured block
|
||||
(\specref{sec:fortran_free_format_comments})
|
||||
\item \code{omp\_target\_is\_accessible} API routine
|
||||
\item \kcode{omp_target_is_accessible} API routine
|
||||
(\specref{sec:pointer_mapping})
|
||||
\item Fortran allocatable array mapping in \code{target} regions (\specref{sec:fort_allocatable_array_mapping})
|
||||
\item \code{begin}~\code{declare}~\code{target} (with
|
||||
\code{end}~\code{declare}~\code{target}) directive
|
||||
\item Fortran allocatable array mapping in \kcode{target} regions (\specref{sec:fort_allocatable_array_mapping})
|
||||
\item \kcode{begin declare target} (with
|
||||
\kcode{end declare target}) directive
|
||||
(\specref{subsec:declare_target_class})
|
||||
\item \code{tile} construct (\specref{sec:tile})
|
||||
\item \code{unroll} construct (\specref{sec:unroll})
|
||||
\item Reduction with the \code{scope} construct
|
||||
\item \kcode{tile} construct (\specref{sec:tile})
|
||||
\item \kcode{unroll} construct (\specref{sec:unroll})
|
||||
\item Reduction with the \kcode{scope} construct
|
||||
(\specref{subsec:reduction_scope})
|
||||
\item \code{metadirective} directive with dynamic \code{condition} selector
|
||||
(\specref{sec:metadirective})
|
||||
\item \code{interop} construct (\specref{sec:interop})
|
||||
\item Environment display with the \scode{omp_display_env} routine
|
||||
\item \kcode{metadirective} directive with dynamic \kcode{condition} selector
|
||||
(\specref{subsec:metadirective})
|
||||
\item \kcode{interop} construct (\specref{sec:interop})
|
||||
\item Environment display with the \kcode{omp_display_env} routine
|
||||
(\specref{subsec:display_env})
|
||||
\item \code{error} directive (\specref{subsec:error})
|
||||
\item \kcode{error} directive (\specref{subsec:error})
|
||||
\end{itemize}
|
||||
|
||||
\item Included additional examples for the 5.0 features:
|
||||
\begin{itemize}
|
||||
\item \code{collapse} clause for non-rectangular loop nest
|
||||
\item \kcode{collapse} clause for non-rectangular loop nest
|
||||
(\specref{sec:collapse})
|
||||
\item \code{detach} clause for tasks (\specref{sec:task_detachment})
|
||||
\item \kcode{detach} clause for tasks (\specref{sec:task_detachment})
|
||||
\item Pointer attachment for a structure member (\specref{sec:structure_mapping})
|
||||
\item Host and device pointer association with the \scode{omp_target_associate_ptr} routine (\specref{sec:target_associate_ptr})
|
||||
\item Host and device pointer association with the \kcode{omp_target_associate_ptr} routine (\specref{sec:target_associate_ptr})
|
||||
|
||||
\item Sample code on activating the tool interface
|
||||
(\specref{sec:ompt_start})
|
||||
@ -62,7 +326,7 @@
|
||||
|
||||
\item Added other examples:
|
||||
\begin{itemize}
|
||||
\item The \scode{omp_get_wtime} routine (\specref{subsec:get_wtime})
|
||||
\item The \kcode{omp_get_wtime} routine (\specref{subsec:get_wtime})
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
|
||||
@ -72,22 +336,22 @@
|
||||
\label{sec:history_50_to_501}
|
||||
|
||||
\begin{itemize}
|
||||
\item Added version tags (\code{\small{}omp\_}\plc{x.y}) in example labels
|
||||
\item Added version tags \verlabel{\plc{x.y}} in example labels
|
||||
and the corresponding source codes for all examples that feature
|
||||
OpenMP 3.0 and later.
|
||||
|
||||
\item Included additional examples for the 5.0 features:
|
||||
|
||||
\begin{itemize}
|
||||
\item Extension to the \code{defaultmap} clause
|
||||
\item Extension to the \kcode{defaultmap} clause
|
||||
(\specref{sec:defaultmap})
|
||||
\item Transferring noncontiguous data with the \code{target}~\code{update} directive in Fortran (\specref{sec:array-shaping})
|
||||
\item \code{conditional} modifier for the \code{lastprivate} clause (\specref{sec:lastprivate})
|
||||
\item \code{task} modifier for the \code{reduction} clause (\specref{subsec:task_reduction})
|
||||
\item Transferring noncontiguous data with the \kcode{target update} directive in Fortran (\specref{sec:array-shaping})
|
||||
\item \kcode{conditional} modifier for the \kcode{lastprivate} clause (\specref{sec:lastprivate})
|
||||
\item \kcode{task} modifier for the \kcode{reduction} clause (\specref{subsec:task_reduction})
|
||||
\item Reduction on combined target constructs (\specref{subsec:target_reduction})
|
||||
\item Task reduction with \code{target} constructs
|
||||
\item Task reduction with \kcode{target} constructs
|
||||
(\specref{subsec:target_task_reduction})
|
||||
\item \code{scan} directive for returning the \emph{prefix sum} of a reduction (\specref{sec:scan})
|
||||
\item \kcode{scan} directive for returning the \emph{prefix sum} of a reduction (\specref{sec:scan})
|
||||
|
||||
\end{itemize}
|
||||
|
||||
@ -96,7 +360,7 @@ OpenMP 3.0 and later.
|
||||
\begin{itemize}
|
||||
\item Dependence for undeferred tasks
|
||||
(\specref{subsec:depend_undefer_task})
|
||||
\item \code{ref}, \code{val}, \code{uval} modifiers for \code{linear} clause (\specref{sec:linear_modifier})
|
||||
\item \kcode{ref}, \kcode{val}, \kcode{uval} modifiers for \kcode{linear} clause (\specref{sec:linear_modifier})
|
||||
|
||||
\end{itemize}
|
||||
|
||||
@ -115,37 +379,39 @@ in \specref{sec:mem_model}.
|
||||
\item Added the following examples for the 5.0 features:
|
||||
|
||||
\begin{itemize}
|
||||
\item Extended \code{teams} construct for host execution (\specref{sec:host_teams})
|
||||
\item \code{loop} and \code{teams}~\code{loop} constructs specify loop iterations that can execute concurrently
|
||||
\item Extended \kcode{teams} construct for host execution (\specref{sec:host_teams})
|
||||
\item \kcode{loop} and \kcode{teams loop} constructs specify loop iterations that can execute concurrently
|
||||
(\specref{sec:loop})
|
||||
\item Task data affinity is indicated by \code{affinity} clause of \code{task} construct
|
||||
\item Task data affinity is indicated by \kcode{affinity} clause of \kcode{task} construct
|
||||
(\specref{sec: task_affinity})
|
||||
\item Display thread affinity with \code{OMP\_DISPLAY\_AFFINITY} environment variable or \code{omp\_display\_affinity()} API routine
|
||||
\item Display thread affinity with \kcode{OMP_DISPLAY_AFFINITY} environment variable or \kcode{omp_display_affinity()} API routine
|
||||
(\specref{sec:affinity_display})
|
||||
\item \code{taskwait} with dependences (\specref{subsec:taskwait_depend})
|
||||
\item \code{mutexinoutset} task dependences (\specref{subsec:task_dep_mutexinoutset})
|
||||
\item Multidependence Iterators (in \code{depend} clauses) (\specref{subsec:depend_iterator})
|
||||
\item Combined constructs: \code{parallel}~\code{master}~\code{taskloop} and \code{parallel}~\code{master}~\code{taskloop}~\code{simd}
|
||||
\item \kcode{taskwait} with dependences (\specref{subsec:taskwait_depend})
|
||||
\item \kcode{mutexinoutset} task dependences (\specref{subsec:task_dep_mutexinoutset})
|
||||
\item Multidependence Iterators (in \kcode{depend} clauses) (\specref{subsec:depend_iterator})
|
||||
\item Combined constructs: \kcode{parallel master taskloop} and \kcode{parallel master taskloop simd}
|
||||
(\specref{sec:parallel_masked_taskloop})
|
||||
\item Reverse Offload through \plc{ancestor} modifier of \code{device} clause. (\specref{subsec:target_reverse_offload})
|
||||
\item Reverse Offload through \kcode{ancestor} modifier of \kcode{device} clause (\specref{subsec:target_reverse_offload})
|
||||
\item Pointer Mapping - behavior of mapped pointers (\specref{sec:pointer_mapping}) %Example_target_ptr_map*
|
||||
\item Structure Mapping - behavior of mapped structures (\specref{sec:structure_mapping}) %Examples_target_structure_mapping.tex target_struct_map*
|
||||
\item Array Shaping with the \plc{shape-operator} (\specref{sec:array-shaping})
|
||||
\item The \code{declare}~\code{mapper} directive (\specref{sec:declare_mapper})
|
||||
\item The \kcode{declare mapper} directive (\specref{sec:declare_mapper})
|
||||
\item Acquire and Release Semantics Synchronization: Memory ordering
|
||||
clauses \code{acquire}, \code{release}, and \code{acq\_rel} were added
|
||||
clauses \kcode{acquire}, \kcode{release}, and \kcode{acq_rel} were added
|
||||
to flush and atomic constructs
|
||||
(\specref{sec:acquire_and_release_semantics})
|
||||
\item \code{depobj} construct provides dependence objects for subsequent use in \code{depend} clauses
|
||||
\item \kcode{depobj} construct provides dependence objects for subsequent use in \kcode{depend} clauses
|
||||
(\specref{sec:depobj})
|
||||
\item \code{reduction} clause for \code{task} construct (\specref{subsec:task_reduction})
|
||||
\item \code{reduction} clause for \code{taskloop} construct (\specref{subsec:taskloop_reduction})
|
||||
\item \code{reduction} clause for \code{taskloop}~\code{simd} construct (\specref{subsec:taskloop_reduction})
|
||||
\item \kcode{reduction} clause for \kcode{task} construct (\specref{subsec:task_reduction})
|
||||
\item \kcode{reduction} clause for \kcode{taskloop} construct (\specref{subsec:taskloop_reduction})
|
||||
\item \kcode{reduction} clause for \kcode{taskloop simd} construct (\specref{subsec:taskloop_reduction})
|
||||
\item Memory Allocators for making OpenMP memory requests with traits (\specref{sec:allocators})
|
||||
\item \code{requires} directive specifies required features of implementation (\specref{sec:requires})
|
||||
\item \code{declare}~\code{variant} directive - for function variants (\specref{sec:declare_variant})
|
||||
\item \code{metadirective} directive - for directive variants (\specref{sec:metadirective})
|
||||
\item \code{OMP\_TARGET\_OFFLOAD} Environment Variable - controls offload behavior (\specref{sec:target_offload})
|
||||
\item \kcode{requires} directive specifies required features of implementation (\specref{sec:requires})
|
||||
\item \kcode{declare variant} directive - for function variants
|
||||
(\specref{subsec:declare_variant})
|
||||
\item \kcode{metadirective} directive - for directive variants
|
||||
(\specref{subsec:metadirective})
|
||||
\item \kcode{OMP_TARGET_OFFLOAD} Environment Variable - controls offload behavior (\specref{sec:target_offload})
|
||||
\end{itemize}
|
||||
|
||||
\item Included the following additional examples for the 4.x features:
|
||||
@ -162,22 +428,22 @@ in \specref{sec:mem_model}.
|
||||
\begin{itemize}
|
||||
\item Reorganized into chapters of major topics
|
||||
\item Included file extensions in example labels to indicate source type
|
||||
\item Applied the explicit \code{map(tofrom)} for scalar variables
|
||||
\item Applied the explicit \kcode{map(tofrom)} for scalar variables
|
||||
in a number of examples to comply with
|
||||
the change of the default behavior for scalar variables from
|
||||
\code{map(tofrom)} to \code{firstprivate} in the 4.5 specification
|
||||
\kcode{map(tofrom)} to \kcode{firstprivate} in the 4.5 specification
|
||||
\item Added the following new examples:
|
||||
|
||||
\begin{itemize}
|
||||
\item \code{linear} clause in loop constructs (\specref{sec:linear_in_loop})
|
||||
\item \code{priority} clause for \code{task} construct (\specref{sec:task_priority})
|
||||
\item \code{taskloop} construct (\specref{sec:taskloop})
|
||||
\item \plc{directive-name} modifier in multiple \code{if} clauses on
|
||||
\item \kcode{linear} clause in loop constructs (\specref{sec:linear_in_loop})
|
||||
\item \kcode{priority} clause for \kcode{task} construct (\specref{sec:task_priority})
|
||||
\item \kcode{taskloop} construct (\specref{sec:taskloop})
|
||||
\item \plc{directive-name} modifier in multiple \kcode{if} clauses on
|
||||
a combined construct (\specref{subsec:target_if})
|
||||
\item unstructured data mapping (\specref{sec:target_enter_exit_data})
|
||||
\item \code{link} clause for \code{declare}~\code{target} directive
|
||||
\item \kcode{link} clause for \kcode{declare target} directive
|
||||
(\specref{subsec:declare_target_link})
|
||||
\item asynchronous target execution with \code{nowait} clause (\specref{sec:async_target_exec_depend})
|
||||
\item asynchronous target execution with \kcode{nowait} clause (\specref{sec:async_target_exec_depend})
|
||||
\item device memory routines and device pointers (\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item doacross loop nest (\specref{sec:doacross})
|
||||
\item locks with hints (\specref{sec:locks})
|
||||
@ -200,8 +466,8 @@ a combined construct (\specref{subsec:target_if})
|
||||
|
||||
Added the following new examples:
|
||||
\begin{itemize}
|
||||
\item the \code{proc\_bind} clause (\specref{sec:affinity})
|
||||
\item the \code{taskgroup} construct (\specref{sec:taskgroup})
|
||||
\item the \kcode{proc_bind} clause (\specref{sec:affinity})
|
||||
\item the \kcode{taskgroup} construct (\specref{sec:taskgroup})
|
||||
\end{itemize}
|
||||
|
||||
\section{Changes from 3.1 to 4.0}
|
||||
@ -213,13 +479,13 @@ Added the following new examples:
|
||||
|
||||
\begin{itemize}
|
||||
\item task dependences (\specref{sec:task_depend})
|
||||
\item \code{target} construct (\specref{sec:target})
|
||||
\item \kcode{target} construct (\specref{sec:target})
|
||||
\item array sections in device constructs (\specref{sec:array_sections})
|
||||
\item \code{target}~\code{data} construct (\specref{sec:target_data})
|
||||
\item \code{target}~\code{update} construct (\specref{sec:target_update})
|
||||
\item \code{declare}~\code{target} directive (\specref{sec:declare_target})
|
||||
\item \code{teams} constructs (\specref{sec:teams})
|
||||
\item asynchronous execution of a \code{target} region using tasks (\specref{subsec:async_target_with_tasks})
|
||||
\item \kcode{target data} construct (\specref{sec:target_data})
|
||||
\item \kcode{target update} construct (\specref{sec:target_update})
|
||||
\item \kcode{declare target} directive (\specref{sec:declare_target})
|
||||
\item \kcode{teams} constructs (\specref{sec:teams})
|
||||
\item asynchronous execution of a \kcode{target} region using tasks (\specref{subsec:async_target_with_tasks})
|
||||
\item device runtime routines (\specref{sec:device})
|
||||
\item Fortran ASSOCIATE construct (\specref{sec:associate})
|
||||
\item cancellation constructs (\specref{sec:cancellation})
|
||||
|
66
Makefile
66
Makefile
@ -1,17 +1,29 @@
|
||||
# Makefile for the OpenMP Examples document in LaTeX format.
|
||||
# For more information, see the main document, openmp-examples.tex.
|
||||
SHELL=bash
|
||||
|
||||
include versioninfo
|
||||
|
||||
version=5.1
|
||||
default: openmp-examples.pdf
|
||||
diff: openmp-diff-abridged.pdf
|
||||
diff: clean openmp-diff-abridged.pdf
|
||||
|
||||
release: VERSIONSTR="$(version_date)"
|
||||
release: clean openmp-examples.pdf
|
||||
|
||||
book: BOOK_BUILD="\\def\\bookbuild{1}"
|
||||
book: clean release
|
||||
mv openmp-examples-${version}.pdf openmp-examples-${version}-book.pdf
|
||||
|
||||
ccpp-only: LANG_OPT="\\ccpptrue\\fortranfalse"
|
||||
ccpp-only: clean release
|
||||
|
||||
fortran-only: LANG_OPT="\\ccppfalse\\fortrantrue"
|
||||
fortran-only: clean release
|
||||
|
||||
CHAPTERS=Title_Page.tex \
|
||||
Foreword_Chapt.tex \
|
||||
Introduction_Chapt.tex \
|
||||
Examples_Chapt.tex \
|
||||
Deprecated_Features_Chapt.tex \
|
||||
Chap_*.tex \
|
||||
Deprecated_Features.tex \
|
||||
History.tex \
|
||||
*/*.tex
|
||||
|
||||
@ -22,6 +34,8 @@ SOURCES=*/sources/*.c \
|
||||
|
||||
INTERMEDIATE_FILES=openmp-examples.pdf \
|
||||
openmp-examples.toc \
|
||||
openmp-examples.lof \
|
||||
openmp-examples.lot \
|
||||
openmp-examples.idx \
|
||||
openmp-examples.aux \
|
||||
openmp-examples.ilg \
|
||||
@ -29,20 +43,37 @@ INTERMEDIATE_FILES=openmp-examples.pdf \
|
||||
openmp-examples.out \
|
||||
openmp-examples.log
|
||||
|
||||
LATEXCMD=pdflatex -interaction=batchmode -file-line-error
|
||||
LATEXDCMD=$(LATEXCMD) -draftmode
|
||||
|
||||
# check for branch names with "name_XXX"
|
||||
DIFF_TICKET_ID=$(shell git rev-parse --abbrev-ref HEAD)
|
||||
GITREV=$(shell git rev-parse --short HEAD || echo "??")
|
||||
VERSIONSTR="GIT rev $(GITREV)"
|
||||
LANG_OPT="\\ccpptrue\\fortrantrue"
|
||||
|
||||
openmp-examples.pdf: $(CHAPTERS) $(SOURCES) openmp.sty openmp-examples.tex openmp-logo.png
|
||||
openmp-examples.pdf: $(CHAPTERS) $(SOURCES) openmp.sty openmp-examples.tex openmp-logo.png generated-include.tex
|
||||
rm -f $(INTERMEDIATE_FILES)
|
||||
pdflatex -interaction=batchmode -file-line-error openmp-examples.tex
|
||||
pdflatex -interaction=batchmode -file-line-error openmp-examples.tex
|
||||
pdflatex -interaction=batchmode -file-line-error openmp-examples.tex
|
||||
touch generated-include.tex
|
||||
$(LATEXDCMD) openmp-examples.tex
|
||||
makeindex -s openmp-index.ist openmp-examples.idx
|
||||
$(LATEXDCMD) openmp-examples.tex
|
||||
$(LATEXCMD) openmp-examples.tex
|
||||
cp openmp-examples.pdf openmp-examples-${version}.pdf
|
||||
|
||||
check:
|
||||
sources/check_tags
|
||||
|
||||
clean:
|
||||
rm -f $(INTERMEDIATE_FILES)
|
||||
rm -f generated-include.tex
|
||||
rm -f openmp-diff-full.pdf openmp-diff-abridged.pdf
|
||||
rm -rf *.tmpdir
|
||||
cd util; make clean
|
||||
rm -f chk_tags.log sources/*.log
|
||||
|
||||
realclean: clean
|
||||
rm -f openmp-examples-${version}.pdf openmp-examples-${version}-book.pdf
|
||||
|
||||
ifdef DIFF_TO
|
||||
VC_DIFF_TO := -r ${DIFF_TO}
|
||||
@ -52,21 +83,30 @@ endif
|
||||
ifdef DIFF_FROM
|
||||
VC_DIFF_FROM := -r ${DIFF_FROM}
|
||||
else
|
||||
VC_DIFF_FROM := -r work_5.1
|
||||
VC_DIFF_FROM := -r work_6.0
|
||||
endif
|
||||
|
||||
DIFF_TO:=HEAD
|
||||
DIFF_FROM:=work_5.1
|
||||
DIFF_FROM:=work_6.0
|
||||
DIFF_TYPE:=UNDERLINE
|
||||
|
||||
COMMON_DIFF_OPTS:=--math-markup=whole \
|
||||
--append-safecmd=plc,code,hcode,scode,pcode,splc \
|
||||
--append-safecmd=plc,code,kcode,scode,ucode,vcode,splc,bcode,pvar,pout,example \
|
||||
--append-textcmd=subsubsubsection
|
||||
|
||||
VC_DIFF_OPTS:=${COMMON_DIFF_OPTS} --force -c latexdiff.cfg --flatten --type="${DIFF_TYPE}" --git --pdf ${VC_DIFF_FROM} ${VC_DIFF_TO} --subtype=ZLABEL --graphics-markup=none
|
||||
|
||||
VC_DIFF_MINIMAL_OPTS:= --only-changes --force
|
||||
|
||||
generated-include.tex:
|
||||
echo "$(BOOK_BUILD)" > $@
|
||||
echo "\\def\\VER{${version}}" >> $@
|
||||
echo "\\def\\SVER{${version_spec}}" >> $@
|
||||
echo "\\def\\VERDATE{${VERSIONSTR}}" >> $@
|
||||
@echo "\\newif\\ifccpp\\newif\\iffortran" >> $@
|
||||
echo "$(LANG_OPT)" >> $@
|
||||
util/list_tags -vtag */sources/* >> $@
|
||||
|
||||
%.tmpdir: $(wildcard *.sty) $(wildcard *.png) $(wildcard *.aux) openmp-examples.pdf
|
||||
mkdir -p $@/sources
|
||||
for i in affinity devices loop_transformations parallel_execution SIMD tasking \
|
||||
@ -88,3 +128,5 @@ openmp-diff-minimal.pdf: diffs-slow-minimal.tmpdir
|
||||
env PATH="$(shell pwd)/util/latexdiff:$(PATH)" latexdiff-vc ${VC_DIFF_MINIMAL_OPTS} -d $< ${VC_DIFF_OPTS} openmp-examples.tex
|
||||
cp $</openmp-examples.pdf $@
|
||||
if [ "x$(DIFF_TICKET_ID)" != "x" ]; then cp $@ ${@:.pdf=-$(DIFF_TICKET_ID).pdf}; fi
|
||||
|
||||
.PHONY: diff default book clean realclean
|
||||
|
@ -1,8 +1,10 @@
|
||||
%\pagebreak
|
||||
\section{\code{simd} and \code{declare} \code{simd} Directives}
|
||||
\section{\kcode{simd} and \kcode{declare simd} Directives}
|
||||
\label{sec:SIMD}
|
||||
|
||||
The following example illustrates the basic use of the \code{simd} construct
|
||||
\index{constructs!simd@\kcode{simd}}
|
||||
\index{simd construct@\kcode{simd} construct}
|
||||
The following example illustrates the basic use of the \kcode{simd} construct
|
||||
to assure the compiler that the loop can be vectorized.
|
||||
|
||||
\cexample[4.0]{SIMD}{1}
|
||||
@ -10,32 +12,38 @@ to assure the compiler that the loop can be vectorized.
|
||||
\ffreeexample[4.0]{SIMD}{1}
|
||||
|
||||
|
||||
\index{directives!declare simd@\kcode{declare simd}}
|
||||
\index{declare simd directive@\kcode{declare simd} directive}
|
||||
\index{clauses!uniform@\kcode{uniform}}
|
||||
\index{uniform clause@\kcode{uniform} clause}
|
||||
\index{clauses!linear@\kcode{linear}}
|
||||
\index{linear clause@\kcode{linear} clause}
|
||||
When a function can be inlined within a loop the compiler has an opportunity to
|
||||
vectorize the loop. By guaranteeing SIMD behavior of a function's operations,
|
||||
characterizing the arguments of the function and privatizing temporary
|
||||
variables of the loop, the compiler can often create faster, vector code for
|
||||
the loop. In the examples below the \code{declare} \code{simd} directive is
|
||||
used on the \plc{add1} and \plc{add2} functions to enable creation of their
|
||||
the loop. In the examples below the \kcode{declare simd} directive is
|
||||
used on the \ucode{add1} and \ucode{add2} functions to enable creation of their
|
||||
corresponding SIMD function versions for execution within the associated SIMD
|
||||
loop. The functions characterize two different approaches of accessing data
|
||||
within the function: by a single variable and as an element in a data array,
|
||||
respectively. The \plc{add3} C function uses dereferencing.
|
||||
respectively. The \ucode{add3} C function uses dereferencing.
|
||||
|
||||
The \code{declare} \code{simd} directives also illustrate the use of
|
||||
\code{uniform} and \code{linear} clauses. The \code{uniform(fact)} clause
|
||||
indicates that the variable \plc{fact} is invariant across the SIMD lanes. In
|
||||
the \plc{add2} function \plc{a} and \plc{b} are included in the \code{uniform}
|
||||
The \kcode{declare simd} directives also illustrate the use of
|
||||
\kcode{uniform} and \kcode{linear} clauses. The \kcode{uniform(\ucode{fact})} clause
|
||||
indicates that the variable \ucode{fact} is invariant across the SIMD lanes. In
|
||||
the \ucode{add2} function \ucode{a} and \ucode{b} are included in the \kcode{uniform}
|
||||
list because the C pointer and the Fortran array references are constant. The
|
||||
\plc{i} index used in the \plc{add2} function is included in a \code{linear}
|
||||
\ucode{i} index used in the \ucode{add2} function is included in a \kcode{linear}
|
||||
clause with a constant-linear-step of 1, to guarantee a unity increment of the
|
||||
associated loop. In the \code{declare} \code{simd} directive for the \plc{add3}
|
||||
C function the \code{linear(a,b:1)} clause instructs the compiler to generate
|
||||
associated loop. In the \kcode{declare simd} directive for the \ucode{add3}
|
||||
C function the \kcode{linear(\ucode{a,b:1})} clause instructs the compiler to generate
|
||||
unit-stride loads across the SIMD lanes; otherwise, costly \emph{gather}
|
||||
instructions would be generated for the unknown sequence of access of the
|
||||
pointer dereferences.
|
||||
|
||||
In the \code{simd} constructs for the loops the \code{private(tmp)} clause is
|
||||
necessary to assure that the each vector operation has its own \plc{tmp}
|
||||
In the \kcode{simd} constructs for the loops the \kcode{private(\ucode{tmp})} clause is
|
||||
necessary to assure that each vector operation has its own \ucode{tmp}
|
||||
variable.
|
||||
|
||||
\cexample[4.0]{SIMD}{2}
|
||||
@ -43,11 +51,16 @@ variable.
|
||||
\ffreeexample[4.0]{SIMD}{2}
|
||||
|
||||
%\pagebreak
|
||||
\index{clauses!private@\kcode{private}}
|
||||
\index{private clause@\kcode{private} clause}
|
||||
\index{clauses!reduction@\kcode{reduction}}
|
||||
\index{reduction clause@\kcode{reduction} clause}
|
||||
\index{reductions!reduction clause@\kcode{reduction} clause}
|
||||
A thread that encounters a SIMD construct executes vectorized code for the
|
||||
iterations. Similar to the concerns of a worksharing loop, a loop vectorized
|
||||
with a SIMD construct must assure that temporary and reduction variables are
|
||||
privatized and declared as reductions with clauses. The example below
|
||||
illustrates the use of \code{private} and \code{reduction} clauses in a SIMD
|
||||
illustrates the use of \kcode{private} and \kcode{reduction} clauses in a SIMD
|
||||
construct.
|
||||
|
||||
\cexample[4.0]{SIMD}{3}
|
||||
@ -56,14 +69,16 @@ construct.
|
||||
|
||||
|
||||
%\pagebreak
|
||||
A \code{safelen(N)} clause in a \code{simd} construct assures the compiler that
|
||||
there are no loop-carried dependencies for vectors of size \plc{N} or below. If
|
||||
the \code{safelen} clause is not specified, then the default safelen value is
|
||||
\index{clauses!safelen@\kcode{safelen}}
|
||||
\index{safelen clause@\kcode{safelen} clause}
|
||||
A \kcode{safelen(\ucode{N})} clause in a \kcode{simd} construct assures the compiler that
|
||||
there are no loop-carried dependences for vectors of size \ucode{N} or below. If
|
||||
the \kcode{safelen} clause is not specified, then the default safelen value is
|
||||
the number of loop iterations.
|
||||
|
||||
The \code{safelen(16)} clause in the example below guarantees that the vector
|
||||
code is safe for vectors up to and including size 16. In the loop, \plc{m} can
|
||||
be 16 or greater, for correct code execution. If the value of \plc{m} is less
|
||||
The \kcode{safelen(\ucode{16})} clause in the example below guarantees that the vector
|
||||
code is safe for vectors up to and including size 16. In the loop, \ucode{m} can
|
||||
be 16 or greater, for correct code execution. If the value of \ucode{m} is less
|
||||
than 16, the behavior is undefined.
|
||||
|
||||
\cexample[4.0]{SIMD}{4}
|
||||
@ -71,8 +86,10 @@ than 16, the behavior is undefined.
|
||||
\ffreeexample[4.0]{SIMD}{4}
|
||||
|
||||
%\pagebreak
|
||||
The following SIMD construct instructs the compiler to collapse the \plc{i} and
|
||||
\plc{j} loops into a single SIMD loop in which SIMD chunks are executed by
|
||||
\index{clauses!collapse@\kcode{collapse}}
|
||||
\index{collapse clause@\kcode{collapse} clause}
|
||||
The following SIMD construct instructs the compiler to collapse the \ucode{i} and
|
||||
\ucode{j} loops into a single SIMD loop in which SIMD chunks are executed by
|
||||
threads of the team. Within the workshared loop chunks of a thread, the SIMD
|
||||
chunks are executed in the lanes of the vector units.
|
||||
|
||||
@ -82,27 +99,31 @@ chunks are executed in the lanes of the vector units.
|
||||
|
||||
|
||||
%%% section
|
||||
\section{\code{inbranch} and \code{notinbranch} Clauses}
|
||||
\section{\kcode{inbranch} and \kcode{notinbranch} Clauses}
|
||||
\label{sec:SIMD_branch}
|
||||
\index{clauses!inbranch@\kcode{inbranch}}
|
||||
\index{inbranch clause@\kcode{inbranch} clause}
|
||||
\index{clauses!notinbranch@\kcode{notinbranch}}
|
||||
\index{notinbranch clause@\kcode{notinbranch} clause}
|
||||
|
||||
The following examples illustrate the use of the \code{declare} \code{simd}
|
||||
directive with the \code{inbranch} and \code{notinbranch} clauses. The
|
||||
\code{notinbranch} clause informs the compiler that the function \plc{foo} is
|
||||
never called conditionally in the SIMD loop of the function \plc{myaddint}. On
|
||||
the other hand, the \code{inbranch} clause for the function goo indicates that
|
||||
The following examples illustrate the use of the \kcode{declare simd}
|
||||
directive with the \kcode{inbranch} and \kcode{notinbranch} clauses. The
|
||||
\kcode{notinbranch} clause informs the compiler that the function \ucode{foo} is
|
||||
never called conditionally in the SIMD loop of the function \ucode{myaddint}. On
|
||||
the other hand, the \kcode{inbranch} clause for the function \ucode{goo} indicates that
|
||||
the function is always called conditionally in the SIMD loop inside
|
||||
the function \plc{myaddfloat}.
|
||||
the function \ucode{myaddfloat}.
|
||||
|
||||
\cexample[4.0]{SIMD}{6}
|
||||
|
||||
\ffreeexample[4.0]{SIMD}{6}
|
||||
|
||||
|
||||
In the code below, the function \plc{fib()} is called in the main program and
|
||||
also recursively called in the function \plc{fib()} within an \code{if}
|
||||
In the code below, the function \ucode{fib()} is called in the main program and
|
||||
also recursively called in the function \ucode{fib()} within an \bcode{if}
|
||||
condition. The compiler creates a masked vector version and a non-masked vector
|
||||
version for the function \plc{fib()} while retaining the original scalar
|
||||
version of the \plc{fib()} function.
|
||||
version for the function \ucode{fib()} while retaining the original scalar
|
||||
version of the \ucode{fib()} function.
|
||||
|
||||
\cexample[4.0]{SIMD}{7}
|
||||
|
||||
@ -111,16 +132,17 @@ version of the \plc{fib()} function.
|
||||
|
||||
|
||||
%%% section
|
||||
\pagebreak
|
||||
%\pagebreak
|
||||
\section{Loop-Carried Lexical Forward Dependence}
|
||||
\label{sec:SIMD_forward_dep}
|
||||
\index{dependences!loop-carried lexical forward}
|
||||
|
||||
|
||||
The following example tests the restriction on a SIMD loop with the loop-carried lexical forward-dependence. This dependence must be preserved for the correct execution of SIMD loops.
|
||||
|
||||
A loop can be vectorized even though the iterations are not completely independent when it has loop-carried dependences that are forward lexical dependences, indicated in the code below by the read of \plc{A[j+1]} and the write to \plc{A[j]} in C/C++ code (or \plc{A(j+1)} and \plc{A(j)} in Fortran). That is, the read of \plc{A[j+1]} (or \plc{A(j+1)} in Fortran) before the write to \plc{A[j]} (or \plc{A(j)} in Fortran) ordering must be preserved for each iteration in \plc{j} for valid SIMD code generation.
|
||||
A loop can be vectorized even though the iterations are not completely independent when it has loop-carried dependences that are forward lexical dependences, indicated in the code below by the read of \ucode{A[j+1]} and the write to \ucode{A[j]} in C/C++ code (or \ucode{A(j+1)} and \ucode{A(j)} in Fortran). That is, the read of \ucode{A[j+1]} (or \ucode{A(j+1)} in Fortran) before the write to \ucode{A[j]} (or \ucode{A(j)} in Fortran) ordering must be preserved for each iteration in \ucode{j} for valid SIMD code generation.
|
||||
|
||||
This test assures that the compiler preserves the loop carried lexical forward-dependence for generating a correct SIMD code.
|
||||
This test assures that the compiler preserves the loop-carried lexical forward-dependence in order to generate correct SIMD code.
|
||||
|
||||
\cexample[4.0]{SIMD}{8}
|
||||
|
||||
|
@ -1,76 +1,83 @@
|
||||
%%% section
|
||||
\section{\code{ref}, \code{val}, \code{uval} Modifiers for \code{linear} Clause}
|
||||
\section{\kcode{ref}, \kcode{val}, \kcode{uval} Modifiers for \kcode{linear} Clause}
|
||||
\label{sec:linear_modifier}
|
||||
\index{modifiers, linear@modifiers, \kcode{linear}!ref@\kcode{ref}}
|
||||
\index{modifiers, linear@modifiers, \kcode{linear}!val@\kcode{val}}
|
||||
\index{modifiers, linear@modifiers, \kcode{linear}!uval@\kcode{uval}}
|
||||
\index{clauses!linear@\kcode{linear}}
|
||||
\index{linear clause@\kcode{linear} clause}
|
||||
|
||||
When generating vector functions from \code{declare}~\code{simd} directives, it is important for a compiler to know the proper types of function arguments in
|
||||
When generating vector functions from \kcode{declare simd} directives,
|
||||
it is important for a compiler to know the proper types of function arguments in
|
||||
order to generate efficient code.
|
||||
This is especially true for C++ reference types and Fortran arguments.
|
||||
|
||||
In the following example, the function \plc{add\_one2} has a C++ reference
|
||||
parameter (or Fortran argument) \plc{p}. Variable \plc{p} gets incremented by 1 in the function.
|
||||
The caller loop \plc{i} in the main program passes
|
||||
a variable \plc{k} as a reference to the function \plc{add\_one2} call.
|
||||
The \code{ref} modifier for the \code{linear} clause on the
|
||||
\code{declare}~\code{simd} directive is used to annotate the
|
||||
reference-type parameter \plc{p} to match the property of the variable
|
||||
\plc{k} in the loop.
|
||||
In the following example, the function \ucode{add_one2} has a C++ reference
|
||||
parameter (or Fortran argument) \ucode{p}. Variable \ucode{p} gets incremented by 1 in the function.
|
||||
The caller loop \ucode{i} in the main program passes
|
||||
a variable \ucode{k} as a reference to the function \ucode{add_one2} call.
|
||||
The \kcode{ref} modifier for the \kcode{linear} clause on the
|
||||
\kcode{declare simd} directive specifies that the
|
||||
reference-type parameter \ucode{p} is to match the property of the variable
|
||||
\ucode{k} in the loop.
|
||||
This use of reference type is equivalent to the second call to
|
||||
\plc{add\_one2} with a direct passing of the array element \plc{a[i]}.
|
||||
\ucode{add_one2} with a direct passing of the array element \ucode{a[i]}.
|
||||
In the example, the preferred vector
|
||||
length 8 is specified for both the caller loop and the callee function.
|
||||
|
||||
When \code{linear(ref(p))} is applied to an argument passed by reference,
|
||||
When \kcode{linear(\ucode{p}: ref)} is applied to an argument passed by reference,
|
||||
it tells the compiler that the addresses in its vector argument are consecutive,
|
||||
and so the compiler can generate a single vector load or store instead of
|
||||
a gather or scatter. This allows more efficient SIMD code to be generated with
|
||||
fewer source changes.
|
||||
|
||||
\cppexample[4.5]{linear_modifier}{1}
|
||||
\ffreeexample[4.5]{linear_modifier}{1}
|
||||
\clearpage
|
||||
\cppexample[5.2]{linear_modifier}{1}
|
||||
\ffreeexample[5.2]{linear_modifier}{1}
|
||||
%\clearpage
|
||||
|
||||
|
||||
The following example is a variant of the above example. The function \plc{add\_one2} in the C++ code includes an additional C++ reference parameter \plc{i}.
|
||||
The loop index \plc{i} of the caller loop \plc{i} in the main program
|
||||
is passed as a reference to the function \plc{add\_one2} call.
|
||||
The loop index \plc{i} has a uniform address with
|
||||
The following example is a variant of the above example. The function \ucode{add_one2}
|
||||
in the C++ code includes an additional C++ reference parameter \ucode{i}.
|
||||
The loop index \ucode{i} of the caller loop \ucode{i} in the main program
|
||||
is passed as a reference to the function \ucode{add_one2} call.
|
||||
The loop index \ucode{i} has a uniform address with
|
||||
linear value of step 1 across SIMD lanes.
|
||||
Thus, the \code{uval} modifier is used for the \code{linear} clause
|
||||
to annotate the C++ reference-type parameter \plc{i} to match
|
||||
the property of loop index \plc{i}.
|
||||
Thus, the \kcode{uval} modifier is used for the \kcode{linear} clause
|
||||
to specify that the C++ reference-type parameter \ucode{i} is to match
|
||||
the property of loop index \ucode{i}.
|
||||
|
||||
In the correponding Fortran code the arguments \plc{p} and
|
||||
\plc{i} in the routine \plc{add\_on2} are passed by references.
|
||||
Similar modifiers are used for these variables in the \code{linear} clauses
|
||||
In the corresponding Fortran code the arguments \ucode{p} and
|
||||
\ucode{i} in the routine \ucode{add_one2} are passed by reference.
|
||||
Similar modifiers are used for these variables in the \kcode{linear} clauses
|
||||
to match with the property at the caller loop in the main program.
|
||||
|
||||
When \code{linear(uval(i))} is applied to an argument passed by reference, it
|
||||
When \kcode{linear(\ucode{i}: uval)} is applied to an argument passed by reference, it
|
||||
tells the compiler that its addresses in the vector argument are uniform
|
||||
so that the compiler can generate a scalar load or scalar store and create
|
||||
linear values. This allows more efficient SIMD code to be generated with
|
||||
fewer source changes.
|
||||
|
||||
\cppexample[4.5]{linear_modifier}{2}
|
||||
\ffreeexample[4.5]{linear_modifier}{2}
|
||||
\cppexample[5.2]{linear_modifier}{2}
|
||||
\ffreeexample[5.2]{linear_modifier}{2}
|
||||
|
||||
In the following example, the function \plc{func} takes arrays \plc{x} and \plc{y} as arguments, and accesses the array elements referenced by
|
||||
the index \plc{i}.
|
||||
The caller loop \plc{i} in the main program passes a linear copy of
|
||||
the variable \plc{k} to the function \plc{func}.
|
||||
The \code{val} modifier is used for the \code{linear} clause
|
||||
in the \code{declare}~\code{simd} directive for the function
|
||||
\plc{func} to annotate argument \plc{i} to match the property of
|
||||
the actual argument \plc{k} passed in the SIMD loop.
|
||||
Arrays \plc{x} and \plc{y} have uniform addresses across SIMD lanes.
|
||||
In the following example, the function \ucode{func} takes arrays \ucode{x} and \ucode{y}
|
||||
as arguments, and accesses the array elements referenced by the index \ucode{i}.
|
||||
The caller loop \ucode{i} in the main program passes a linear copy of
|
||||
the variable \ucode{k} to the function \ucode{func}.
|
||||
The \kcode{val} modifier is used for the \kcode{linear} clause
|
||||
in the \kcode{declare simd} directive for the function
|
||||
\ucode{func} to specify that the argument \ucode{i} is to match the property of
|
||||
the actual argument \ucode{k} passed in the SIMD loop.
|
||||
Arrays \ucode{x} and \ucode{y} have uniform addresses across SIMD lanes.
|
||||
|
||||
When \code{linear(val(i):1)} is applied to an argument,
|
||||
When \kcode{linear(\ucode{i}: val,step(\ucode{1}))} is applied to an argument,
|
||||
it tells the compiler that its addresses in the vector argument may not be
|
||||
consecutive, however, their values are linear (with stride 1 here). When the value of \plc{i} is used
|
||||
in subscript of array references (e.g., \plc{x[i]}), the compiler can generate
|
||||
consecutive; however, their values are linear (with stride 1 here). When the value of \ucode{i} is used
|
||||
in the subscript of array references (e.g., \ucode{x[i]}), the compiler can generate
|
||||
a vector load or store instead of a gather or scatter. This allows more
|
||||
efficient SIMD code to be generated with fewer source changes.
|
||||
|
||||
\cexample[4.5]{linear_modifier}{3}
|
||||
\ffreeexample[4.5]{linear_modifier}{3}
|
||||
\cexample[5.2]{linear_modifier}{3}
|
||||
\ffreeexample[5.2]{linear_modifier}{3}
|
||||
|
||||
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: SIMD.1c
|
||||
* @@name: SIMD.1
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: SIMD.1f
|
||||
! @@name: SIMD.1
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine star(a,b,c,n,ioff_ptr)
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: SIMD.2c
|
||||
* @@name: SIMD.2
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: link
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: SIMD.2f
|
||||
! @@name: SIMD.2
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: link
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
program main
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: SIMD.3c
|
||||
* @@name: SIMD.3
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: SIMD.3f
|
||||
! @@name: SIMD.3
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine work( a, b, n, sum )
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: SIMD.4c
|
||||
* @@name: SIMD.4
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: SIMD.4f
|
||||
! @@name: SIMD.4
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine work( b, n, m )
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: SIMD.5c
|
||||
* @@name: SIMD.5
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: SIMD.5f
|
||||
! @@name: SIMD.5
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine work( a, b, c, n )
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: SIMD.6c
|
||||
* @@name: SIMD.6
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: SIMD.6f
|
||||
! @@name: SIMD.6
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
function foo(p) result(r)
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: SIMD.7c
|
||||
* @@name: SIMD.7
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
@ -33,6 +32,6 @@ int main(void)
|
||||
for (i=0; i < N; i++) {
|
||||
a[i] = fib(b[i]);
|
||||
}
|
||||
printf("Done a[%d] = %d\n", N-1, a[N-1]);
|
||||
printf("Done a[%d] = %d\n", N-1, a[N-1]); //Done a[44] = 701408733
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: SIMD.7f
|
||||
! @@name: SIMD.7
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
program fibonacci
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: SIMD.8c
|
||||
* @@name: SIMD.8
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: SIMD.8f
|
||||
! @@name: SIMD.8
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
module work
|
||||
|
@ -1,17 +1,16 @@
|
||||
/*
|
||||
* @@name: linear_modifier.1cpp
|
||||
* @@name: linear_modifier.1
|
||||
* @@type: C++
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_4.5
|
||||
* @@version: omp_5.2
|
||||
*/
|
||||
#include <stdio.h>
|
||||
|
||||
#define NN 1023
|
||||
int a[NN];
|
||||
|
||||
#pragma omp declare simd linear(ref(p)) simdlen(8)
|
||||
#pragma omp declare simd linear(p: ref) simdlen(8)
|
||||
void add_one2(int& p)
|
||||
{
|
||||
p += 1;
|
||||
|
@ -1,17 +1,17 @@
|
||||
! @@name: linear_modifier.1.f90
|
||||
! @@name: linear_modifier.1
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_4.5
|
||||
! @@version: omp_5.2
|
||||
module m
|
||||
integer, parameter :: NN = 1023
|
||||
integer :: a(NN)
|
||||
|
||||
contains
|
||||
subroutine add_one2(p)
|
||||
!$omp declare simd(add_one2) linear(ref(p)) simdlen(8)
|
||||
implicit none
|
||||
!$omp declare simd(add_one2) linear(p: ref) simdlen(8)
|
||||
|
||||
integer :: p
|
||||
|
||||
p = p + 1
|
||||
@ -45,4 +45,3 @@ program main
|
||||
end do
|
||||
print *, "passed"
|
||||
end program
|
||||
|
||||
|
@ -1,17 +1,16 @@
|
||||
/*
|
||||
* @@name: linear_modifier.2cpp
|
||||
* @@name: linear_modifier.2
|
||||
* @@type: C++
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_4.5
|
||||
* @@version: omp_5.2
|
||||
*/
|
||||
#include <stdio.h>
|
||||
|
||||
#define NN 1023
|
||||
int a[NN];
|
||||
|
||||
#pragma omp declare simd linear(ref(p)) linear(uval(i))
|
||||
#pragma omp declare simd linear(p: ref) linear(i: uval)
|
||||
void add_one2(int& p, const int& i)
|
||||
{
|
||||
p += i;
|
||||
|
@ -1,17 +1,17 @@
|
||||
! @@name: linear_modifier.2f90
|
||||
! @@name: linear_modifier.2
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_4.5
|
||||
! @@version: omp_5.2
|
||||
module m
|
||||
integer, parameter :: NN = 1023
|
||||
integer :: a(NN)
|
||||
|
||||
contains
|
||||
subroutine add_one2(p, i)
|
||||
!$omp declare simd(add_one2) linear(ref(p)) linear(uval(i))
|
||||
implicit none
|
||||
!$omp declare simd(add_one2) linear(p: ref) linear(i: uval)
|
||||
|
||||
integer :: p
|
||||
integer, intent(in) :: i
|
||||
|
||||
|
@ -1,16 +1,15 @@
|
||||
/*
|
||||
* @@name: linear_modifier.3c
|
||||
* @@name: linear_modifier.3
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_4.5
|
||||
* @@version: omp_5.2
|
||||
*/
|
||||
#include <stdio.h>
|
||||
|
||||
#define N 128
|
||||
|
||||
#pragma omp declare simd simdlen(4) uniform(x, y) linear(val(i):1)
|
||||
#pragma omp declare simd simdlen(4) uniform(x, y) linear(i:val,step(1))
|
||||
double func(double x[], double y[], int i)
|
||||
{
|
||||
return (x[i] + y[i]);
|
||||
|
@ -1,14 +1,14 @@
|
||||
! @@name: linear_modifier.3f
|
||||
! @@name: linear_modifier.3
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_4.5
|
||||
! @@version: omp_5.2
|
||||
module func_mod
|
||||
contains
|
||||
real(8) function func(x, y, i)
|
||||
!$omp declare simd(func) simdlen(4) uniform(x, y) linear(val(i):1)
|
||||
implicit none
|
||||
implicit none
|
||||
!$omp declare simd(func) simdlen(4) uniform(x, y) linear(i:val,step(1))
|
||||
|
||||
real(8), intent(in) :: x(*), y(*)
|
||||
integer, intent(in) :: i
|
||||
|
||||
|
46
STYLE_GUIDE.md
Normal file
46
STYLE_GUIDE.md
Normal file
@ -0,0 +1,46 @@
|
||||
### OpenMP Examples Coding Style Guide
|
||||
|
||||
Must Dos:
|
||||
- Indents and Braces
|
||||
- Code: Follow common base language practices.
|
||||
- Where indents are normally used, use 2 spaces instead of tabs.
|
||||
- Comments: Follow the indent of the base language for which the comment applies.
|
||||
- OpenMP directives should be indented as if they were base language code where possible.
|
||||
- Braces `{}` around structured blocks following directives must be on a new line and must follow base language indent.
|
||||
- For C/C++ examples, for code blocks with multiple lines, the if-else statements must follow the following format:
|
||||
```
|
||||
if {
|
||||
} else {
|
||||
}
|
||||
```
|
||||
- All section and subsection headings must be in title case. For example: "This is a Useful Example of X Directive".
|
||||
|
||||
- Comments
|
||||
- Comments go on a new line before the relevant code/code block.
|
||||
- Expected results may go on the same line.
|
||||
- Keep comments terse; detailed explanations go in the text.
|
||||
|
||||
- Output
|
||||
- If there is a deterministic output, provide it.
|
||||
- It can be done in one of the following ways:
|
||||
- Specify the correct value in a comment.
|
||||
- Code prints out "expected" and "run" values.
|
||||
- Test for the correctness of a value in a conditional.
|
||||
- If the test is expected to execute, return values must be used to indicate success or failure.
|
||||
- For tests that produce incorrect results, use:
|
||||
- `return(1)` for C/C++
|
||||
- `stop 1` for Fortran (do not exit)
|
||||
- For tests that need to discontinue execution, use:
|
||||
- `exit(1)` for C/C++
|
||||
- `error stop` for Fortran
|
||||
- Validation messages such as "Pass" / "Fail" are not mandatory.
|
||||
- A single "pass" or "fail" is sufficient for a multi-case test.
|
||||
|
||||
- To Verify Metadata:
|
||||
- Running "make check" at the top level of the repository scans all sources for version tags and ensures that no line is longer than 75 characters.
|
||||
- Inside `utils`, there is `chk_tags` (see different options) that can accept 1 file and scan for all specified values.
|
||||
|
||||
Don’ts:
|
||||
- Do not use fixed-format Fortran for new examples unless the feature requires it; use free-format Fortran instead.
|
||||
- Do not use all-caps for emphasis in the document.
|
||||
|
@ -12,22 +12,22 @@
|
||||
\textsf{OpenMP\\Application Programming\\Interface}
|
||||
|
||||
% An optional subtitle can go here:
|
||||
\vspace{0.5in}\textsf{Examples}\vspace{-0.7in}
|
||||
\vspace{0.5in}\textsf{\langselect Examples}\vspace{-0.7in}
|
||||
\normalsize
|
||||
|
||||
\vspace{1.0in}
|
||||
|
||||
\textbf{Version \PVER{} -- \VERDATE}
|
||||
\textbf{Version \VER{} -- \VERDATE}
|
||||
\end{center}
|
||||
\end{adjustwidth}
|
||||
|
||||
\vspace{2.3in} %was 3.0
|
||||
|
||||
Source codes for OpenMP \PVER{} Examples can be downloaded from
|
||||
\href{https://github.com/OpenMP/Examples/tree/v\VER}{github}.\\
|
||||
Source codes for OpenMP Examples \VER{} are available at
|
||||
\examplestree{\VER}{github (\examplesrepo/tree/v\VER)}.\\
|
||||
|
||||
\begin{adjustwidth}{0pt}{1em}\setlength{\parskip}{0.25\baselineskip}%
|
||||
Copyright \copyright{} 1997-2021 OpenMP Architecture Review Board.\\
|
||||
Copyright \copyright{} 1997-2024 OpenMP Architecture Review Board.\\
|
||||
Permission to copy without fee all or part of this material is granted,
|
||||
provided the OpenMP Architecture Review Board copyright notice and
|
||||
the title of this document appear. Notice is given that copying is by
|
||||
@ -37,7 +37,7 @@ permission of OpenMP Architecture Review Board.\end{adjustwidth}
|
||||
|
||||
% Blank page
|
||||
|
||||
\cleardoublepage
|
||||
%\cleardoublepage
|
||||
|
||||
%For final version, uncomment the line above, comment out the lines below
|
||||
%This working version enacted the following tickets: 287, 519, 550, 593,
|
||||
|
@ -1,35 +1,44 @@
|
||||
\pagebreak
|
||||
\section{\code{proc\_bind} Clause}
|
||||
\section{\kcode{proc_bind} Clause}
|
||||
\label{sec:affinity}
|
||||
\index{affinity!proc_bind clause@\kcode{proc_bind} clause}
|
||||
\index{clauses!proc_bind@\kcode{proc_bind}}
|
||||
\index{proc_bind clause@\kcode{proc_bind} clause}
|
||||
|
||||
The following examples demonstrate how to use the \code{proc\_bind} clause to
|
||||
control the thread binding for a team of threads in a \code{parallel} region.
|
||||
The machine architecture is depicted in the figure below. It consists of two sockets,
|
||||
The following examples demonstrate how to use the \kcode{proc_bind} clause to
|
||||
control the thread binding for a team of threads in a \kcode{parallel} region.
|
||||
The machine architecture is depicted in Figure~\ref{fig:mach_arch}. It consists of two sockets,
|
||||
each equipped with a quad-core processor and configured to execute two hardware
|
||||
threads simultaneously on each core. These examples assume a contiguous core numbering
|
||||
starting from 0, such that the hardware threads 0,1 form the first physical core.
|
||||
|
||||
\ifpdf
|
||||
%\begin{figure}[htbp]
|
||||
\centerline{\includegraphics[width=3.8in,keepaspectratio=true]%
|
||||
\begin{figure}[htb]
|
||||
\centerline{\includegraphics[width=3.0in,keepaspectratio=true]%
|
||||
{figs/proc_bind_fig.pdf}}
|
||||
%\end{figure}
|
||||
\caption{A machine architecture with two quad-core processors}
|
||||
\label{fig:mach_arch}
|
||||
\end{figure}
|
||||
\fi
|
||||
|
||||
The following equivalent place list declarations consist of eight places (which
|
||||
we designate as p0 to p7):
|
||||
|
||||
\code{OMP\_PLACES=\texttt{"}\{0,1\},\{2,3\},\{4,5\},\{6,7\},\{8,9\},\{10,11\},\{12,13\},\{14,15\}\texttt{"}}
|
||||
|
||||
\begin{boxeducode}
|
||||
\kcode{export OMP_PLACES=}"{0,1},{2,3},{4,5},{6,7},{8,9},{10,11},{12,13},
|
||||
{14,15}"
|
||||
\end{boxeducode}
|
||||
or
|
||||
|
||||
\code{OMP\_PLACES=\texttt{"}\{0:2\}:8:2\texttt{"}}
|
||||
\begin{boxeducode}
|
||||
\kcode{export OMP_PLACES=}"{0:2}:8:2"
|
||||
\end{boxeducode}
|
||||
|
||||
\subsection{Spread Affinity Policy}
|
||||
\label{subsec:affinity_spread}
|
||||
\index{affinity!spread policy@\kcode{spread} policy}
|
||||
\index{spread policy@\kcode{spread} policy}
|
||||
|
||||
|
||||
The following example shows the result of the \code{spread} affinity policy on
|
||||
The following example shows the result of the \kcode{spread} affinity policy on
|
||||
the partition list when the number of threads is less than or equal to the number
|
||||
of places in the parent's place partition, for the machine architecture depicted
|
||||
above. Note that the threads are bound to the first place of each subpartition.
|
||||
@ -66,13 +75,13 @@ and distribution of the place partition would be as follows:
|
||||
\item thread 3 executes on p0 with the place partition p0,p1
|
||||
\end{compactitem}
|
||||
|
||||
The following example illustrates the \code{spread} thread affinity policy when
|
||||
The following example illustrates the \kcode{spread} thread affinity policy when
|
||||
the number of threads is greater than the number of places in the parent's place
|
||||
partition.
|
||||
|
||||
Let \plc{T} be the number of threads in the team, and \plc{P} be the number of places in the
|
||||
parent's place partition. The first \plc{T/P} threads of the team (including the primary
|
||||
thread) execute on the parent's place. The next \plc{T/P} threads execute on the next
|
||||
Let \ucode{T} be the number of threads in the team, and \ucode{P} be the number of places in the
|
||||
parent's place partition. The first \ucode{T/P} threads of the team (including the primary
|
||||
thread) execute on the parent's place. The next \ucode{T/P} threads execute on the next
|
||||
place in the place partition, and so on, with wrap around.
|
||||
|
||||
\cexample[4.0]{affinity}{2}
|
||||
@ -124,11 +133,13 @@ and distribution of the place partition would be as follows:
|
||||
|
||||
\subsection{Close Affinity Policy}
|
||||
\label{subsec:affinity_close}
|
||||
\index{affinity!close policy@\kcode{close} policy}
|
||||
\index{close policy@\kcode{close} policy}
|
||||
|
||||
The following example shows the result of the \code{close} affinity policy on
|
||||
The following example shows the result of the \kcode{close} affinity policy on
|
||||
the partition list when the number of threads is less than or equal to the number
|
||||
of places in the parent's place partition, for the machine architecture depicted above.
|
||||
The place partition is not changed by the \code{close} policy.
|
||||
The place partition is not changed by the \kcode{close} policy.
|
||||
|
||||
\cexample[4.0]{affinity}{3}
|
||||
|
||||
@ -136,7 +147,7 @@ The place partition is not changed by the \code{close} policy.
|
||||
|
||||
It is unspecified on which place the primary thread is initially started. If the
|
||||
primary thread is initially started on p0, the following placement of threads will
|
||||
be applied in the \code{parallel} region:
|
||||
be applied in the \kcode{parallel} region:
|
||||
|
||||
\begin{compactitem}
|
||||
\item thread 0 executes on p0 with the place partition p0-p7
|
||||
@ -161,15 +172,15 @@ and distribution of the place partition would be as follows:
|
||||
\item thread 3 executes on p5 with the place partition p0-p7
|
||||
\end{compactitem}
|
||||
|
||||
The following example illustrates the \code{close} thread affinity policy when
|
||||
The following example illustrates the \kcode{close} thread affinity policy when
|
||||
the number of threads is greater than the number of places in the parent's place
|
||||
partition.
|
||||
|
||||
Let \plc{T} be the number of threads in the team, and \plc{P} be the number of places in the
|
||||
parent's place partition. The first \plc{T/P} threads of the team (including the primary
|
||||
thread) execute on the parent's place. The next \plc{T/P} threads execute on the next
|
||||
Let \ucode{T} be the number of threads in the team, and \ucode{P} be the number of places in the
|
||||
parent's place partition. The first \ucode{T/P} threads of the team (including the primary
|
||||
thread) execute on the parent's place. The next \ucode{T/P} threads execute on the next
|
||||
place in the place partition, and so on, with wrap around. The place partition
|
||||
is not changed by the \code{close} policy.
|
||||
is not changed by the \kcode{close} policy.
|
||||
|
||||
\cexample[4.0]{affinity}{4}
|
||||
|
||||
@ -220,14 +231,16 @@ and distribution of the place partition would be as follows:
|
||||
|
||||
\subsection{Primary Affinity Policy}
|
||||
\label{subsec:affinity_primary}
|
||||
\index{affinity!primary policy@\kcode{primary} policy}
|
||||
\index{primary policy@\kcode{primary} policy}
|
||||
|
||||
The following example shows the result of the \code{primary} affinity policy on
|
||||
The following example shows the result of the \kcode{primary} affinity policy on
|
||||
the partition list for the machine architecture depicted above. The place partition
|
||||
is not changed by the primary policy.
|
||||
|
||||
\cexample[4.0]{affinity}{5}
|
||||
\cexample[5.1]{affinity}{5}
|
||||
|
||||
\fexample[4.0]{affinity}{5}[1]
|
||||
\fexample[5.1]{affinity}{5}
|
||||
\clearpage
|
||||
|
||||
It is unspecified on which place the primary thread is initially started. If the
|
||||
|
@ -1,23 +1,32 @@
|
||||
\section{Affinity Display}
|
||||
\label{sec:affinity_display}
|
||||
\index{affinity display!OMP_DISPLAY_AFFINITY@\kcode{OMP_DISPLAY_AFFINITY}}
|
||||
\index{environment variables!OMP_DISPLAY_AFFINITY@\kcode{OMP_DISPLAY_AFFINITY}}
|
||||
\index{OMP_DISPLAY_AFFINITY@\kcode{OMP_DISPLAY_AFFINITY}}
|
||||
\index{affinity display!OMP_AFFINITY_FORMAT@\kcode{OMP_AFFINITY_FORMAT}}
|
||||
\index{environment variables!OMP_AFFINITY_FORMAT@\kcode{OMP_AFFINITY_FORMAT}}
|
||||
\index{OMP_AFFINITY_FORMAT@\kcode{OMP_AFFINITY_FORMAT}}
|
||||
\index{affinity display!omp_display_affinity routine@\kcode{omp_display_affinity} routine}
|
||||
\index{routines!omp_display_affinity@\kcode{omp_display_affinity}}
|
||||
\index{omp_display_affinity routine@\kcode{omp_display_affinity} routine}
|
||||
|
||||
The following examples illustrate ways to display thread affinity.
|
||||
Automatic display of affinity can be invoked by setting
|
||||
the \code{OMP\_DISPLAY\_AFFINITY} environment variable to \code{TRUE}.
|
||||
the \kcode{OMP_DISPLAY_AFFINITY} environment variable to \vcode{TRUE}.
|
||||
The format of the output can be customized by setting the
|
||||
\code{OMP\_AFFINITY\_FORMAT} environment variable to an appropriate string.
|
||||
\kcode{OMP_AFFINITY_FORMAT} environment variable to an appropriate string.
|
||||
Also, there are API calls for the user to display thread affinity
|
||||
at selected locations within code.
|
||||
|
||||
For the first example the environment variable \code{OMP\_DISPLAY\_AFFINITY} has been
|
||||
set to \code{TRUE}, and execution occurs on an 8-core system with \code{OMP\_NUM\_THREADS} set to 8.
|
||||
For the first example the environment variable \kcode{OMP_DISPLAY_AFFINITY} has been
|
||||
set to \vcode{TRUE}, and execution occurs on an 8-core system with \kcode{OMP_NUM_THREADS} set to 8.
|
||||
|
||||
The affinity for the primary thread is reported through a call to the API
|
||||
\code{omp\_display\_affinity()} routine. For default affinity settings
|
||||
\kcode{omp_display_affinity()} routine. For default affinity settings
|
||||
the report shows that the primary thread can execute on any of the cores.
|
||||
In the following parallel region the affinity for each of the team threads is reported
|
||||
automatically since the \code{OMP\_DISPLAY\_AFFINITY} environment variable has been set
|
||||
to \code{TRUE}.
|
||||
automatically since the \kcode{OMP_DISPLAY_AFFINITY} environment variable has been set
|
||||
to \vcode{TRUE}.
|
||||
|
||||
These two reports are often useful (as in hybrid codes using both MPI and OpenMP)
|
||||
to observe the affinity (for an MPI task) before the parallel region,
|
||||
@ -28,9 +37,9 @@ not changed, so affinity is NOT reported.
|
||||
In the last parallel region, the thread affinities are reported
|
||||
because the thread affinity has changed.
|
||||
|
||||
\cexample[5.0]{affinity_display}{1}
|
||||
\cexample[5.0]{affinity_display}{1}[1]
|
||||
|
||||
\ffreeexample[5.0]{affinity_display}{1}
|
||||
\ffreeexample[5.0]{affinity_display}{1}[1]
|
||||
|
||||
|
||||
In the following example 2 threads are forked, and each executes on a socket. Next,
|
||||
@ -38,54 +47,67 @@ a nested parallel region runs half of the available threads on each socket.
|
||||
|
||||
These OpenMP environment variables have been set:
|
||||
|
||||
\begin{compactitem}
|
||||
\item \code{OMP\_PROC\_BIND}="TRUE"
|
||||
\item \code{OMP\_NUM\_THREADS}="2,4"
|
||||
\item \code{OMP\_PLACES}="\{0,2,4,6\},\{1,3,5,7\}"
|
||||
\item \code{OMP\_AFFINITY\_FORMAT}="nest\_level= \%L, parent\_thrd\_num= \%a, thrd\_num= \%n, thrd\_affinity= \%A"
|
||||
\end{compactitem}
|
||||
\begin{boxeducode}
|
||||
\kcode{export OMP_PROC_BIND=}"TRUE"
|
||||
\kcode{export OMP_NUM_THREADS=}"2,4"
|
||||
\kcode{export OMP_PLACES=}"{0,2,4,6},{1,3,5,7}"
|
||||
\kcode{export OMP_AFFINITY_FORMAT=}"nest_level= %L, parent_thrd_num= %a,
|
||||
thrd_num= %n, thrd_affinity= %A"
|
||||
\end{boxeducode}
|
||||
|
||||
where the numbers correspond to core ids for the system. Note, \code{OMP\_DISPLAY\_AFFINITY} is not
|
||||
set and is \code{FALSE} by default. This example shows how to use API routines to
|
||||
where the numbers correspond to core ids for the system. Note, \kcode{OMP_DISPLAY_AFFINITY} is not
|
||||
set and is \vcode{FALSE} by default. This example shows how to use API routines to
|
||||
perform affinity display operations.
|
||||
|
||||
For each of the two first-level threads the \code{OMP\_PLACES} variable specifies
|
||||
\index{environment variables!OMP_PLACES@\kcode{OMP_PLACES}}
|
||||
\index{OMP_PLACES@\kcode{OMP_PLACES}}
|
||||
For each of the two first-level threads the \kcode{OMP_PLACES} variable specifies
|
||||
a place with all the core-ids of the socket (\{0,2,4,6\} for one thread and \{1,3,5,7\} for the other).
|
||||
(As is sometimes the case in 2-socket systems, one socket may consist
|
||||
of the even id numbers, while the other may have the odd id numbers.) The affinities
|
||||
are printed according to the \code{OMP\_AFFINITY\_FORMAT} format: providing
|
||||
the parallel nesting level (\%L), the ancestor thread number (\%a), the thread number (\%n)
|
||||
and the thread affinity (\%A). In the nested parallel region within the \plc{socket\_work} routine
|
||||
are printed according to the \kcode{OMP_AFFINITY_FORMAT} format: providing
|
||||
the parallel nesting level (\ucode{\%L}), the ancestor thread number (\ucode{\%a}), the thread number (\ucode{\%n})
|
||||
and the thread affinity (\ucode{\%A}). In the nested parallel region within the \ucode{socket_work} routine
|
||||
the affinities for the threads on each socket are printed according to this format.
|
||||
|
||||
\cexample[5.0]{affinity_display}{2}
|
||||
\cexample[5.0]{affinity_display}{2}[3]
|
||||
|
||||
\ffreeexample[5.0]{affinity_display}{2}
|
||||
\ffreeexample[5.0]{affinity_display}{2}[3]
|
||||
|
||||
%\newpage
|
||||
\index{affinity display!omp_get_affinity_format routine@\kcode{omp_get_affinity_format} routine}
|
||||
\index{routines!omp_get_affinity_format@\kcode{omp_get_affinity_format}}
|
||||
\index{omp_get_affinity_format routine@\kcode{omp_get_affinity_format} routine}
|
||||
\index{affinity display!omp_set_affinity_format routine@\kcode{omp_set_affinity_format} routine}
|
||||
\index{routines!omp_set_affinity_format@\kcode{omp_set_affinity_format}}
|
||||
\index{omp_set_affinity_format routine@\kcode{omp_set_affinity_format} routine}
|
||||
The next example illustrates more details about affinity formatting.
|
||||
First, the \code{omp\_get\_affininity\_format()} API routine is used to
|
||||
First, the \kcode{omp_get_affinity_format()} API routine is used to
|
||||
obtain the default format. The code checks to make sure the storage
|
||||
provides enough space to hold the format.
|
||||
Next, the \code{omp\_set\_affinity\_format()} API routine sets a user-defined
|
||||
format: \plc{host=\%20H thrd\_num=\%0.4n binds\_to=\%A}.
|
||||
Next, the \kcode{omp_set_affinity_format()} API routine sets a user-defined
|
||||
format: \ucode{host=\%20H~thrd_num=\%0.4n~binds_to=\%A}.
|
||||
|
||||
The host, thread number and affinity fields are specified by \plc{\%20H},
|
||||
\plc{\%0.4n} and \plc{\%A}: \plc{H}, \plc{n} and \plc{A} are single character "short names"
|
||||
The host, thread number and affinity fields are specified by \ucode{\%20H},
|
||||
\ucode{\%0.4n} and \ucode{\%A}: \ucode{H}, \ucode{n} and \ucode{A} are single character ``short names''
|
||||
for the host, thread\_num and thread\_affinity data to be printed,
|
||||
with format sizes of \plc{20}, \plc{4}, and "size as needed".
|
||||
with format sizes of \ucode{20}, \ucode{4}, and ``size as needed''.
|
||||
The period (.) indicates that the field is displayed right-justified (default is left-justified)
|
||||
and the "0" indicates that any unused space is to be prefixed with zeros
|
||||
(e.g. instead of "1", "0001" is displayed for the field size of 4).
|
||||
and the ``0'' indicates that any unused space is to be prefixed with zeros
|
||||
(e.g. instead of ``1'', ``0001'' is displayed for the field size of 4).
|
||||
|
||||
%The period (.) indicates that the field is displayed left-justified and the "0" indicates
|
||||
%The period (.) indicates that the field is displayed left-justified and the ``0'' indicates
|
||||
%that leading zeros are to be added so that the total length for the display of this “n” (thread_num) field is 4.
|
||||
|
||||
%The period (\plc{.}) indicates right justified and \plc{0} leading zeros.
|
||||
%All other text in the format is just user narrative.
|
||||
|
||||
\index{affinity display!omp_capture_affinity routine@\kcode{omp_capture_affinity} routine}
|
||||
\index{routines!omp_capture_affinity@\kcode{omp_capture_affinity}}
|
||||
\index{omp_capture_affinity routine@\kcode{omp_capture_affinity} routine}
|
||||
Within the parallel region the affinity for each thread is captured by
|
||||
\code{omp\_capture\_affinity()} into a buffer array with elements indexed
|
||||
by the thread number (\plc{thrd\_num}).
|
||||
\kcode{omp_capture_affinity()} into a buffer array with elements indexed
|
||||
by the thread number (\ucode{thrd_num}).
|
||||
After the parallel region, the thread affinities are printed in thread-number order.
|
||||
|
||||
If the storage area in buffer is inadequate for holding the affinity
|
||||
@ -93,10 +115,10 @@ data, the stored affinity data is truncated.
|
||||
%The \plc{max} reduction on the required storage, returned by
|
||||
%\code{omp\_capture\_affinity} in \plc{nchars}, is used to report
|
||||
%possible truncation (if \plc{max\_req\_store} > \plc{buffer\_store}).
|
||||
The maximum value for the number of characters (\plc{nchars}) returned by
|
||||
\code{omp\_capture\_affinity} is captured by the \code{reduction(max:max\_req\_store)}
|
||||
clause and the \plc{if(nchars >= max\_req\_store) max\_req\_store=nchars} statement.
|
||||
It is used to report possible truncation (if \plc{max\_req\_store} > \plc{buffer\_store}).
|
||||
The maximum value for the number of characters (\ucode{nchars}) returned by
|
||||
\kcode{omp_capture_affinity} is captured by the \kcode{reduction(max: \ucode{max_req_store})}
|
||||
clause and the \ucode{if(nchars >= max_req_store) max_req_store=nchars} statement.
|
||||
It is used to report possible truncation (if \ucode{max_req_store} > \ucode{buffer_store}).
|
||||
|
||||
\cexample[5.0]{affinity_display}{3}
|
||||
|
||||
|
@ -1,5 +1,19 @@
|
||||
\newpage
|
||||
\section{Affinity Query Functions}
|
||||
\label{sec: affinity_query}
|
||||
\index{affinity query!omp_get_num_places routine@\kcode{omp_get_num_places} routine}
|
||||
\index{routines!omp_get_num_places@\kcode{omp_get_num_places}}
|
||||
\index{omp_get_num_places routine@\kcode{omp_get_num_places} routine}
|
||||
\index{affinity query!omp_get_place_num routine@\kcode{omp_get_place_num} routine}
|
||||
\index{routines!omp_get_place_num@\kcode{omp_get_place_num}}
|
||||
\index{omp_get_place_num routine@\kcode{omp_get_place_num} routine}
|
||||
\index{affinity query!omp_get_place_num_procs routine@\kcode{omp_get_place_num_procs} routine}
|
||||
\index{routines!omp_get_place_num_procs@\kcode{omp_get_place_num_procs}}
|
||||
\index{omp_get_place_num_procs routine@\kcode{omp_get_place_num_procs} routine}
|
||||
\index{affinity!spread policy@\kcode{spread} policy}
|
||||
\index{spread policy@\kcode{spread} policy}
|
||||
\index{environment variables!OMP_PLACES@\kcode{OMP_PLACES}}
|
||||
\index{OMP_PLACES@\kcode{OMP_PLACES}}
|
||||
|
||||
In the example below a team of threads is generated on each socket of
|
||||
the system, using nested parallelism. Several query functions are used
|
||||
@ -9,25 +23,25 @@ socket and thread numbers.
|
||||
For proper execution of the code, the user must create a place partition, such that
|
||||
each place is a listing of the core numbers for a socket. For example,
|
||||
in a 2 socket system with 8 cores in each socket, and sequential numbering
|
||||
in the socket for the core numbers, the \code{OMP\_PLACES} variable would be set
|
||||
to "\{0:8\},\{8:8\}", using the place syntax \{\plc{lower\_bound}:\plc{length}:\plc{stride}\},
|
||||
in the socket for the core numbers, the \kcode{OMP_PLACES} variable would be set
|
||||
to "\{0:8\},\{8:8\}", using the place syntax \{\splc{lower_bound:length:stride}\},
|
||||
and the default stride of 1.
|
||||
|
||||
The code determines the number of sockets (\plc{n\_sockets})
|
||||
using the \code{omp\_get\_num\_places()} query function.
|
||||
The code determines the number of sockets (\ucode{n_sockets})
|
||||
using the \kcode{omp_get_num_places()} query function.
|
||||
In this example each place is constructed with a list of
|
||||
each socket's core numbers, hence the number of places is equal
|
||||
to the number of sockets.
|
||||
|
||||
The outer parallel region forms a team of threads, and each thread
|
||||
executes on a socket (place) because the \code{proc\_bind} clause uses
|
||||
\code{spread} in the outer \code{parallel} construct.
|
||||
Next, in the \plc{socket\_init} function, an inner parallel region creates a team
|
||||
executes on a socket (place) because the \kcode{proc_bind} clause uses
|
||||
\kcode{spread} in the outer \kcode{parallel} construct.
|
||||
Next, in the \ucode{socket_init} function, an inner parallel region creates a team
|
||||
of threads equal to the number of elements (core numbers) from the place
|
||||
of the parent thread. Because the outer \code{parallel} construct uses
|
||||
a \code{spread} affinity policy, each of its threads inherits a subpartition of
|
||||
the original partition. Hence, the \code{omp\_get\_place\_num\_procs} query function
|
||||
returns the number of elements (here procs = cores) in the subpartition of the thread.
|
||||
of the parent thread. Because the outer \kcode{parallel} construct uses
|
||||
a \kcode{spread} affinity policy, each of its threads inherits a sub-partition of
|
||||
the original partition. Hence, the \kcode{omp_get_place_num_procs} query function
|
||||
returns the number of elements (here procs = cores) in the sub-partition of the thread.
|
||||
After each parent thread creates its nested parallel region on the socket,
|
||||
the socket number and thread number are reported.
|
||||
|
||||
|
@ -1,12 +1,10 @@
|
||||
/*
|
||||
* @@name: affinity.1c
|
||||
* @@name: affinity.1
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
||||
void work();
|
||||
|
||||
int main()
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: affinity.1f
|
||||
! @@name: affinity.1
|
||||
! @@type: F-fixed
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
PROGRAM EXAMPLE
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: affinity.2c
|
||||
* @@name: affinity.2
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: affinity.2f
|
||||
! @@name: affinity.2
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine foo
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: affinity.3c
|
||||
* @@name: affinity.3
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: affinity.3f
|
||||
! @@name: affinity.3
|
||||
! @@type: F-fixed
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
PROGRAM EXAMPLE
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: affinity.4c
|
||||
* @@name: affinity.4
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.0
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: affinity.4f
|
||||
! @@name: affinity.4
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
subroutine foo
|
||||
|
@ -1,15 +1,10 @@
|
||||
/*
|
||||
* @@name: affinity.5c
|
||||
* @@name: affinity.5
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_5.1
|
||||
*/
|
||||
#if _OPENMP < 202011
|
||||
#define primary master
|
||||
#endif
|
||||
|
||||
void work();
|
||||
int main()
|
||||
{
|
||||
|
@ -1,14 +1,8 @@
|
||||
! @@name: affinity.5f
|
||||
! @@name: affinity.5
|
||||
! @@type: F-fixed
|
||||
! @@compilable: yes
|
||||
! @@requires: preprocessing
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_5.1
|
||||
#if _OPENMP < 202011
|
||||
#define primary master
|
||||
#endif
|
||||
|
||||
PROGRAM EXAMPLE
|
||||
!$OMP PARALLEL PROC_BIND(primary) NUM_THREADS(4)
|
||||
CALL WORK()
|
||||
|
@ -1,10 +1,9 @@
|
||||
/*
|
||||
* @@name: affinity.1.c
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@expect: success
|
||||
* @@version: omp_5.0
|
||||
* @@name: affinity.6
|
||||
* @@type: C
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_5.0
|
||||
*/
|
||||
double * alloc_init_B(double *A, int N);
|
||||
void compute_on_B(double *B, int N);
|
||||
@ -24,4 +23,3 @@ void task_affinity(double *A, int N)
|
||||
|
||||
#pragma omp taskwait
|
||||
}
|
||||
|
||||
|
@ -1,8 +1,7 @@
|
||||
! @@name: affinity.6f
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@expect: success
|
||||
! @@name: affinity.6
|
||||
! @@type: F-free
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_5.0
|
||||
subroutine task_affinity(A, N)
|
||||
|
||||
@ -21,4 +20,3 @@ subroutine task_affinity(A, N)
|
||||
!$omp taskwait
|
||||
|
||||
end subroutine
|
||||
|
||||
|
@ -1,60 +1,61 @@
|
||||
/*
|
||||
* @@name: affinity_display.1.c
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
* @@version: omp_5.0
|
||||
* @@name: affinity_display.1
|
||||
* @@type: C
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_5.0
|
||||
* @@env: OMP_DISPLAY_AFFINITY=TRUE OMP_NUM_THREADS=8
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
int main(void){ //MAX threads = 8, single socket system
|
||||
int main(void){ //MAX threads = 8, single socket system
|
||||
|
||||
omp_display_affinity(NULL); //API call-- Displays Affinity of Primary Thread
|
||||
//API call-- Displays Affinity of Primary Thread
|
||||
omp_display_affinity(NULL);
|
||||
|
||||
// API CALL OUTPUT (default format):
|
||||
//team_num= 0, nesting_level= 0, thread_num= 0, thread_affinity= 0,1,2,3,4,5,6,7
|
||||
// API CALL OUTPUT (default format):
|
||||
// team_num= 0, nesting_level= 0, thread_num= 0,
|
||||
// thread_affinity= 0,1,2,3,4,5,6,7
|
||||
|
||||
|
||||
// OMP_DISPLAY_AFFINITY=TRUE, OMP_NUM_THREADS=8
|
||||
// OMP_DISPLAY_AFFINITY=TRUE, OMP_NUM_THREADS=8
|
||||
#pragma omp parallel num_threads(omp_get_num_procs())
|
||||
{
|
||||
if(omp_get_thread_num()==0)
|
||||
if(omp_get_thread_num()==0)
|
||||
printf("1st Parallel Region -- Affinity Reported \n");
|
||||
|
||||
// DISPLAY OUTPUT (default format) has been sorted:
|
||||
// team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0
|
||||
// team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 1
|
||||
// ...
|
||||
// team_num= 0, nesting_level= 1, thread_num= 7, thread_affinity= 7
|
||||
// DISPLAY OUTPUT (default format) has been sorted:
|
||||
// team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0
|
||||
// team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 1
|
||||
// ...
|
||||
// team_num= 0, nesting_level= 1, thread_num= 7, thread_affinity= 7
|
||||
|
||||
// doing work here
|
||||
// doing work here
|
||||
}
|
||||
|
||||
#pragma omp parallel num_threads( omp_get_num_procs() )
|
||||
{
|
||||
if(omp_get_thread_num()==0)
|
||||
printf("%s%s\n","Same Affinity as in Previous Parallel Region",
|
||||
" -- no Affinity Reported\n");
|
||||
if(omp_get_thread_num()==0)
|
||||
printf("%s%s\n","Same Affinity as in Previous Parallel Region",
|
||||
" -- no Affinity Reported\n");
|
||||
|
||||
// NO AFFINITY OUTPUT:
|
||||
//(output in 1st parallel region only for OMP_DISPLAY_AFFINITY=TRUE)
|
||||
|
||||
// doing more work here
|
||||
// NO AFFINITY OUTPUT:
|
||||
//(output in 1st parallel region only for OMP_DISPLAY_AFFINITY=TRUE)
|
||||
|
||||
// doing more work here
|
||||
}
|
||||
|
||||
// Report Affinity for 1/2 number of threads
|
||||
// Report Affinity for 1/2 number of threads
|
||||
#pragma omp parallel num_threads( omp_get_num_procs()/2 )
|
||||
{
|
||||
if(omp_get_thread_num()==0)
|
||||
if(omp_get_thread_num()==0)
|
||||
printf("Report Affinity for using 1/2 of max threads.\n");
|
||||
|
||||
// DISPLAY OUTPUT (default format) has been sorted:
|
||||
// team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0,1
|
||||
// team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 2,3
|
||||
// team_num= 0, nesting_level= 1, thread_num= 2, thread_affinity= 4,5
|
||||
// team_num= 0, nesting_level= 1, thread_num= 3, thread_affinity= 6,7
|
||||
|
||||
// DISPLAY OUTPUT (default format) has been sorted:
|
||||
// team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0,1
|
||||
// team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 2,3
|
||||
// team_num= 0, nesting_level= 1, thread_num= 2, thread_affinity= 4,5
|
||||
// team_num= 0, nesting_level= 1, thread_num= 3, thread_affinity= 6,7
|
||||
|
||||
// do work
|
||||
}
|
||||
|
@ -1,22 +1,24 @@
|
||||
! @@name: affinity_display.1.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
! @@version: omp_5.0
|
||||
! @@name: affinity_display.1
|
||||
! @@type: F-free
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_5.0
|
||||
! @@env: OMP_DISPLAY_AFFINITY=TRUE OMP_NUM_THREADS=8
|
||||
program affinity_display ! MAX threads = 8, single socket system
|
||||
|
||||
use omp_lib
|
||||
implicit none
|
||||
character(len=0) :: null
|
||||
|
||||
call omp_display_affinity(null) !API call- Displays Affinity of Primary Thrd
|
||||
! API call - Displays Affinity of Primary Thread
|
||||
call omp_display_affinity(null)
|
||||
|
||||
! API CALL OUTPUT (default format):
|
||||
!team_num= 0, nesting_level= 0, thread_num= 0, thread_affinity= 0,1,2,3,4,5,6,7
|
||||
! API CALL OUTPUT (default format):
|
||||
! team_num= 0, nesting_level= 0, thread_num= 0, &
|
||||
! thread_affinity= 0,1,2,3,4,5,6,7
|
||||
|
||||
|
||||
! OMP_DISPLAY_AFFINITY=TRUE, OMP_NUM_THREADS=8
|
||||
! OMP_DISPLAY_AFFINITY=TRUE, OMP_NUM_THREADS=8
|
||||
|
||||
!$omp parallel num_threads(omp_get_num_procs())
|
||||
|
||||
@ -24,11 +26,11 @@ program affinity_display ! MAX threads = 8, single socket system
|
||||
print*, "1st Parallel Region -- Affinity Reported"
|
||||
endif
|
||||
|
||||
! DISPLAY OUTPUT (default format) has been sorted:
|
||||
! team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0
|
||||
! team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 1
|
||||
! ...
|
||||
! team_num= 0, nesting_level= 1, thread_num= 7, thread_affinity= 7
|
||||
! DISPLAY OUTPUT (default format) has been sorted:
|
||||
! team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0
|
||||
! team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 1
|
||||
! ...
|
||||
! team_num= 0, nesting_level= 1, thread_num= 7, thread_affinity= 7
|
||||
|
||||
! doing work here
|
||||
|
||||
@ -40,25 +42,30 @@ program affinity_display ! MAX threads = 8, single socket system
|
||||
print*, "Same Affinity in Parallel Region -- no Affinity Reported"
|
||||
endif
|
||||
|
||||
! NO AFFINITY OUTPUT:
|
||||
!(output in 1st parallel region only for OMP_DISPLAY_AFFINITY=TRUE)
|
||||
! NO AFFINITY OUTPUT:
|
||||
! (output in 1st parallel region only for
|
||||
! OMP_DISPLAY_AFFINITY=TRUE)
|
||||
|
||||
! doing more work here
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
! Report Affinity for 1/2 number of threads
|
||||
! Report Affinity for 1/2 number of threads
|
||||
!$omp parallel num_threads( omp_get_num_procs()/2 )
|
||||
|
||||
if(omp_get_thread_num()==0) then
|
||||
print*, "Different Affinity in Parallel Region -- Affinity Reported"
|
||||
print*, "Altered Affinity in Parallel Region -- Affinity Reported"
|
||||
endif
|
||||
|
||||
! DISPLAY OUTPUT (default format) has been sorted:
|
||||
! team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0,1
|
||||
! team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 2,3
|
||||
! team_num= 0, nesting_level= 1, thread_num= 2, thread_affinity= 4,5
|
||||
! team_num= 0, nesting_level= 1, thread_num= 3, thread_affinity= 6,7
|
||||
! DISPLAY OUTPUT (default format) has been sorted:
|
||||
! team_num= 0, nesting_level= 1, thread_num= 0, &
|
||||
! thread_affinity= 0,1
|
||||
! team_num= 0, nesting_level= 1, thread_num= 1, &
|
||||
! thread_affinity= 2,3
|
||||
! team_num= 0, nesting_level= 1, thread_num= 2, &
|
||||
! thread_affinity= 4,5
|
||||
! team_num= 0, nesting_level= 1, thread_num= 3, &
|
||||
! thread_affinity= 6,7
|
||||
|
||||
! do work
|
||||
|
||||
|
@ -1,10 +1,12 @@
|
||||
/*
|
||||
* @@name: affinity_display.2c
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
* @@name: affinity_display.2
|
||||
* @@type: C
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_5.0
|
||||
* @@env: OMP_PROC_BIND=TRUE OMP_NUM_THREADS="2,4"
|
||||
* @@env: OMP_PLACES="{0,2,4,6},{1,3,5,7}"
|
||||
* @@env: OMP_AFFINITY_FORMAT="nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A"
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@ -14,62 +16,65 @@ void socket_work(int socket_num, int n_thrds);
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int n_sockets, socket_num, n_thrds_on_socket;
|
||||
int n_sockets, socket_num, n_thrds_on_socket;
|
||||
|
||||
omp_set_nested(1); // or env var= OMP_NESTED=true
|
||||
omp_set_max_active_levels(2); // or env var= OMP_MAX_ACTIVE_LEVELS=2
|
||||
omp_set_nested(1); // or env var= OMP_NESTED=true
|
||||
omp_set_max_active_levels(2); // or env var= OMP_MAX_ACTIVE_LEVELS=2
|
||||
|
||||
n_sockets = omp_get_num_places();
|
||||
n_thrds_on_socket = omp_get_place_num_procs(0);
|
||||
n_sockets = omp_get_num_places();
|
||||
n_thrds_on_socket = omp_get_place_num_procs(0);
|
||||
|
||||
// OMP_NUM_THREADS=2,4
|
||||
// OMP_PLACES="{0,2,4,6},{1,3,5,7}" #2 sockets; even/odd proc-ids
|
||||
// OMP_AFFINITY_FORMAT=\
|
||||
// "nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A"
|
||||
|
||||
#pragma omp parallel num_threads(n_sockets) private(socket_num)
|
||||
{
|
||||
socket_num = omp_get_place_num();
|
||||
// OMP_NUM_THREADS=2,4
|
||||
// OMP_PLACES="{0,2,4,6},{1,3,5,7}" #2 sockets; even/odd proc-ids
|
||||
// OMP_AFFINITY_FORMAT=\
|
||||
// "nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A"
|
||||
|
||||
if(socket_num==0)
|
||||
printf(" LEVEL 1 AFFINITIES 1 thread/socket, %d sockets:\n\n", n_sockets);
|
||||
#pragma omp parallel num_threads(n_sockets) private(socket_num)
|
||||
{
|
||||
socket_num = omp_get_place_num();
|
||||
|
||||
omp_display_affinity(NULL); // not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
if(socket_num==0)
|
||||
printf(" LEVEL 1 AFFINITIES 1 thread/socket, %d sockets:\n\n",
|
||||
n_sockets);
|
||||
|
||||
// OUTPUT:
|
||||
// LEVEL 1 AFFINITIES 1 thread/socket, 2 sockets:
|
||||
// nest_level= 1, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0,2,4,6
|
||||
// nest_level= 1, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 1,3,5,7
|
||||
// not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
omp_display_affinity(NULL);
|
||||
|
||||
socket_work(socket_num, n_thrds_on_socket);
|
||||
}
|
||||
|
||||
return 0;
|
||||
// OUTPUT:
|
||||
// LEVEL 1 AFFINITIES 1 thread/socket, 2 sockets:
|
||||
// nest_level= 1, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0,2,4,6
|
||||
// nest_level= 1, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 1,3,5,7
|
||||
|
||||
socket_work(socket_num, n_thrds_on_socket);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void socket_work(int socket_num, int n_thrds)
|
||||
{
|
||||
#pragma omp parallel num_threads(n_thrds)
|
||||
{
|
||||
if(omp_get_thread_num()==0)
|
||||
printf(" LEVEL 2 AFFINITIES, %d threads on socket %d\n",n_thrds, socket_num);
|
||||
|
||||
omp_display_affinity(NULL); // not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
|
||||
// OUTPUT:
|
||||
// LEVEL 2 AFFINITIES, 4 threads on socket 0
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 2
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 2, thrd_affinity= 4
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 3, thrd_affinity= 6
|
||||
{
|
||||
#pragma omp parallel num_threads(n_thrds)
|
||||
{
|
||||
if(omp_get_thread_num()==0)
|
||||
printf(" LEVEL 2 AFFINITIES, %d threads on socket %d\n",
|
||||
n_thrds, socket_num);
|
||||
|
||||
// not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
omp_display_affinity(NULL);
|
||||
|
||||
// OUTPUT:
|
||||
// LEVEL 2 AFFINITIES, 4 threads on socket 0
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 2
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 2, thrd_affinity= 4
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 3, thrd_affinity= 6
|
||||
|
||||
// LEVEL 2 AFFINITIES, 4 threads on socket 1
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 0, thrd_affinity= 1
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 1, thrd_affinity= 3
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 2, thrd_affinity= 5
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 3, thrd_affinity= 7
|
||||
|
||||
// LEVEL 2 AFFINITIES, 4 threads on socket 1
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 0, thrd_affinity= 1
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 1, thrd_affinity= 3
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 2, thrd_affinity= 5
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 3, thrd_affinity= 7
|
||||
|
||||
// ... Do Some work on Socket
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,9 +1,11 @@
|
||||
! @@name: affinity_display.2.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
! @@version: omp_5.0
|
||||
! @@name: affinity_display.2
|
||||
! @@type: F-free
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_5.0
|
||||
! @@env: OMP_PROC_BIND=TRUE OMP_NUM_THREADS="2,4"
|
||||
! @@env: OMP_PLACES="{0,2,4,6},{1,3,5,7}"
|
||||
! @@env: OMP_AFFINITY_FORMAT="nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A"
|
||||
program affinity_display
|
||||
|
||||
use omp_lib
|
||||
@ -20,22 +22,26 @@ program affinity_display
|
||||
! OMP_NUM_THREADS=2,4
|
||||
! OMP_PLACES="{0,2,4,6},{1,3,5,7}" #2 sockets; even/odd proc-ids
|
||||
! OMP_AFFINITY_FORMAT=\
|
||||
! "nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A"
|
||||
!"nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A"
|
||||
|
||||
!$omp parallel num_threads(n_sockets) private(socket_num)
|
||||
|
||||
socket_num = omp_get_place_num()
|
||||
|
||||
if(socket_num==0) then
|
||||
write(*,'("LEVEL 1 AFFINITIES 1 thread/socket ",i0," sockets")')n_sockets
|
||||
write(*,'("LEVEL 1 AFFINITIES 1 thread/socket ",i0," sockets")') &
|
||||
n_sockets
|
||||
endif
|
||||
|
||||
call omp_display_affinity(null) !not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
call omp_display_affinity(null) ! not needed
|
||||
! if OMP_DISPLAY_AFFINITY=TRUE
|
||||
|
||||
! OUTPUT:
|
||||
! LEVEL 1 AFFINITIES 1 thread/socket, 2 sockets:
|
||||
! nest_level= 1, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0,2,4,6
|
||||
! nest_level= 1, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 1,3,5,7
|
||||
! nest_level= 1, parent_thrd_num= 0, thrd_num= 0, &
|
||||
! thrd_affinity= 0,2,4,6
|
||||
! nest_level= 1, parent_thrd_num= 0, thrd_num= 1, &
|
||||
! thrd_affinity= 1,3,5,7
|
||||
|
||||
call socket_work(socket_num, n_thrds_on_socket)
|
||||
|
||||
@ -56,7 +62,8 @@ subroutine socket_work(socket_num, n_thrds)
|
||||
n_thrds,socket_num
|
||||
endif
|
||||
|
||||
call omp_display_affinity(null); !not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
call omp_display_affinity(null) ! not needed
|
||||
! if OMP_DISPLAY_AFFINITY=TRUE
|
||||
|
||||
! OUTPUT:
|
||||
! LEVEL 2 AFFINITIES, 4 threads on socket 0
|
||||
|
@ -1,10 +1,9 @@
|
||||
/*
|
||||
* @@name: affinity_display.3.c
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
* @@version: omp_5.0
|
||||
* @@name: affinity_display.3
|
||||
* @@type: C
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_5.0
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h> // NULL is also defined in <stddef.h>
|
||||
@ -25,9 +24,9 @@ int main(void){
|
||||
char **buffer;
|
||||
|
||||
|
||||
// CODE SEGMENT 1 AFFINITY FORMAT
|
||||
// CODE SEGMENT 1 AFFINITY FORMAT
|
||||
|
||||
// Get and Display Default Affinity Format
|
||||
// Get and Display Default Affinity Format
|
||||
|
||||
nchars = omp_get_affinity_format(default_format,(size_t)FORMAT_STORE);
|
||||
printf("Default Affinity Format is: %s\n",default_format);
|
||||
@ -37,44 +36,49 @@ int main(void){
|
||||
printf(" FORMAT_STORE to %d.\n", nchars+1);
|
||||
}
|
||||
|
||||
// Set Affinity Format
|
||||
// Set Affinity Format
|
||||
|
||||
omp_set_affinity_format(my_format);
|
||||
printf("Affinity Format set to: %s\n",my_format);
|
||||
|
||||
|
||||
// CODE SEGMENT 2 CAPTURE AFFINITY
|
||||
// CODE SEGMENT 2 CAPTURE AFFINITY
|
||||
|
||||
// Set up buffer for affinity of n threads
|
||||
// Set up buffer for affinity of n threads
|
||||
|
||||
n = omp_get_num_procs();
|
||||
buffer = (char **)malloc( sizeof(char *) * n );
|
||||
for(i=0;i<n;i++){ buffer[i]=(char *)malloc( sizeof(char) * BUFFER_STORE); }
|
||||
for(i=0;i<n;i++){
|
||||
buffer[i]=(char *)malloc( sizeof(char) * BUFFER_STORE);
|
||||
}
|
||||
|
||||
// Capture Affinity using Affinity Format set above.
|
||||
// Use max reduction to check size of buffer areas
|
||||
// Capture Affinity using Affinity Format set above.
|
||||
// Use max reduction to check size of buffer areas
|
||||
max_req_store = 0;
|
||||
#pragma omp parallel private(thrd_num,nchars) reduction(max:max_req_store)
|
||||
#pragma omp parallel private(thrd_num,nchars) \
|
||||
reduction(max:max_req_store)
|
||||
{
|
||||
if(omp_get_num_threads()>n) exit(1); //safety: don't exceed # of buffers
|
||||
//safety: don't exceed # of buffers
|
||||
if(omp_get_num_threads()>n) exit(1);
|
||||
|
||||
thrd_num=omp_get_thread_num();
|
||||
nchars=omp_capture_affinity(buffer[thrd_num],(size_t)BUFFER_STORE,NULL);
|
||||
nchars=omp_capture_affinity(buffer[thrd_num],
|
||||
(size_t)BUFFER_STORE,NULL);
|
||||
if(nchars > max_req_store) max_req_store=nchars;
|
||||
|
||||
// ...
|
||||
}
|
||||
|
||||
for(i=0;i<n;i++){
|
||||
printf("thrd_num= %d, affinity: %s\n", i,buffer[i]);
|
||||
for(i=0;i<n;i++){
|
||||
printf("thrd_num= %d, affinity: %s\n", i,buffer[i]);
|
||||
}
|
||||
// For 4 threads with OMP_PLACES='{0,1},{2,3},{4,5},{6,7}'
|
||||
// Format host=%20H thrd_num=%0.4n binds_to=%A
|
||||
// For 4 threads with OMP_PLACES='{0,1},{2,3},{4,5},{6,7}'
|
||||
// Format host=%20H thrd_num=%0.4n binds_to=%A
|
||||
|
||||
// affinity: host=hpc.cn567 thrd_num=0000 binds_to=0,1
|
||||
// affinity: host=hpc.cn567 thrd_num=0001 binds_to=2,3
|
||||
// affinity: host=hpc.cn567 thrd_num=0002 binds_to=4,5
|
||||
// affinity: host=hpc.cn567 thrd_num=0003 binds_to=6,7
|
||||
// affinity: host=hpc.cn567 thrd_num=0000 binds_to=0,1
|
||||
// affinity: host=hpc.cn567 thrd_num=0001 binds_to=2,3
|
||||
// affinity: host=hpc.cn567 thrd_num=0002 binds_to=4,5
|
||||
// affinity: host=hpc.cn567 thrd_num=0003 binds_to=6,7
|
||||
|
||||
|
||||
if(max_req_store>=BUFFER_STORE){
|
||||
|
@ -1,9 +1,8 @@
|
||||
! @@name: affinity_display.3.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
! @@version: omp_5.0
|
||||
! @@name: affinity_display.3
|
||||
! @@type: F-free
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_5.0
|
||||
program affinity_display
|
||||
use omp_lib
|
||||
implicit none
|
||||
|
@ -1,8 +1,7 @@
|
||||
/*
|
||||
* @@name: affinity_query.1c
|
||||
* @@name: affinity_query.1
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: omp_4.5
|
||||
*/
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: affinity_query.1f
|
||||
! @@name: affinity_query.1
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_4.5
|
||||
subroutine socket_init(socket_num)
|
||||
|
@ -1,29 +1,33 @@
|
||||
\section{Task Affinity}
|
||||
\label{sec: task_affinity}
|
||||
\index{affinity!task affinity}
|
||||
\index{affinity!affinity clause@\kcode{affinity} clause}
|
||||
\index{clauses!affinity@\kcode{affinity}}
|
||||
\index{affinity clause@\kcode{affinity} clause}
|
||||
|
||||
The next example illustrates the use of the \code{affinity}
|
||||
clause with a \code{task} construct.
|
||||
The variables in the \code{affinity} clause provide a
|
||||
The next example illustrates the use of the \kcode{affinity}
|
||||
clause with a \kcode{task} construct.
|
||||
The variables in the \kcode{affinity} clause provide a
|
||||
hint to the runtime that the task should execute
|
||||
"close" to the physical storage location of the variables. For example,
|
||||
``close'' to the physical storage location of the variables. For example,
|
||||
on a two-socket platform with a local memory component
|
||||
close to each processor socket, the runtime will attempt to
|
||||
schedule the task execution on the socket where the storage is located.
|
||||
|
||||
Because the C/C++ code employs a pointer, an array section is used in
|
||||
the \code{affinity} clause.
|
||||
the \kcode{affinity} clause.
|
||||
Fortran code can use an array reference to specify the storage, as
|
||||
shown here.
|
||||
|
||||
Note, in the second task of the C/C++ code the \plc{B} pointer is declared
|
||||
Note, in the second task of the C/C++ code the \ucode{B} pointer is declared
|
||||
shared. Otherwise, by default, it would be firstprivate since it is a local
|
||||
variable, and would probably be saved for the second task before being assigned
|
||||
a storage address by the first task. Also, one might think it reasonable to use
|
||||
the \code{affinity} clause \plc{affinity(B[:N])} on the second \code{task} construct.
|
||||
However, the storage behind \plc{B} is created in the first task, and the
|
||||
the \kcode{affinity} clause \kcode{affinity(\ucode{B[:N]})} on the second \kcode{task} construct.
|
||||
However, the storage behind \ucode{B} is created in the first task, and the
|
||||
array section reference may not be valid when the second task is generated.
|
||||
The use of the \plc{A} array is sufficient for this case, because one
|
||||
would expect the storage for \plc{A} and \plc{B} would be physically "close"
|
||||
The use of the \ucode{A} array is sufficient for this case, because one
|
||||
would expect the storage for \ucode{A} and \ucode{B} to be physically ``close''
|
||||
(as provided by the hint in the first task).
|
||||
|
||||
\cexample[5.0]{affinity}{6}
|
||||
|
@ -1,33 +1,67 @@
|
||||
\pagebreak
|
||||
\section{Fortran \code{ASSOCIATE} Construct}
|
||||
\fortranspecificstart
|
||||
%\pagebreak
|
||||
\begin{fortranspecific}[4ex]
|
||||
\section{Fortran \bcode{ASSOCIATE} Construct}
|
||||
\label{sec:associate}
|
||||
\index{ASSOCIATE construct, Fortran@\bcode{ASSOCIATE} construct, Fortran}
|
||||
|
||||
The following is an invalid example of specifying an associate name on a data-sharing attribute
|
||||
clause. The constraint in the Data Sharing Attribute Rules section in the OpenMP
|
||||
4.0 API Specifications states that an associate name preserves the association
|
||||
with the selector established at the \code{ASSOCIATE} statement. The associate
|
||||
name \plc{b} is associated with the shared variable \plc{a}. With the predetermined data-sharing
|
||||
attribute rule, the associate name \plc{b} is not allowed to be specified on the \code{private}
|
||||
clause. The constraint in the \docref{Data Sharing Attribute Rules} section in the OpenMP
|
||||
4.0 API Specification states that an associate name preserves the association
|
||||
with the selector established at the \bcode{ASSOCIATE} statement. The associate
|
||||
name \ucode{b} is associated with the shared variable \ucode{a}. With the predetermined data-sharing
|
||||
attribute rule, the associate name \ucode{b} is not allowed to be specified on the \kcode{private}
|
||||
clause.
|
||||
|
||||
%\pagebreak
|
||||
\fnexample[4.0]{associate}{1}
|
||||
|
||||
In next example, within the \code{parallel} construct, the association name \plc{thread\_id}
|
||||
is associated with the private copy of \plc{i}. The print statement should output the
|
||||
In the next example, within the \kcode{parallel} construct, the associate name \ucode{thread_id}
|
||||
is associated with the private copy of \ucode{i}. The print statement should output the
|
||||
unique thread number.
|
||||
|
||||
\fnexample[4.0]{associate}{2}
|
||||
|
||||
The following example illustrates the effect of specifying a selector name on a data-sharing
|
||||
attribute clause. The associate name \plc{u} is associated with \plc{v} and the variable \plc{v}
|
||||
is specified on the \code{private} clause of the \code{parallel} construct.
|
||||
The construct association is established prior to the \code{parallel} region.
|
||||
The association between \plc{u} and the original \plc{v} is retained (see the Data Sharing
|
||||
Attribute Rules section in the OpenMP 4.0 API Specifications). Inside the \code{parallel}
|
||||
region, \plc{v} has the value of -1 and \plc{u} has the value of the original \plc{v}.
|
||||
attribute clause. The associate name \ucode{u} is associated with \ucode{v} and the variable \ucode{v}
|
||||
is specified on the \kcode{private} clause of the \kcode{parallel} construct.
|
||||
The construct association is established prior to the \kcode{parallel} region.
|
||||
The association between \ucode{u} and the original \ucode{v} is retained (see the \docref{Data Sharing
|
||||
Attribute Rules} section in the OpenMP 4.0 API Specification). Inside the \kcode{parallel}
|
||||
region, \ucode{v} has the value of -1 and \ucode{u} has the value of the original \ucode{v}.
|
||||
|
||||
\pagebreak
|
||||
\topmarker{Fortran}
|
||||
\ffreenexample[4.0]{associate}{3}
|
||||
\fortranspecificend
|
||||
|
||||
%\topmarker{Fortran}
|
||||
\label{sec:associate_target}
|
||||
|
||||
\bigskip
|
||||
The following example illustrates mapping behavior for a Fortran
|
||||
associate name and its selector for a \kcode{target} construct.
|
||||
|
||||
For the first 3 \kcode{target} constructs the associate name \ucode{a_aray} is
|
||||
associated with the selector \ucode{aray}, an array.
|
||||
For the \kcode{target} construct of code block TARGET 1 just the selector
|
||||
\ucode{aray} is used and is implicitly mapped,
|
||||
likewise for the associate name \ucode{a_aray} in the TARGET 2 block.
|
||||
However, mapping an associate name and its selector is not valid for the same
|
||||
\kcode{target} construct. Hence the TARGET 3 block is non-conforming.
|
||||
|
||||
|
||||
In TARGET 4, the \ucode{scalr} selector used in the \kcode{target} region
|
||||
has an implicit data-sharing attribute of firstprivate since it is a scalar.
|
||||
Hence, the assigned value is not returned.
|
||||
In TARGET 5, the associate name \ucode{a_scalr} is implicitly mapped and the
|
||||
assigned value is returned to the host (default \kcode{tofrom} mapping behavior).
|
||||
In TARGET 6, the use of the associate name and its selector in the \kcode{target}
|
||||
region is conforming because the scalar firstprivate behavior of the selector
|
||||
and the implicit mapping of the associate name are allowed.
|
||||
At the end of the \kcode{target} region only the
|
||||
associate name's value is returned to the host.
|
||||
In TARGET 7, the selector and associate name appear in
|
||||
an explicit mapping for the same \kcode{target} construct,
|
||||
hence the code block is non-conforming.
|
||||
|
||||
\ffreenexample[5.1]{associate}{4}
|
||||
\end{fortranspecific}
|
||||
|
||||
|
@ -1,37 +1,39 @@
|
||||
\pagebreak
|
||||
\section{C/C++ Arrays in a \code{firstprivate} Clause}
|
||||
\ccppspecificstart
|
||||
%\pagebreak
|
||||
\begin{ccppspecific}[4ex]
|
||||
\section{C/C++ Arrays in a \kcode{firstprivate} Clause}
|
||||
\label{sec:carrays_fpriv}
|
||||
\index{clauses!firstprivate@\kcode{firstprivate}}
|
||||
\index{firstprivate clause@\kcode{firstprivate} clause!C/C++ arrays in}
|
||||
|
||||
The following example illustrates the size and value of list items of array or
|
||||
pointer type in a \code{firstprivate} clause . The size of new list items is
|
||||
pointer type in a \kcode{firstprivate} clause. The size of new list items is
|
||||
based on the type of the corresponding original list item, as determined by the
|
||||
base language.
|
||||
|
||||
In this example:
|
||||
|
||||
\begin{compactitem}
|
||||
\item The type of \code{A} is array of two arrays of two ints.
|
||||
\item The type of \ucode{A} is array of two arrays of two \bcode{int}s.
|
||||
|
||||
\item The type of \code{B} is adjusted to pointer to array of \code{n}
|
||||
ints, because it is a function parameter.
|
||||
\item The type of \ucode{B} is adjusted to pointer to array of \ucode{n}
|
||||
\bcode{int}s, because it is a function parameter.
|
||||
|
||||
\item The type of \code{C} is adjusted to pointer to int, because
|
||||
\item The type of \ucode{C} is adjusted to pointer to \bcode{int}, because
|
||||
it is a function parameter.
|
||||
|
||||
\item The type of \code{D} is array of two arrays of two ints.
|
||||
\item The type of \ucode{D} is array of two arrays of two \bcode{int}s.
|
||||
|
||||
\item The type of \code{E} is array of \code{n} arrays of \code{n}
|
||||
ints.
|
||||
\item The type of \ucode{E} is array of \ucode{n} arrays of \ucode{n}
|
||||
\bcode{int}s.
|
||||
\end{compactitem}
|
||||
|
||||
Note that \code{B} and \code{E} involve variable length array types.
|
||||
Note that \ucode{B} and \ucode{E} involve variable length array types.
|
||||
|
||||
The new items of array type are initialized as if each integer element of the original
|
||||
array is assigned to the corresponding element of the new array. Those of pointer
|
||||
type are initialized as if by assignment from the original item to the new item.
|
||||
|
||||
\cnexample{carrays_fpriv}{1}
|
||||
\ccppspecificend
|
||||
\end{ccppspecific}
|
||||
|
||||
|
||||
|
@ -1,9 +1,13 @@
|
||||
\pagebreak
|
||||
\section{\code{copyin} Clause}
|
||||
%\pagebreak
|
||||
\section{\kcode{copyin} Clause}
|
||||
\label{sec:copyin}
|
||||
\index{clauses!copyin@\kcode{copyin}}
|
||||
\index{copyin clause@\kcode{copyin} clause}
|
||||
\index{directives!threadprivate@\kcode{threadprivate}}
|
||||
\index{threadprivate directive@\kcode{threadprivate} directive}
|
||||
|
||||
The \code{copyin} clause is used to initialize threadprivate data upon entry
|
||||
to a \code{parallel} region. The value of the threadprivate variable in the primary
|
||||
The \kcode{copyin} clause is used to initialize threadprivate data upon entry
|
||||
to a \kcode{parallel} region. The value of the threadprivate variable in the primary
|
||||
thread is copied to the threadprivate variable of each other team member.
|
||||
|
||||
\cexample{copyin}{1}
|
||||
|
@ -1,18 +1,22 @@
|
||||
\pagebreak
|
||||
\section{\code{copyprivate} Clause}
|
||||
%\pagebreak
|
||||
\section{\kcode{copyprivate} Clause}
|
||||
\label{sec:copyprivate}
|
||||
\index{clauses!copyprivate@\kcode{copyprivate}}
|
||||
\index{copyprivate clause@\kcode{copyprivate} clause}
|
||||
|
||||
The \code{copyprivate} clause can be used to broadcast values acquired by a single
|
||||
The \kcode{copyprivate} clause can be used to broadcast values acquired by a single
|
||||
thread directly to all instances of the private variables in the other threads.
|
||||
In this example, if the routine is called from the sequential part, its behavior
|
||||
is not affected by the presence of the directives. If it is called from a \code{parallel}
|
||||
region, then the actual arguments with which \code{a} and \code{b} are associated
|
||||
is not affected by the presence of the directives. If it is called from a \kcode{parallel}
|
||||
region, then the actual arguments with which \ucode{a} and \ucode{b} are associated
|
||||
must be private.
|
||||
|
||||
The thread that executes the structured block associated with the \code{single}
|
||||
construct broadcasts the values of the private variables \code{a}, \code{b},
|
||||
\code{x}, and
|
||||
\code{y} from its implicit task's data environment to the data environments
|
||||
\index{constructs!single@\kcode{single}}
|
||||
\index{single construct@\kcode{single} construct}
|
||||
The thread that executes the structured block associated with the \kcode{single}
|
||||
construct broadcasts the values of the private variables \ucode{a}, \ucode{b},
|
||||
\ucode{x}, and
|
||||
\ucode{y} from its implicit task's data environment to the data environments
|
||||
of the other implicit tasks in the thread team. The broadcast completes before
|
||||
any of the threads have left the barrier at the end of the construct.
|
||||
|
||||
@ -20,32 +24,34 @@ any of the threads have left the barrier at the end of the construct.
|
||||
|
||||
\fexample{copyprivate}{1}
|
||||
|
||||
\index{constructs!masked@\kcode{masked}}
|
||||
\index{masked construct@\kcode{masked} construct}
|
||||
In this example, assume that the input must be performed by the primary thread.
|
||||
Since the \code{masked} construct does not support the \code{copyprivate} clause,
|
||||
it cannot broadcast the input value that is read. However, \code{copyprivate}
|
||||
Since the \kcode{masked} construct does not support the \kcode{copyprivate} clause,
|
||||
it cannot broadcast the input value that is read. However, \kcode{copyprivate}
|
||||
is used to broadcast an address where the input value is stored.
|
||||
|
||||
\cexample[5.1]{copyprivate}{2}
|
||||
|
||||
\fexample[5.1]{copyprivate}{2}[1]
|
||||
\fexample[5.1]{copyprivate}{2}
|
||||
|
||||
Suppose that the number of lock variables required within a \code{parallel} region
|
||||
cannot easily be determined prior to entering it. The \code{copyprivate} clause
|
||||
Suppose that the number of lock variables required within a \kcode{parallel} region
|
||||
cannot easily be determined prior to entering it. The \kcode{copyprivate} clause
|
||||
can be used to provide access to shared lock variables that are allocated within
|
||||
that \code{parallel} region.
|
||||
that \kcode{parallel} region.
|
||||
|
||||
\cexample{copyprivate}{3}
|
||||
|
||||
\fortranspecificstart
|
||||
\begin{fortranspecific}
|
||||
\fnexample{copyprivate}{3}
|
||||
|
||||
Note that the effect of the \code{copyprivate} clause on a variable with the
|
||||
\code{allocatable} attribute is different than on a variable with the \code{pointer}
|
||||
attribute. The value of \code{A} is copied (as if by intrinsic assignment) and
|
||||
the pointer \code{B} is copied (as if by pointer assignment) to the corresponding
|
||||
list items in the other implicit tasks belonging to the \code{parallel} region.
|
||||
Note that the effect of the \kcode{copyprivate} clause on a variable with the
|
||||
\bcode{allocatable} attribute is different than on a variable with the \bcode{pointer}
|
||||
attribute. The value of \ucode{A} is copied (as if by intrinsic assignment) and
|
||||
the pointer \ucode{B} is copied (as if by pointer assignment) to the corresponding
|
||||
list items in the other implicit tasks belonging to the \kcode{parallel} region.
|
||||
|
||||
\fnexample{copyprivate}{4}
|
||||
\fortranspecificend
|
||||
\end{fortranspecific}
|
||||
|
||||
|
||||
|
@ -1,14 +1,16 @@
|
||||
\begin{cppspecific}[4ex]
|
||||
\section{C++ Reference in Data-Sharing Clauses}
|
||||
\cppspecificstart
|
||||
\label{sec:cpp_reference}
|
||||
\index{clauses!data-sharing, C++ reference in}
|
||||
\index{data-sharing clauses, C++ reference in}
|
||||
|
||||
C++ reference types are allowed in data-sharing attribute clauses as of OpenMP 4.5, except
|
||||
for the \code{threadprivate}, \code{copyin} and \code{copyprivate} clauses.
|
||||
(See the Data-Sharing Attribute Clauses Section of the 4.5 OpenMP specification.)
|
||||
for the \kcode{threadprivate}, \kcode{copyin} and \kcode{copyprivate} clauses.
|
||||
(See the \docref{Data-Sharing Attribute Clauses} section of the 4.5 OpenMP specification.)
|
||||
When a variable with C++ reference type is privatized, the object the reference refers to is privatized in addition to the reference itself.
|
||||
The following example shows the use of reference types in data-sharing clauses in the usual way.
|
||||
Additionally it shows how the data-sharing of formal arguments with a C++ reference type on an orphaned task generating construct is determined implicitly. (See the Data-sharing Attribute Rules for Variables Referenced in a Construct Section of the 4.5 OpenMP specification.)
|
||||
Additionally it shows how the data-sharing of formal arguments with a C++ reference type on an orphaned task generating construct is determined implicitly. (See the \docref{Data-sharing Attribute Rules for Variables Referenced in a Construct} section of the 4.5 OpenMP specification.)
|
||||
|
||||
|
||||
\cppnexample[4.5]{cpp_reference}{1}
|
||||
\cppspecificend
|
||||
\end{cppspecific}
|
||||
|
@ -1,18 +1,20 @@
|
||||
\pagebreak
|
||||
\section{\code{default(none)} Clause}
|
||||
%\pagebreak
|
||||
\section{\kcode{default(none)} Clause}
|
||||
\label{sec:default_none}
|
||||
\index{clauses!default(none)@\kcode{default(none)}}
|
||||
\index{default(none) clause@\kcode{default(none)} clause}
|
||||
|
||||
The following example distinguishes the variables that are affected by the \code{default(none)}
|
||||
The following example distinguishes the variables that are affected by the \kcode{default(none)}
|
||||
clause from those that are not.
|
||||
|
||||
\ccppspecificstart
|
||||
Beginning with OpenMP 4.0, variables with \code{const}-qualified type and no mutable member
|
||||
are no longer predetermined shared. Thus, these variables (variable \plc{c} in the example)
|
||||
\begin{ccppspecific}
|
||||
Beginning with OpenMP 4.0, variables with \bcode{const}-qualified type and no mutable member
|
||||
are no longer predetermined shared. Thus, these variables (variable \ucode{c} in the example)
|
||||
need to be explicitly listed
|
||||
in data-sharing attribute clauses when the \code{default(none)} clause is specified.
|
||||
in data-sharing attribute clauses when the \kcode{default(none)} clause is specified.
|
||||
|
||||
\cnexample{default_none}{1}
|
||||
\ccppspecificend
|
||||
\end{ccppspecific}
|
||||
|
||||
\fexample{default_none}{1}
|
||||
|
||||
|
@ -1,13 +1,15 @@
|
||||
\pagebreak
|
||||
%\pagebreak
|
||||
\begin{fortranspecific}[4ex]
|
||||
\section{Fortran Private Loop Iteration Variables}
|
||||
\label{sec:fort_loopvar}
|
||||
\fortranspecificstart
|
||||
\index{loop variables, Fortran}
|
||||
|
||||
In general, loop iteration variables will be private when used in the \plc{do-loop}
|
||||
of a \code{do} and \code{parallel do} construct or in sequential loops in a
|
||||
\code{parallel} construct (see Section 2.7.1 and Section 2.14.1 of
|
||||
of a \kcode{do} and \kcode{parallel do} construct or in sequential loops in a
|
||||
\kcode{parallel} construct (see the \docref{Loop Construct} section and
|
||||
the \docref{Data-sharing Attribute Rules} section of
|
||||
the OpenMP 4.0 specification). In the following example of a sequential
|
||||
loop in a \code{parallel} construct the loop iteration variable \plc{I} will
|
||||
loop in a \kcode{parallel} construct the loop iteration variable \ucode{I} will
|
||||
be private.
|
||||
|
||||
\ffreenexample{fort_loopvar}{1}
|
||||
@ -19,5 +21,5 @@ example:
|
||||
|
||||
Note however that the use of shared loop iteration variables can easily lead to
|
||||
race conditions.
|
||||
\fortranspecificend
|
||||
\end{fortranspecific}
|
||||
|
||||
|
@ -1,24 +1,24 @@
|
||||
\pagebreak
|
||||
\section{Fortran Restrictions on Storage Association with the \code{private} Clause}
|
||||
\fortranspecificstart
|
||||
%\pagebreak
|
||||
\begin{fortranspecific}[4ex]
|
||||
\section{Fortran Restrictions on Storage Association with the \kcode{private} Clause}
|
||||
\label{sec:fort_sa_private}
|
||||
\index{clauses!private@\kcode{private}}
|
||||
\index{private clause@\kcode{private} clause!storage association, Fortran}
|
||||
|
||||
The following non-conforming examples illustrate the implications of the \code{private}
|
||||
The following non-conforming examples illustrate the implications of the \kcode{private}
|
||||
clause rules with regard to storage association.
|
||||
|
||||
\pagebreak
|
||||
\fnexample{fort_sa_private}{1}
|
||||
|
||||
\topmarker{Fortran}
|
||||
\fnexample{fort_sa_private}{2}
|
||||
\clearpage
|
||||
|
||||
\fnexample{fort_sa_private}{3}
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
|
||||
\fnexample{fort_sa_private}{4}
|
||||
|
||||
\topmarker{Fortran}
|
||||
\fnexample[5.1]{fort_sa_private}{5}
|
||||
\fortranspecificend
|
||||
\end{fortranspecific}
|
||||
|
||||
|
45
data_environment/fort_shared_var.tex
Normal file
45
data_environment/fort_shared_var.tex
Normal file
@ -0,0 +1,45 @@
|
||||
%\pagebreak
|
||||
\begin{fortranspecific}[4ex]
|
||||
\section{Passing Shared Variable to Procedure in Fortran}
|
||||
\label{sec:fort_shared_var}
|
||||
\index{clauses!shared@\kcode{shared}}
|
||||
\index{shared clause@\kcode{shared} clause!storage association, Fortran}
|
||||
|
||||
Passing a shared variable to a procedure in Fortran may result in the use of
|
||||
temporary storage in place of the actual argument when the corresponding dummy
|
||||
argument does not have the \bcode{VALUE} or \bcode{CONTIGUOUS} attribute and
|
||||
its data-sharing attribute is implementation-defined as per the rules in
|
||||
Section \docref{Variables Referenced in a Region but not in a Construct} of
|
||||
the OpenMP Specification.
|
||||
These conditions effectively result in references to, and definitions of, the
|
||||
temporary storage during the procedure reference. Furthermore, the value of the
|
||||
shared variable is copied into the intervening temporary storage before the
|
||||
procedure reference when the dummy argument does not have the
|
||||
\bcode{INTENT(OUT)} attribute, and is copied out of the temporary storage into
|
||||
the shared variable when the dummy argument does not have the
|
||||
\bcode{INTENT(IN)} attribute. Any references to (or definitions of) the shared
|
||||
storage that is associated with the dummy argument by any other task must be
|
||||
synchronized with the procedure reference to avoid possible data races.
|
||||
|
||||
The following examples illustrate the implications of passing a shared
|
||||
variable \ucode{a} to subroutine \ucode{sub1} or \ucode{sub2} in
|
||||
a \kcode{parallel} region.
|
||||
For \ucode{sub1}, an implementation may or may not generate a copy-in/copy-out
|
||||
for the temporary storage associated with variable \ucode{b}.
|
||||
If there is a copy-in/copy-out, the code for copy-in/copy-out will result in
|
||||
a race condition, even though there is an \kcode{atomic}
|
||||
directive for the update of variable \ucode{b(i)} in the subroutine.
|
||||
If the implementation can create a temporary descriptor for \ucode{a(::2)}
|
||||
with the correct stride and pass it to subroutine \ucode{sub1},
|
||||
the same memory is accessed inside the subroutine and the result
|
||||
(\ucode{sum1}) is then well defined.
|
||||
For \ucode{sub2}, there is the \bcode{CONTIGUOUS} attribute for
|
||||
variable \ucode{b} and the implementation will generate a copy-in/copy-out
|
||||
for the temporary storage.
|
||||
The code will have a race condition and the result (\ucode{sum2}) is
|
||||
not well defined.
|
||||
|
||||
\topmarker{Fortran}
|
||||
\ffreenexample{fort_shared_var}{1}
|
||||
\end{fortranspecific}
|
||||
|
@ -1,32 +1,33 @@
|
||||
\pagebreak
|
||||
\section{Fortran Restrictions on \code{shared} and \code{private} Clauses with Common Blocks}
|
||||
\fortranspecificstart
|
||||
%\pagebreak
|
||||
\begin{fortranspecific}[4ex]
|
||||
\section{Fortran Restrictions on \kcode{shared} and \kcode{private} Clauses with Common Blocks}
|
||||
\label{sec:fort_sp_common}
|
||||
\index{clauses!private@\kcode{private}}
|
||||
\index{clauses!shared@\kcode{shared}}
|
||||
\index{private clause@\kcode{private} clause!common blocks, Fortran}
|
||||
\index{shared clause@\kcode{shared} clause!common blocks, Fortran}
|
||||
|
||||
When a named common block is specified in a \code{private}, \code{firstprivate},
|
||||
or \code{lastprivate} clause of a construct, none of its members may be declared
|
||||
When a named common block is specified in a \kcode{private}, \kcode{firstprivate},
|
||||
or \kcode{lastprivate} clause of a construct, none of its members may be declared
|
||||
in another data-sharing attribute clause on that construct. The following examples
|
||||
illustrate this point.
|
||||
|
||||
The following example is conforming:
|
||||
|
||||
\pagebreak
|
||||
\fnexample{fort_sp_common}{1}
|
||||
|
||||
The following example is also conforming:
|
||||
|
||||
\fnexample{fort_sp_common}{2}
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
%\begin{figure}[t!]
|
||||
%\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
%\end{figure}
|
||||
\clearpage
|
||||
\topmarker{Fortran}
|
||||
|
||||
The following example is conforming:
|
||||
|
||||
\fnexample{fort_sp_common}{3}
|
||||
|
||||
The following example is non-conforming because \code{x} is a constituent element
|
||||
of \code{c}:
|
||||
The following example is non-conforming because \ucode{x} is a constituent element
|
||||
of \ucode{c}:
|
||||
|
||||
\fnexample{fort_sp_common}{4}
|
||||
|
||||
@ -34,6 +35,6 @@ The following example is non-conforming because a common block may not be declar
|
||||
both shared and private:
|
||||
|
||||
\fnexample{fort_sp_common}{5}
|
||||
\fortranspecificend
|
||||
\end{fortranspecific}
|
||||
|
||||
|
||||
|
67
data_environment/induction.tex
Normal file
67
data_environment/induction.tex
Normal file
@ -0,0 +1,67 @@
|
||||
%\pagebreak
|
||||
|
||||
\section{Induction}
|
||||
\label{sec:induction}
|
||||
|
||||
This section covers ways to perform inductions in \kcode{distribute}, worksharing-loop, \kcode{taskloop}, and SIMD regions.
|
||||
|
||||
\subsection{\kcode{induction} Clause}
|
||||
\label{subsec:induction}
|
||||
\index{clauses!induction@\kcode{induction}}
|
||||
\index{induction clause@\kcode{induction} clause}
|
||||
\index{inductions!induction clause@\kcode{induction} clause}
|
||||
\index{inductions!closed form}
|
||||
|
||||
The following example demonstrates the basic use of the \kcode{induction} clause
|
||||
in Case 1 for variable \ucode{xi} in a loop in routine \ucode{comp_poly} to
|
||||
evaluate the polynomial of variable \ucode{x}.
|
||||
For this case, the induction operation is performed
|
||||
with the inductor `\scode{*}' and induction step \ucode{x}.
|
||||
The intermediate value of \ucode{xi} is used in producing
|
||||
the reduction sum \ucode{result}.
|
||||
The last value of \ucode{xi} is well defined after the loop and
|
||||
is printed out together with the final value of \ucode{result}.
|
||||
An alternative approach is to use an \plc{inscan} reduction
|
||||
as illustrated in Case 2, but this may not be as efficient as Case 1.
|
||||
An equivalent code without the \kcode{induction} clause is given in Case 3
|
||||
where a non-recursive closed form of the induction operation is used to
|
||||
compute the intermediate value of \ucode{xi}.
|
||||
The last value of \ucode{xi} is returned with the \kcode{lastprivate} clause
|
||||
for this case.
|
||||
|
||||
\cexample[6.0]{induction}{1}
|
||||
|
||||
\ffreeexample[6.0]{induction}{1}
|
||||
|
||||
\subsection{User-defined Induction}
|
||||
\label{subsec:user-defined-induction}
|
||||
|
||||
\index{directives!declare induction@\kcode{declare induction}}
|
||||
\index{declare induction directive@\kcode{declare induction} directive}
|
||||
\index{inductions!declare induction directive@\kcode{declare induction} directive}
|
||||
\index{inductions!inductor clause@\kcode{inductor} clause}
|
||||
\index{inductions!collector clause@\kcode{collector} clause}
|
||||
\index{inductions!user-defined}
|
||||
\index{OpenMP variable identifiers!omp_var@\kcode{omp_var}}
|
||||
\index{OpenMP variable identifiers!omp_step@\kcode{omp_step}}
|
||||
\index{OpenMP variable identifiers!omp_idx@\kcode{omp_idx}}
|
||||
|
||||
The following is a user-defined induction example that uses the
|
||||
\kcode{declare induction} directive and the \kcode{induction} clause.
|
||||
The example processes in parallel $N$ points along a line of a given slope
|
||||
starting from a given point, where adjacent points are separated by
|
||||
a fixed distance.
|
||||
The induction variable \ucode{P} represents a point, and
|
||||
the step expression is the distance. The induction identifier \ucode{next}
|
||||
is defined in the \kcode{declare induction} directive with an
|
||||
appropriate \plc{inductor} via the \kcode{inductor} clause and
|
||||
\plc{collector} via the \kcode{collector} clause.
|
||||
This identifier together with the \kcode{step(\ucode{Separation})}
|
||||
modifier is specified in the \kcode{induction} clause
|
||||
for the \kcode{parallel for}/\kcode{do} construct
|
||||
in routine \ucode{processPointsInLine}.
|
||||
|
||||
\cppexample[6.0]{induction}{2}
|
||||
|
||||
\ffreeexample[6.0]{induction}{2}
|
||||
|
@ -1,9 +1,11 @@
|
||||
\pagebreak
|
||||
\section{\code{lastprivate} Clause}
|
||||
%\pagebreak
|
||||
\section{\kcode{lastprivate} Clause}
|
||||
\label{sec:lastprivate}
|
||||
\index{clauses!lastprivate@\kcode{lastprivate}}
|
||||
\index{lastprivate clause@\kcode{lastprivate} clause}
|
||||
|
||||
Correct execution sometimes depends on the value that the last iteration of a loop
|
||||
assigns to a variable. Such programs must list all such variables in a \code{lastprivate}
|
||||
assigns to a variable. Such programs must list all such variables in a \kcode{lastprivate}
|
||||
clause so that the values of the variables are the same as when the loop is executed
|
||||
sequentially.
|
||||
|
||||
@ -11,9 +13,10 @@ sequentially.
|
||||
|
||||
\fexample{lastprivate}{1}
|
||||
|
||||
\clearpage
|
||||
The next example illustrates the use of the \code{conditional} modifier in
|
||||
a \code{lastprivate} clause to return the last value when it may not come from
|
||||
\index{lastprivate clause@\kcode{lastprivate} clause!conditional modifier@\kcode{conditional} modifier}
|
||||
\index{conditional modifier@\kcode{conditional} modifier}
|
||||
The next example illustrates the use of the \kcode{conditional} modifier in
|
||||
a \kcode{lastprivate} clause to return the last value when it may not come from
|
||||
the last iteration of a loop.
|
||||
That is, users can preserve the serial equivalence semantics of the loop.
|
||||
The conditional lastprivate ensures the final value of the variable after the loop
|
||||
|
@ -1,26 +1,29 @@
|
||||
\pagebreak
|
||||
\section{\code{private} Clause}
|
||||
%\pagebreak
|
||||
\section{\kcode{private} Clause}
|
||||
\label{sec:private}
|
||||
\index{clauses!private@\kcode{private}}
|
||||
\index{private clause@\kcode{private} clause}
|
||||
|
||||
In the following example, the values of original list items \plc{i} and \plc{j}
|
||||
are retained on exit from the \code{parallel} region, while the private list
|
||||
items \plc{i} and \plc{j} are modified within the \code{parallel} construct.
|
||||
In the following example, the values of original list items \ucode{i} and \ucode{j}
|
||||
are retained on exit from the \kcode{parallel} region, while the private list
|
||||
items \ucode{i} and \ucode{j} are modified within the \kcode{parallel} construct.
|
||||
|
||||
\cexample{private}{1}
|
||||
|
||||
\fexample{private}{1}
|
||||
|
||||
In the following example, all uses of the variable \plc{a} within the loop construct
|
||||
in the routine \plc{f} refer to a private list item \plc{a}, while it is
|
||||
unspecified whether references to \plc{a} in the routine \plc{g} are to a
|
||||
\pagebreak
|
||||
In the following example, all uses of the variable \ucode{a} within the loop construct
|
||||
in the routine \ucode{f} refer to a private list item \ucode{a}, while it is
|
||||
unspecified whether references to \ucode{a} in the routine \ucode{g} are to a
|
||||
private list item or the original list item.
|
||||
|
||||
\cexample{private}{2}
|
||||
|
||||
\fexample{private}{2}
|
||||
|
||||
The following example demonstrates that a list item that appears in a \code{private}
|
||||
clause in a \code{parallel} construct may also appear in a \code{private}
|
||||
The following example demonstrates that a list item that appears in a \kcode{private}
|
||||
clause in a \kcode{parallel} construct may also appear in a \kcode{private}
|
||||
clause in an enclosed worksharing construct, which results in an additional private
|
||||
copy.
|
||||
|
||||
|
@ -5,12 +5,15 @@
|
||||
|
||||
This section covers ways to perform reductions in parallel, task, taskloop, and SIMD regions.
|
||||
|
||||
\subsection{\code{reduction} Clause}
|
||||
\subsection{\kcode{reduction} Clause}
|
||||
\label{subsec:reduction}
|
||||
\index{clauses!reduction@\kcode{reduction}}
|
||||
\index{reduction clause@\kcode{reduction} clause}
|
||||
\index{reductions!reduction clause@\kcode{reduction} clause}
|
||||
|
||||
The following example demonstrates the \code{reduction} clause; note that some
|
||||
reductions can be expressed in the loop in several ways, as shown for the \code{max}
|
||||
and \code{min} reductions below:
|
||||
The following example demonstrates the \kcode{reduction} clause; note that some
|
||||
reductions can be expressed in the loop in several ways, as shown for the \kcode{max}
|
||||
and \kcode{min} reductions below:
|
||||
|
||||
\cexample[3.1]{reduction}{1}
|
||||
|
||||
@ -23,50 +26,47 @@ written as follows:
|
||||
|
||||
\cexample{reduction}{2}
|
||||
|
||||
\fortranspecificstart
|
||||
\begin{fortranspecific}
|
||||
\ffreenexample{reduction}{2}
|
||||
|
||||
The following program is non-conforming because the reduction is on the
|
||||
\emph{intrinsic procedure name} \code{MAX} but that name has been redefined to be the variable
|
||||
named \code{MAX}.
|
||||
\emph{intrinsic procedure name} \bcode{MAX} but that name has been redefined to be the variable
|
||||
named \ucode{MAX}.
|
||||
|
||||
\ffreenexample{reduction}{3}
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
\topmarker{Fortran}
|
||||
|
||||
The following conforming program performs the reduction using the
|
||||
\emph{intrinsic procedure name} \code{MAX} even though the intrinsic \code{MAX} has been renamed
|
||||
to \code{REN}.
|
||||
\emph{intrinsic procedure name} \kcode{MAX} even though the intrinsic \bcode{MAX} has been renamed
|
||||
to \ucode{REN}.
|
||||
|
||||
\ffreenexample{reduction}{4}
|
||||
|
||||
The following conforming program performs the reduction using
|
||||
\plc{intrinsic procedure name} \code{MAX} even though the intrinsic \code{MAX} has been renamed
|
||||
to \code{MIN}.
|
||||
the \emph{intrinsic procedure name} \kcode{MAX} even though the intrinsic \bcode{MAX} has been renamed
|
||||
to \ucode{MIN}.
|
||||
|
||||
\ffreenexample{reduction}{5}
|
||||
\fortranspecificend
|
||||
\end{fortranspecific}
|
||||
|
||||
%\pagebreak
|
||||
The following example is non-conforming because the initialization (\code{a =
|
||||
0}) of the original list item \code{a} is not synchronized with the update of
|
||||
\code{a} as a result of the reduction computation in the \code{for} loop. Therefore,
|
||||
the example may print an incorrect value for \code{a}.
|
||||
The following example is non-conforming because the initialization (\ucode{a =
|
||||
0}) of the original list item \ucode{a} is not synchronized with the update of
|
||||
\ucode{a} as a result of the reduction computation in the \bcode{for} loop. Therefore,
|
||||
the example may print an incorrect value for \ucode{a}.
|
||||
|
||||
To avoid this problem, the initialization of the original list item \code{a}
|
||||
should complete before any update of \code{a} as a result of the \code{reduction}
|
||||
To avoid this problem, the initialization of the original list item \ucode{a}
|
||||
should complete before any update of \ucode{a} as a result of the \kcode{reduction}
|
||||
clause. This can be achieved by adding an explicit barrier after the assignment
|
||||
\code{a = 0}, or by enclosing the assignment \code{a = 0} in a \code{single}
|
||||
directive (which has an implied barrier), or by initializing \code{a} before
|
||||
the start of the \code{parallel} region.
|
||||
\ucode{a = 0}, or by enclosing the assignment \ucode{a = 0} in a \kcode{single}
|
||||
directive (which has an implied barrier), or by initializing \ucode{a} before
|
||||
the start of the \kcode{parallel} region.
|
||||
|
||||
\cexample[5.1]{reduction}{6}
|
||||
|
||||
\fexample[5.1]{reduction}{6}[1]
|
||||
\fexample[5.1]{reduction}{6}
|
||||
|
||||
The following example demonstrates the reduction of array \plc{a}. In C/C++ this is illustrated by the explicit use of an array section \plc{a[0:N]} in the \code{reduction} clause. The corresponding Fortran example uses array syntax supported in the base language. As of the OpenMP 4.5 specification the explicit use of array section in the \code{reduction} clause in Fortran is not permitted. But this oversight has been fixed in the OpenMP 5.0 specification.
|
||||
The following example demonstrates the reduction of array \ucode{a}. In C/C++ this is illustrated by the explicit use of an array section \ucode{a[0:N]} in the \kcode{reduction} clause. The corresponding Fortran example uses array syntax supported in the base language. As of the OpenMP 4.5 specification, the explicit use of an array section in the \kcode{reduction} clause in Fortran is not permitted, but this oversight has been fixed in the OpenMP 5.0 specification.
|
||||
|
||||
|
||||
\cexample[4.5]{reduction}{7}
|
||||
@ -75,21 +75,27 @@ The following example demonstrates the reduction of array \plc{a}. In C/C++ thi
|
||||
|
||||
\subsection{Task Reduction}
|
||||
\label{subsec:task_reduction}
|
||||
\index{clauses!task_reduction@\kcode{task_reduction}}
|
||||
\index{task_reduction clause@\kcode{task_reduction} clause}
|
||||
\index{reductions!task_reduction clause@\kcode{task_reduction} clause}
|
||||
\index{clauses!in_reduction@\kcode{in_reduction}}
|
||||
\index{in_reduction clause@\kcode{in_reduction} clause}
|
||||
\index{reductions!in_reduction clause@\kcode{in_reduction} clause}
|
||||
|
||||
In OpenMP 5.0 the \code{task\_reduction} clause was created for the \code{taskgroup} construct,
|
||||
to allow reductions among explicit tasks that have an \code{in\_reduction} clause.
|
||||
In OpenMP 5.0 the \kcode{task_reduction} clause was created for the \kcode{taskgroup} construct,
|
||||
to allow reductions among explicit tasks that have an \kcode{in_reduction} clause.
|
||||
|
||||
In the \plc{task\_reduction.1} example below a reduction is performed as the algorithm
|
||||
In the \example{task_reduction.1} example below a reduction is performed as the algorithm
|
||||
traverses a linked list. The reduction statement is assigned to be an explicit task using
|
||||
a \code{task} construct and is specified to be a reduction participant with
|
||||
the \code{in\_reduction} clause.
|
||||
A \code{taskgroup} construct encloses the tasks participating in the reduction, and
|
||||
specifies, with the \code{task\_reduction} clause, that the taskgroup has tasks participating
|
||||
in a reduction. After the \code{taskgroup} region the original variable will contain
|
||||
a \kcode{task} construct and is specified to be a reduction participant with
|
||||
the \kcode{in_reduction} clause.
|
||||
A \kcode{taskgroup} construct encloses the tasks participating in the reduction, and
|
||||
specifies, with the \kcode{task_reduction} clause, that the taskgroup has tasks participating
|
||||
in a reduction. After the \kcode{taskgroup} region the original variable will contain
|
||||
the final value of the reduction.
|
||||
|
||||
Note: The \plc{res} variable is private in the \plc{linked\_list\_sum} routine
|
||||
and is not required to be shared (as in the case of a \code{parallel} construct
|
||||
Note: The \ucode{res} variable is private in the \ucode{linked_list_sum} routine
|
||||
and is not required to be shared (as in the case of a \kcode{parallel} construct
|
||||
reduction).
|
||||
|
||||
|
||||
@ -97,34 +103,36 @@ reduction).
|
||||
|
||||
\ffreeexample[5.0]{task_reduction}{1}
|
||||
|
||||
In OpenMP 5.0 the \code{task} \plc{reduction-modifier} for the \code{reduction} clause was
|
||||
\index{reduction clause@\kcode{reduction} clause!task modifier@\kcode{task} modifier}
|
||||
\index{task modifier@\kcode{task} modifier}
|
||||
In OpenMP 5.0 the \kcode{task} \plc{reduction-modifier} for the \kcode{reduction} clause was
|
||||
introduced to provide a means of performing reductions among implicit and explicit tasks.
|
||||
|
||||
The \code{reduction} clause of a \code{parallel} or worksharing construct may
|
||||
specify the \code{task} \plc{reduction-modifier} to include explicit task reductions
|
||||
The \kcode{reduction} clause of a \kcode{parallel} or worksharing construct may
|
||||
specify the \kcode{task} \plc{reduction-modifier} to include explicit task reductions
|
||||
within their region, provided the reduction operators (\plc{reduction-identifiers})
|
||||
and variables (\plc{list items}) of the participating tasks match those of the
|
||||
and variables (list items) of the participating tasks match those of the
|
||||
implicit tasks.
|
||||
|
||||
There are 2 reduction use cases (identified by USE CASE \#) in the \plc{task\_reduction.2} example below.
|
||||
There are 2 reduction use cases (identified by USE CASE \#) in the \example{task_reduction.2} example below.
|
||||
|
||||
In USE CASE 1 a \code{task} modifier in the \code{reduction} clause
|
||||
of the \code{parallel} construct is used to include the reductions of any
|
||||
participating tasks, those with an \code{in\_reduction} clause and matching
|
||||
\plc{reduction-identifiers} (\code{+}) and list items (\code{x}).
|
||||
In USE CASE 1 a \kcode{task} modifier in the \kcode{reduction} clause
|
||||
of the \kcode{parallel} construct is used to include the reductions of any
|
||||
participating tasks, those with an \kcode{in_reduction} clause and matching
|
||||
\plc{reduction-identifiers} (\kcode{+}) and list items (\ucode{x}).
|
||||
|
||||
Note, a \code{taskgroup} construct (with a \code{task\_reduction} clause) in not
|
||||
Note, a \kcode{taskgroup} construct (with a \kcode{task_reduction} clause) is not
|
||||
necessary to scope the explicit task reduction (as seen in the example above).
|
||||
Hence, even without the implicit task reduction statement (without the C \code{x++\;}
|
||||
and Fortran \code{x=x+1} statements), the \code{task} \plc{reduction-modifier}
|
||||
in a \code{reduction} clause of the \code{parallel} construct
|
||||
can be used to avoid having to create a \code{taskgroup} construct
|
||||
(and its \code{task\_reduction} clause) around the task generating structure.
|
||||
Hence, even without the implicit task reduction statement (without the C \ucode{x++;}
|
||||
and Fortran \ucode{x=x+1} statements), the \kcode{task} \plc{reduction-modifier}
|
||||
in a \kcode{reduction} clause of the \kcode{parallel} construct
|
||||
can be used to avoid having to create a \kcode{taskgroup} construct
|
||||
(and its \kcode{task_reduction} clause) around the task generating structure.
|
||||
|
||||
In USE CASE 2 tasks participating in the reduction are within a
|
||||
worksharing region (a parallel worksharing-loop construct).
|
||||
Here, too, no \code{taskgroup} is required, and the \plc{reduction-identifier} (\code{+})
|
||||
and list item (variable \code{x}) match as required.
|
||||
Here, too, no \kcode{taskgroup} is required, and the \plc{reduction-identifier} (\kcode{+})
|
||||
and list item (variable \ucode{x}) match as required.
|
||||
|
||||
|
||||
\cexample[5.0]{task_reduction}{2}
|
||||
@ -134,36 +142,45 @@ and list item (variable \code{x}) match as required.
|
||||
|
||||
\subsection{Reduction on Combined Target Constructs}
|
||||
\label{subsec:target_reduction}
|
||||
\index{reduction clause@\kcode{reduction} clause!on target construct@on \kcode{target} construct}
|
||||
\index{constructs!target@\kcode{target}}
|
||||
\index{target construct@\kcode{target} construct}
|
||||
|
||||
When a \code{reduction} clause appears on a combined construct that combines
|
||||
a \code{target} construct with another construct, there is an implicit map
|
||||
of the list items with a \code{tofrom} map type for the \code{target} construct.
|
||||
When a \kcode{reduction} clause appears on a combined construct that combines
|
||||
a \kcode{target} construct with another construct, there is an implicit map
|
||||
of the list items with a \kcode{tofrom} map type for the \kcode{target} construct.
|
||||
Otherwise, the list items (if they are scalar variables) would be
|
||||
treated as firstprivate by default in the \code{target} construct, which
|
||||
treated as firstprivate by default in the \kcode{target} construct, which
|
||||
is unlikely to provide the intended behavior since the result of the
|
||||
reduction that is in the firstprivate variable would be discarded
|
||||
at the end of the \code{target} region.
|
||||
at the end of the \kcode{target} region.
|
||||
|
||||
In the following example, the use of the \code{reduction} clause on \code{sum1}
|
||||
or \code{sum2} should, by default, result in an implicit \code{tofrom} map for
|
||||
that variable. So long as neither \code{sum1} nor \code{sum2} were already
|
||||
In the following example, the use of the \kcode{reduction} clause on \ucode{sum1}
|
||||
or \ucode{sum2} should, by default, result in an implicit \kcode{tofrom} map for
|
||||
that variable. So long as neither \ucode{sum1} nor \ucode{sum2} were already
|
||||
present on the device, the mapping behavior ensures the value for
|
||||
\code{sum1} computed in the first \code{target} construct is used in the
|
||||
second \code{target} construct.
|
||||
\ucode{sum1} computed in the first \kcode{target} construct is used in the
|
||||
second \kcode{target} construct.
|
||||
|
||||
Note: a \kcode{declare target} directive is needed for procedures,
|
||||
\ucode{f} and \ucode{g}, called in a \kcode{target} region in Fortran codes.
|
||||
This directive is not required in C codes because functions, \ucode{f}
|
||||
and \ucode{g}, are defined in the same compilation unit as the \kcode{target}
|
||||
construct in which these functions are called.
|
||||
|
||||
\cexample[5.0]{target_reduction}{1}
|
||||
|
||||
\ffreeexample[5.0]{target_reduction}{1}
|
||||
%\clearpage
|
||||
|
||||
In next example, the variables \code{sum1} and \code{sum2} remain on the
|
||||
device for the duration of the \code{target}~\code{data} region so that it is
|
||||
In the next example, the variables \ucode{sum1} and \ucode{sum2} remain on the
|
||||
device for the duration of the \kcode{target data} region so that it is
|
||||
their device copies that are updated by the reductions. Note the significance
|
||||
of mapping \code{sum1} on the second \code{target} construct; otherwise, it
|
||||
of mapping \ucode{sum1} on the second \kcode{target} construct; otherwise, it
|
||||
would be treated by default as firstprivate and the result computed for
|
||||
\code{sum1} in the prior \code{target} region may not be used. Alternatively, a
|
||||
\code{target}~\code{update} construct could be used between the two
|
||||
\code{target} constructs to update the host version of \code{sum1} with the
|
||||
\ucode{sum1} in the prior \kcode{target} region may not be used. Alternatively, a
|
||||
\kcode{target update} construct could be used between the two
|
||||
\kcode{target} constructs to update the host version of \ucode{sum1} with the
|
||||
value that is in the corresponding device version after the completion of the
|
||||
first construct.
|
||||
|
||||
@ -174,60 +191,75 @@ first construct.
|
||||
|
||||
\subsection{Task Reduction with Target Constructs}
|
||||
\label{subsec:target_task_reduction}
|
||||
\index{in_reduction clause@\kcode{in_reduction} clause}
|
||||
\index{constructs!target@\kcode{target}}
|
||||
\index{target construct@\kcode{target} construct}
|
||||
|
||||
\index{clauses!enter@\kcode{enter}}
|
||||
\index{enter clause@\kcode{enter} clause}
|
||||
|
||||
The following examples illustrate how task reductions can apply to target tasks
|
||||
that result from a \code{target} construct with the \code{in\_reduction}
|
||||
clause. Here, the \code{in\_reduction} clause specifies that the target task
|
||||
that result from a \kcode{target} construct with the \kcode{in_reduction}
|
||||
clause. Here, the \kcode{in_reduction} clause specifies that the target task
|
||||
participates in the task reduction defined in the scope of the enclosing
|
||||
\code{taskgroup} construct. Partial results from all tasks participating in the
|
||||
\kcode{taskgroup} construct. Partial results from all tasks participating in the
|
||||
task reduction will be combined (in some order) into the original variable
|
||||
listed in the \code{task\_reduction} clause before exiting the \code{taskgroup}
|
||||
listed in the \kcode{task_reduction} clause before exiting the \kcode{taskgroup}
|
||||
region.
|
||||
|
||||
\cexample[5.1]{target_task_reduction}{1}
|
||||
\cexample[5.2]{target_task_reduction}{1}
|
||||
|
||||
\ffreeexample[5.1]{target_task_reduction}{1}[1]
|
||||
\ffreeexample[5.2]{target_task_reduction}{1}
|
||||
\clearpage
|
||||
|
||||
\index{reduction clause@\kcode{reduction} clause!task modifier@\kcode{task} modifier}
|
||||
\index{task modifier@\kcode{task} modifier}
|
||||
In the next pair of examples, the task reduction is defined by a
|
||||
\code{reduction} clause with the \code{task} modifier, rather than a
|
||||
\code{task\_reduction} clause on a \code{taskgroup} construct. Again, the
|
||||
\kcode{reduction} clause with the \kcode{task} modifier, rather than a
|
||||
\kcode{task_reduction} clause on a \kcode{taskgroup} construct. Again, the
|
||||
partial results from the participating tasks will be combined in some order
|
||||
into the original reduction variable, \code{sum}.
|
||||
into the original reduction variable, \ucode{sum}.
|
||||
|
||||
\cexample[5.0]{target_task_reduction}{2a}
|
||||
\cexample[5.2]{target_task_reduction}{2a}
|
||||
|
||||
\ffreeexample[5.0]{target_task_reduction}{2a}
|
||||
\ffreeexample[5.2]{target_task_reduction}{2a}
|
||||
|
||||
Next, the \code{task} modifier is again used to define a task reduction over
|
||||
\index{in_reduction clause@\kcode{in_reduction} clause!with target construct@with \kcode{target} construct}
|
||||
\index{constructs!target@\kcode{target}}
|
||||
\index{target construct@\kcode{target} construct}
|
||||
Next, the \kcode{task} modifier is again used to define a task reduction over
|
||||
participating tasks. This time, the participating tasks are a target task
|
||||
resulting from a \code{target} construct with the \code{in\_reduction} clause,
|
||||
resulting from a \kcode{target} construct with the \kcode{in_reduction} clause,
|
||||
and the implicit task (executing on the primary thread) that calls
|
||||
\code{host\_compute}. As before, the partial results from these paricipating
|
||||
\ucode{host_compute}. As before, the partial results from these participating
|
||||
tasks are combined in some order into the original reduction variable.
|
||||
|
||||
\cexample[5.1]{target_task_reduction}{2b}
|
||||
\cexample[5.2]{target_task_reduction}{2b}
|
||||
|
||||
\ffreeexample[5.1]{target_task_reduction}{2b}[1]
|
||||
\ffreeexample[5.2]{target_task_reduction}{2b}
|
||||
|
||||
|
||||
\subsection{Taskloop Reduction}
|
||||
\label{subsec:taskloop_reduction}
|
||||
\index{reduction clause@\kcode{reduction} clause!on taskloop construct@on \kcode{taskloop} construct}
|
||||
\index{constructs!taskloop@\kcode{taskloop}}
|
||||
\index{taskloop construct@\kcode{taskloop} construct}
|
||||
|
||||
In the OpenMP 5.0 Specification the \code{taskloop} construct
|
||||
In the OpenMP 5.0 Specification the \kcode{taskloop} construct
|
||||
was extended to include the reductions.
|
||||
|
||||
The following two examples show how to implement a reduction over an array
|
||||
using taskloop reduction in two different ways.
|
||||
In the first
|
||||
example we apply the \code{reduction} clause to the \code{taskloop} construct. As it was
|
||||
example we apply the \kcode{reduction} clause to the \kcode{taskloop} construct. As it was
|
||||
explained above in the task reduction examples, a reduction over tasks is
|
||||
divided into two components: the scope of the reduction, which is defined by a
|
||||
\code{taskgroup} region, and the tasks that participate in the reduction. In this
|
||||
example, the \code{reduction} clause defines both semantics. First, it specifies that
|
||||
the implicit \code{taskgroup} region associated with the \code{taskloop} construct is the scope of the
|
||||
reduction, and second, it defines all tasks created by the \code{taskloop} construct as
|
||||
\kcode{taskgroup} region, and the tasks that participate in the reduction. In this
|
||||
example, the \kcode{reduction} clause defines both semantics. First, it specifies that
|
||||
the implicit \kcode{taskgroup} region associated with the \kcode{taskloop} construct is the scope of the
|
||||
reduction, and second, it defines all tasks created by the \kcode{taskloop} construct as
|
||||
participants of the reduction. About the first property, it is important to note
|
||||
that if we add the \code{nogroup} clause to the \code{taskloop} construct the code will be
|
||||
that if we add the \kcode{nogroup} clause to the \kcode{taskloop} construct the code will be
|
||||
nonconforming, basically because we have a set of tasks that participate in a
|
||||
reduction that has not been defined.
|
||||
|
||||
@ -249,86 +281,89 @@ reduction that has not been defined.
|
||||
%create a new reduction and also that all tasks generated by the taskloop will
|
||||
%participate on it.
|
||||
|
||||
The second example computes exactly the same value as in the preceding\plc{taskloop\_reduction.1} code section,
|
||||
The second example computes exactly the same value as in the preceding \example{taskloop_reduction.1} code section,
|
||||
but in a very different way.
|
||||
First, in the \plc{array\_sum} function a \code{taskgroup} region is created
|
||||
that defines the scope of a new reduction using the \code{task\_reduction} clause.
|
||||
First, in the \ucode{array_sum} function a \kcode{taskgroup} region is created
|
||||
that defines the scope of a new reduction using the \kcode{task_reduction} clause.
|
||||
After that, a task and also the tasks generated by a taskloop participate in
|
||||
that reduction by using the \code{in\_reduction} clause on the \code{task}
|
||||
and \code{taskloop} constructs, respectively.
|
||||
Note that the \code{nogroup} clause was added to the \code{taskloop} construct.
|
||||
This is allowed because what is expressed with the \code{in\_reduction} clause
|
||||
is different from what is expressed with the \code{reduction} clause.
|
||||
that reduction by using the \kcode{in_reduction} clause on the \kcode{task}
|
||||
and \kcode{taskloop} constructs, respectively.
|
||||
Note that the \kcode{nogroup} clause was added to the \kcode{taskloop} construct.
|
||||
This is allowed because what is expressed with the \kcode{in_reduction} clause
|
||||
is different from what is expressed with the \kcode{reduction} clause.
|
||||
In one case the generated tasks are specified to participate in a previously
|
||||
declared reduction (\code{in\_reduction} clause) whereas in the other case
|
||||
creation of a new reduction is specified and also that all tasks generated
|
||||
declared reduction (\kcode{in_reduction} clause) whereas in the other case
|
||||
creation of a new reduction is specified and also all tasks generated
|
||||
by the taskloop will participate in it.
|
||||
|
||||
\cexample[5.0]{taskloop_reduction}{2}
|
||||
\ffreeexample[5.0]{taskloop_reduction}{2}
|
||||
%\clearpage
|
||||
|
||||
In the OpenMP 5.0 Specification, \code{reduction} clauses for the
|
||||
\code{taskloop}~\code{ simd} construct were also added.
|
||||
In the OpenMP 5.0 Specification, \kcode{reduction} clauses for the
|
||||
\kcode{taskloop simd} construct were also added.
|
||||
|
||||
The examples below compare reductions for the \code{taskloop} and the \code{taskloop}~\code{simd} constructs.
|
||||
These examples illustrate the use of \code{reduction} clauses within
|
||||
"stand-alone" \code{taskloop} constructs, and the use of \code{in\_reduction} clauses for tasks of taskloops to participate
|
||||
\index{reduction clause@\kcode{reduction} clause!on taskloop simd construct@on \kcode{taskloop simd} construct}
|
||||
\index{combined constructs!taskloop simd@\kcode{taskloop simd}}
|
||||
\index{taskloop simd construct@\kcode{taskloop simd} construct}
|
||||
The examples below compare reductions for the \kcode{taskloop} and the \kcode{taskloop simd} constructs.
|
||||
These examples illustrate the use of \kcode{reduction} clauses within
|
||||
``stand-alone'' \kcode{taskloop} constructs, and the use of \kcode{in_reduction} clauses for tasks of taskloops to participate
|
||||
with other reductions within the scope of a parallel region.
|
||||
|
||||
\textbf{taskloop reductions:}
|
||||
|
||||
In the \plc{taskloop reductions} section of the example below,
|
||||
\plc{taskloop 1} uses the \code{reduction} clause
|
||||
in a \code{taskloop} construct for a sum reduction, accumulated in \plc{asum}.
|
||||
The behavior is as though a \code{taskgroup} construct encloses the
|
||||
taskloop region with a \code{task\_reduction} clause, and each taskloop
|
||||
task has an \code{in\_reduction} clause with the specifications
|
||||
of the \code{reduction} clause.
|
||||
At the end of the taskloop region \plc{asum} contains the result of the reduction.
|
||||
\example{taskloop 1} uses the \kcode{reduction} clause
|
||||
in a \kcode{taskloop} construct for a sum reduction, accumulated in \ucode{asum}.
|
||||
The behavior is as though a \kcode{taskgroup} construct encloses the
|
||||
taskloop region with a \kcode{task_reduction} clause, and each taskloop
|
||||
task has an \kcode{in_reduction} clause with the specifications
|
||||
of the \kcode{reduction} clause.
|
||||
At the end of the taskloop region \ucode{asum} contains the result of the reduction.
|
||||
|
||||
The next taskloop, \plc{taskloop 2}, illustrates the use of the
|
||||
\code{in\_reduction} clause to participate in a previously defined
|
||||
reduction scope of a \code{parallel} construct.
|
||||
The next taskloop, \example{taskloop 2}, illustrates the use of the
|
||||
\kcode{in_reduction} clause to participate in a previously defined
|
||||
reduction scope of a \kcode{parallel} construct.
|
||||
|
||||
The task reductions of \plc{task 2} and \plc{taskloop 2} are combined
|
||||
across the \code{taskloop} construct and the single \code{task} construct, as specified
|
||||
in the \code{reduction(task,}~\code{+:asum)} clause of the \code{parallel} construct.
|
||||
At the end of the parallel region \plc{asum} contains the combined result of all reductions.
|
||||
The task reductions of \example{task 2} and \example{taskloop 2} are combined
|
||||
across the \kcode{taskloop} construct and the single \kcode{task} construct, as specified
|
||||
in the \kcode{reduction(task,+: \ucode{asum})} clause of the \kcode{parallel} construct.
|
||||
At the end of the parallel region \ucode{asum} contains the combined result of all reductions.
|
||||
|
||||
\textbf{taskloop simd reductions:}
|
||||
|
||||
Reductions for the \code{taskloop}~\code{simd} construct are shown in the second half of the code.
|
||||
Since each component construct, \code{taskloop} and \code{simd},
|
||||
can accept a reduction-type clause, the \code{taskloop}~\code{simd} construct
|
||||
Reductions for the \kcode{taskloop simd} construct are shown in the second half of the code.
|
||||
Since each component construct, \kcode{taskloop} and \kcode{simd},
|
||||
can accept a reduction clause, the \kcode{taskloop simd} construct
|
||||
is a composite construct, and the specific application of the reduction clause is defined
|
||||
within the \code{taskloop}~\code{simd} construct section of the OpenMP 5.0 Specification.
|
||||
within the \docref{\kcode{taskloop simd} Construct} section of the OpenMP 5.0 Specification.
|
||||
The code below illustrates use cases for these reductions.
|
||||
|
||||
In the \plc{taskloop simd reduction} section of the example below,
|
||||
\plc{taskloop simd 3} uses the \code{reduction} clause
|
||||
in a \code{taskloop}~\code{simd} construct for a sum reduction within a loop.
|
||||
For this case a \code{reduction} clause is used, as one would use
|
||||
for a \code{simd} construct.
|
||||
\example{taskloop simd 3} uses the \kcode{reduction} clause
|
||||
in a \kcode{taskloop simd} construct for a sum reduction within a loop.
|
||||
For this case a \kcode{reduction} clause is used, as one would use
|
||||
for a \kcode{simd} construct.
|
||||
The SIMD reductions of each task are combined, and the results of these tasks are further
|
||||
combined just as in the \code{taskloop} construct with the \code{reduction} clause for \plc{taskloop 1}.
|
||||
At the end of the taskloop region \plc{asum} contains the combined result of all reductions.
|
||||
combined just as in the \kcode{taskloop} construct with the \kcode{reduction} clause for \example{taskloop 1}.
|
||||
At the end of the taskloop region \ucode{asum} contains the combined result of all reductions.
|
||||
|
||||
If a \code{taskloop}~\code{simd} construct is to participate in a previously defined
|
||||
If a \kcode{taskloop simd} construct is to participate in a previously defined
|
||||
reduction scope, the reduction participation should be specified with
|
||||
a \code{in\_reduction} clause, as shown in the \code{parallel} region enclosing
|
||||
\plc{task 4} and \plc{taskloop simd 4} code sections.
|
||||
an \kcode{in_reduction} clause, as shown in the \kcode{parallel} region enclosing
|
||||
\example{task 4} and \example{taskloop simd 4} code sections.
|
||||
|
||||
Here the \code{taskloop}~\code{simd} construct's
|
||||
\code{in\_reduction} clause specifies participation of the construct's tasks as
|
||||
Here the \kcode{taskloop simd} construct's
|
||||
\kcode{in_reduction} clause specifies participation of the construct's tasks as
|
||||
a task reduction within the scope of the parallel region.
|
||||
That is, the results of each task of the \code{taskloop} construct component
|
||||
contribute to the reduction in a broader level, just as in \plc{parallel reduction a} code section above.
|
||||
Also, each \code{simd}-component construct
|
||||
occurs as if it has a \code{reduction} clause, and the
|
||||
That is, the results of each task of the \kcode{taskloop} construct component
|
||||
contribute to the reduction at a broader level, just as in the \example{parallel reduction a} code section above.
|
||||
Also, each \kcode{simd}-component construct
|
||||
occurs as if it has a \kcode{reduction} clause, and the
|
||||
SIMD results of each task are combined as though to form a single result for
|
||||
each task (that participates in the \code{in\_reduction} clause).
|
||||
At the end of the parallel region \plc{asum} contains the combined result of all reductions.
|
||||
each task (that participates in the \kcode{in_reduction} clause).
|
||||
At the end of the parallel region \ucode{asum} contains the combined result of all reductions.
|
||||
|
||||
%Just as in \plc{parallel reduction a} the
|
||||
%\code{taskloop simd} construct reduction results are combined
|
||||
@ -341,15 +376,18 @@ At the end of the parallel region \plc{asum} contains the combined result of all
|
||||
|
||||
\cexample[5.1]{taskloop_simd_reduction}{1}
|
||||
|
||||
\ffreeexample[5.1]{taskloop_simd_reduction}{1}[1]
|
||||
\ffreeexample[5.1]{taskloop_simd_reduction}{1}
|
||||
|
||||
|
||||
\subsection{Reduction with the \code{scope} Construct}
|
||||
\subsection{Reduction with the \kcode{scope} Construct}
|
||||
\label{subsec:reduction_scope}
|
||||
\index{reduction clause@\kcode{reduction} clause!on scope construct@on \kcode{scope} construct}
|
||||
\index{constructs!scope@\kcode{scope}}
|
||||
\index{scope construct@\kcode{scope} construct}
|
||||
|
||||
The following example illustrates the use of the \code{scope} construct
|
||||
to perform a reduction in a \code{parallel} region. The case is useful for
|
||||
producing a reduction and accessing reduction variables inside a \code{parallel} region
|
||||
The following example illustrates the use of the \kcode{scope} construct
|
||||
to perform a reduction in a \kcode{parallel} region. The case is useful for
|
||||
producing a reduction and accessing reduction variables inside a \kcode{parallel} region
|
||||
without using a worksharing-loop construct.
|
||||
|
||||
\cppexample[5.1]{scope_reduction}{1}
|
||||
@ -357,3 +395,34 @@ without using a worksharing-loop construct.
|
||||
|
||||
\ffreeexample[5.1]{scope_reduction}{1}
|
||||
|
||||
\subsection{Reduction on Private Variables in a \kcode{parallel} Region}
|
||||
\label{subsec:priv_reduction}
|
||||
\index{reduction clause@\kcode{reduction} clause!on private variables}
|
||||
\index{reduction clause@\kcode{reduction} clause!original(private) modifier@\kcode{original(private)} modifier}
|
||||
|
||||
The following example shows reduction on a private variable (\ucode{sum_v})
|
||||
for an orphaned worksharing loop in routine \ucode{do_red},
|
||||
which is called in a \kcode{parallel} region.
|
||||
At the end of the loop, the private variable of each thread should have the same combined value.
|
||||
\cexample[6.0]{priv_reduction}{1}
|
||||
\ffreeexample[6.0]{priv_reduction}{1}
|
||||
|
||||
The following example is slightly modified from the previous example
|
||||
where the \kcode{original(private)} modifier is explicitly specified
|
||||
for variable \ucode{sum_v} in the \kcode{reduction} clause.
|
||||
This modifier indicates that variable \ucode{sum_v} is private
|
||||
for reduction as opposed to shared by default for a variable
|
||||
passed as a procedure argument.
|
||||
\cppexample[6.0]{priv_reduction}{2}
|
||||
\ffreeexample[6.0]{priv_reduction}{2}
|
||||
|
||||
The following example shows the effect of \kcode{reduction} clauses on nested constructs.
|
||||
For the \kcode{parallel} construct, the reduction is on the shared variable
|
||||
\ucode{x}. For the worksharing loop nested inside the \kcode{parallel}
|
||||
region, the reduction is performed on the private copy of \ucode{x}
|
||||
for each thread.
|
||||
With 4 threads assigned for the \kcode{parallel} region
|
||||
(enforced by the \kcode{strict} modifier in the \kcode{num_threads} clause),
|
||||
the code should print 40 at the end.
|
||||
\cexample[6.0]{priv_reduction}{3}
|
||||
\ffreeexample[6.0]{priv_reduction}{3}
|
||||
|
@ -1,38 +1,65 @@
|
||||
\pagebreak
|
||||
\section{\code{scan} Directive}
|
||||
%\pagebreak
|
||||
\section{\kcode{scan} Directive}
|
||||
\label{sec:scan}
|
||||
\index{directives!scan@\kcode{scan}}
|
||||
\index{scan directive@\kcode{scan} directive}
|
||||
\index{reduction clause@\kcode{reduction} clause!inscan modifier@\kcode{inscan} modifier}
|
||||
\index{inscan modifier@\kcode{inscan} modifier}
|
||||
|
||||
The following examples illustrate how to parallelize a loop that saves
|
||||
the \emph{prefix sum} of a reduction. This is accomplished by using
|
||||
the \code{inscan} modifier in the \code{reduction} clause for the input
|
||||
variable of the scan, and specifying with a \code{scan} directive whether
|
||||
the \kcode{inscan} modifier in the \kcode{reduction} clause for the input
|
||||
variable of the scan, and specifying with a \kcode{scan} directive whether
|
||||
the storage statement includes or excludes the scan input of the present
|
||||
iteration (\texttt{k}).
|
||||
iteration (\ucode{k}).
|
||||
|
||||
Basically, the \code{inscan} modifier connects a loop and/or SIMD reduction to
|
||||
the scan operation, and a \code{scan} construct with an \code{inclusive} or
|
||||
\code{exclusive} clause specifies whether the ``scan phase'' (lexical block
|
||||
\index{scan directive@\kcode{scan} directive!inclusive clause@\kcode{inclusive} clause}
|
||||
\index{scan directive@\kcode{scan} directive!exclusive clause@\kcode{exclusive} clause}
|
||||
\index{clauses!inclusive@\kcode{inclusive}}
|
||||
\index{inclusive clause@\kcode{inclusive} clause}
|
||||
\index{clauses!exclusive@\kcode{exclusive}}
|
||||
\index{exclusive clause@\kcode{exclusive} clause}
|
||||
Basically, the \kcode{inscan} modifier connects a loop and/or SIMD reduction to
|
||||
the scan operation, and a \kcode{scan} construct with an \kcode{inclusive} or
|
||||
\kcode{exclusive} clause specifies whether the ``scan phase'' (lexical block
|
||||
before and after the directive, respectively) is to use an \plc{inclusive} or
|
||||
\plc{exclusive} scan value for the list item (\texttt{x}).
|
||||
\plc{exclusive} scan value for the list item (\ucode{x}).
|
||||
|
||||
The first example uses the \plc{inclusive} scan operation on a composite
|
||||
loop-SIMD construct. The \code{scan} directive separates the reduction
|
||||
statement on variable \texttt{x} from the use of \texttt{x} (saving to array \texttt{b}).
|
||||
loop-SIMD construct. The \kcode{scan} directive separates the reduction
|
||||
statement on variable \ucode{x} from the use of \ucode{x} (saving to array \ucode{b}).
|
||||
The order of the statements in this example indicates that
|
||||
value \texttt{a[k]} (\texttt{a(k)} in Fortran) is included in the computation of
|
||||
the prefix sum \texttt{b[k]} (\texttt{b(k)} in Fortran) for iteration \texttt{k}.
|
||||
value \ucode{a[k]} (\ucode{a(k)} in Fortran) is included in the computation of
|
||||
the prefix sum \ucode{b[k]} (\ucode{b(k)} in Fortran) for iteration \ucode{k}.
|
||||
|
||||
\cexample[5.0]{scan}{1}
|
||||
|
||||
\ffreeexample[5.0]{scan}{1}
|
||||
|
||||
The second example uses the \plc{exclusive} scan operation on a composite
|
||||
loop-SIMD construct. The \code{scan} directive separates the use of \texttt{x}
|
||||
(saving to array \texttt{b}) from the reduction statement on variable \texttt{x}.
|
||||
loop-SIMD construct. The \kcode{scan} directive separates the use of \ucode{x}
|
||||
(saving to array \ucode{b}) from the reduction statement on variable \ucode{x}.
|
||||
The order of the statements in this example indicates that
|
||||
value \texttt{a[k]} (\texttt{a(k)} in Fortran) is excluded from the computation
|
||||
of the prefix sum \texttt{b[k]} (\texttt{b(k)} in Fortran) for iteration \texttt{k}.
|
||||
value \ucode{a[k]} (\ucode{a(k)} in Fortran) is excluded from the computation
|
||||
of the prefix sum \ucode{b[k]} (\ucode{b(k)} in Fortran) for iteration \ucode{k}.
|
||||
|
||||
\cexample[5.0]{scan}{2}
|
||||
|
||||
\ffreeexample[5.0]{scan}{2}
|
||||
|
||||
In OpenMP 6.0, the \kcode{scan} directive was extended to support
|
||||
the concept of an \plc{initialization} phase where a private variable
|
||||
can be set for later use in the \plc{input} phase of
|
||||
an \plc{exclusive} scan operation.
|
||||
The following example is a rewrite of the previous exclusive scan
|
||||
example, which uses the \kcode{scan init_complete} directive to separate
|
||||
the initialization phase from the other phases of the scan operation.
|
||||
The private variable \ucode{tmp} is set in the initialization phase
|
||||
and used later in the input phase to update the prefix sum stored
|
||||
in variable \ucode{x}.
|
||||
This case allows the same array \ucode{c} to be used for
|
||||
both input and output of the scan results.
|
||||
|
||||
\cexample[6.0]{scan}{3}
|
||||
|
||||
\ffreeexample[6.0]{scan}{3}
|
||||
|
@ -1,8 +1,7 @@
|
||||
! @@name: associate.1f
|
||||
! @@name: associate.1
|
||||
! @@type: F-fixed
|
||||
! @@compilable: no
|
||||
! @@linkable: no
|
||||
! @@expect: failure
|
||||
! @@operation: compile
|
||||
! @@expect: ct-error
|
||||
! @@version: omp_4.0
|
||||
program example_broken
|
||||
real :: a, c
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: associate.2f
|
||||
! @@name: associate.2
|
||||
! @@type: F-fixed
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: link
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
program example
|
||||
|
@ -1,7 +1,6 @@
|
||||
! @@name: associate.3f
|
||||
! @@name: associate.3
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@operation: run
|
||||
! @@expect: success
|
||||
! @@version: omp_4.0
|
||||
program example
|
||||
@ -10,8 +9,8 @@ program example
|
||||
associate(u => v)
|
||||
!$omp parallel private(v)
|
||||
v = -1
|
||||
print *, v ! private v=-1
|
||||
print *, u ! original v=15
|
||||
print *, "v=", v ! private v=-1
|
||||
print *, "u=", u ! original v=15
|
||||
!$omp end parallel
|
||||
end associate
|
||||
end program
|
||||
|
57
data_environment/sources/associate.4.f90
Normal file
57
data_environment/sources/associate.4.f90
Normal file
@ -0,0 +1,57 @@
|
||||
! @@name: associate.4
|
||||
! @@type: F-free
|
||||
! @@operation: link
|
||||
! @@expect: success
|
||||
! @@version: omp_5.1
|
||||
program main
|
||||
integer :: scalr, aray(3)
|
||||
scalr = -1 ; aray = -1
|
||||
|
||||
associate(a_scalr=>scalr, a_aray=>aray)
|
||||
|
||||
!$omp target !! TARGET 1
|
||||
aray = [1,2,3]
|
||||
!$omp end target
|
||||
print *, a_aray, aray !! 1 2 3 1 2 3
|
||||
|
||||
!$omp target !! TARGET 2
|
||||
a_aray = [4,5,6]
|
||||
!$omp end target
|
||||
print *, a_aray, aray !! 4 5 6 4 5 6
|
||||
|
||||
!!!$omp target !! TARGET 3
|
||||
!! !! mapping, in this case implicit,
|
||||
!! !! of aray AND a_aray NOT ALLOWED
|
||||
!! aray = [4,5,6]
|
||||
!! a_aray = [1,2,3]
|
||||
!!!$omp end target
|
||||
|
||||
|
||||
!$omp target !! TARGET 4
|
||||
scalr = 1 !! scalr is firstprivate
|
||||
!$omp end target
|
||||
print *, a_scalr, scalr !! -1 -1
|
||||
|
||||
!$omp target !! TARGET 5
|
||||
a_scalr = 2 !! a_scalr implicitly mapped
|
||||
!$omp end target
|
||||
print *, a_scalr, scalr !! 2 2
|
||||
|
||||
!$omp target !! TARGET 6
|
||||
scalr = 3 !! scalr is firstprivate
|
||||
print *, a_scalr, scalr !! 2 3
|
||||
a_scalr = 4 !! a_scalr implicitly mapped
|
||||
print *, a_scalr, scalr !! 4 3
|
||||
!$omp end target
|
||||
print *, a_scalr, scalr !! 4 4
|
||||
|
||||
!!!$omp target map(a_scalr,scalr) !! TARGET 7
|
||||
!! mapping, in this case explicit,
|
||||
!! of scalr AND a_scalr NOT ALLOWED
|
||||
!! scalr = 5
|
||||
!! a_scalr = 5
|
||||
!!!$omp end target
|
||||
|
||||
end associate
|
||||
|
||||
end program
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: carrays_fpriv.1c
|
||||
* @@name: carrays_fpriv.1
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@operation: run
|
||||
* @@expect: success
|
||||
* @@version: pre_omp_3.0
|
||||
*/
|
||||
#include <assert.h>
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: copyin.1c
|
||||
* @@name: copyin.1
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: pre_omp_3.0
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
! @@name: copyin.1f
|
||||
! @@name: copyin.1
|
||||
! @@type: F-fixed
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: pre_omp_3.0
|
||||
MODULE M
|
||||
REAL, POINTER, SAVE :: WORK(:)
|
||||
INTEGER :: SIZE
|
||||
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: copyprivate.1c
|
||||
* @@name: copyprivate.1
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: pre_omp_3.0
|
||||
*/
|
||||
#include <stdio.h>
|
||||
float x, y;
|
||||
|
@ -1,8 +1,8 @@
|
||||
! @@name: copyprivate.1f
|
||||
! @@name: copyprivate.1
|
||||
! @@type: F-fixed
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: pre_omp_3.0
|
||||
SUBROUTINE INIT(A,B)
|
||||
REAL A, B
|
||||
COMMON /XY/ X,Y
|
||||
|
@ -1,15 +1,10 @@
|
||||
/*
|
||||
* @@name: copyprivate.2c
|
||||
* @@name: copyprivate.2
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_5.1
|
||||
* @@version: omp_5.1
|
||||
*/
|
||||
#if _OPENMP < 202011
|
||||
#define masked master
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
@ -1,14 +1,8 @@
|
||||
! @@name: copyprivate.2f
|
||||
! @@name: copyprivate.2
|
||||
! @@type: F-fixed
|
||||
! @@compilable: yes
|
||||
! @@requires: preprocessing
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: omp_5.1
|
||||
#if _OPENMP < 202011
|
||||
#define MASKED MASTER
|
||||
#endif
|
||||
|
||||
! @@version: omp_5.1
|
||||
REAL FUNCTION READ_NEXT()
|
||||
REAL, POINTER :: TMP
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: copyprivate.3c
|
||||
* @@name: copyprivate.3
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: pre_omp_3.0
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -1,8 +1,8 @@
|
||||
! @@name: copyprivate.3f
|
||||
! @@name: copyprivate.3
|
||||
! @@type: F-fixed
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: pre_omp_3.0
|
||||
FUNCTION NEW_LOCK()
|
||||
USE OMP_LIB ! or INCLUDE "omp_lib.h"
|
||||
INTEGER(OMP_LOCK_KIND), POINTER :: NEW_LOCK
|
||||
|
@ -1,8 +1,8 @@
|
||||
! @@name: copyprivate.4f
|
||||
! @@name: copyprivate.4
|
||||
! @@type: F-fixed
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@operation: compile
|
||||
! @@expect: success
|
||||
! @@version: pre_omp_3.0
|
||||
SUBROUTINE S(N)
|
||||
INTEGER N
|
||||
|
||||
|
@ -1,12 +1,10 @@
|
||||
/*
|
||||
* @@name: cpp_reference.1c
|
||||
* @@name: cpp_reference.1
|
||||
* @@type: C++
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@operation: compile
|
||||
* @@expect: success
|
||||
* @@version: omp_4.5
|
||||
*/
|
||||
|
||||
void task_body (int &);
|
||||
void gen_task (int &x) { // on orphaned task construct reference argument
|
||||
#pragma omp task // x is implicitly determined firstprivate(x)
|
||||
@ -23,4 +21,3 @@ void test (int &y, int &z) {
|
||||
gen_task (y);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* @@name: default_none.1c
|
||||
* @@name: default_none.1
|
||||
* @@type: C
|
||||
* @@compilable: no
|
||||
* @@linkable: no
|
||||
* @@expect: failure
|
||||
* @@operation: compile
|
||||
* @@expect: ct-error
|
||||
* @@version: pre_omp_3.0
|
||||
*/
|
||||
#include <omp.h>
|
||||
int x, y, z[1000];
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user