Compare commits

...

18 Commits
v4.0.2 ... main

Author SHA1 Message Date
Henry Jin
415024c369 Makefile update 2025-01-05 14:22:19 -08:00
Henry Jin
00bdf88b63 fix transparent task example 2024-11-14 08:08:37 -08:00
Henry Jin
3346a30ce2 v6.0 release 2024-11-13 11:07:08 -08:00
Henry Jin
11f2efcccf v5.2.2 release 2024-04-16 08:59:23 -07:00
Henry Jin
075683d574 more uses_allocators update 2022-11-09 13:11:02 -08:00
Henry Jin
08859e6029 declare target with device_type(nohost) 2022-11-07 10:59:22 -08:00
Henry Jin
03b9a00df9 release 5.2.1 2022-11-04 09:35:42 -07:00
Henry Jin
a5e3d8b3f2 v5.2 release 2022-04-18 15:02:25 -07:00
Henry Jin
fb0edc81e7 merge with examples-internal/v5.1 2021-08-17 09:11:55 -07:00
Henry Jin
60e8ece384 cleanup files 2020-06-26 08:01:16 -07:00
Henry Jin
3052c10566 v5.0.1 release 2020-06-26 07:54:45 -07:00
Henry Jin
eaec9ede64 synced with v5.0.0 of the examples-internal repo 2019-11-08 13:01:11 -08:00
Henry Jin
156a12ca09 synced with the 4.5.0 implementation of the examples-internal repo 2016-11-10 16:11:12 -08:00
Henry Jin
c65fe47427 Merge pull request #15 from jakubjelinek/master
Fix computation of fibonacci numbers.
2015-07-23 09:19:15 -07:00
Jakub Jelinek
c64c683964 Fix computation of fibonacci numbers. 2015-07-22 19:29:54 +02:00
Henry Jin
3f83389d3a Added the copyright note 2015-07-10 09:32:43 -07:00
Henry Jin
510fd1f0f0 Merge pull request #14 from jakubjelinek/master
Fix a pasto - loop simd construct in Fortran is do simd rather than
2015-07-10 08:03:44 -07:00
Jakub Jelinek
4b655aa21a Fix a pasto - loop simd construct in Fortran is do simd rather than
for simd.
2015-07-10 14:06:40 +02:00
860 changed files with 38913 additions and 4640 deletions

View File

@ -1,3 +1,14 @@
[02-Feb-2018] Note
This "Changes.log" is no longer updated. Please use History.tex and
the git log messages for changes.
[20-May-2016] Version 4.5.0
Changes from 4.0.2ltx
1. Reorganization into topic chapters
2. Change file suffixes (f/f90 => Fixed/Free format) C++ => cpp
[2-Feb-2015] Version 4.0.2
Changes from 4.0.1ltx

52
Chap_SIMD.tex Normal file
View File

@ -0,0 +1,52 @@
\cchapter{SIMD}{SIMD}
\label{chap:simd}
Single instruction, multiple data (SIMD) is a form of parallel execution
in which the same operation is performed on multiple data elements
independently in hardware vector processing units (VPU), also called SIMD units.
The addition of two vectors to form a third vector is a SIMD operation.
Many processors have SIMD (vector) units that can perform simultaneously
2, 4, 8 or more executions of the same operation (by a single SIMD unit).
Loops without loop-carried backward dependences (or with dependences preserved using
\kcode{ordered simd}) are candidates for vectorization by the compiler for
execution with SIMD units. In addition, with state-of-the-art vectorization
technology and \kcode{declare simd} directive extensions for function vectorization
in the OpenMP 4.5 specification, loops with function calls can be vectorized as well.
The basic idea is that a scalar function call in a loop can be replaced by a vector version
of the function, and the loop can be vectorized simultaneously by combining a loop
vectorization (\kcode{simd} directive on the loop) and a function
vectorization (\kcode{declare simd} directive on the function).
A \kcode{simd} construct states that SIMD operations be performed on the
data within the loop. A number of clauses are available to provide
data-sharing attributes (\kcode{private}, \kcode{linear}, \kcode{reduction} and
\kcode{lastprivate}). Other clauses provide vector length preference/restrictions
(\kcode{simdlen} / \kcode{safelen}), loop collapsing (\kcode{collapse}), and data
alignment (\kcode{aligned}).
The \kcode{declare simd} directive designates
that a vector version of the function should also be constructed for
execution within loops that contain the function and have a \kcode{simd}
directive. Clauses provide argument specifications (\kcode{linear},
\kcode{uniform}, and \kcode{aligned}), a requested vector length
(\kcode{simdlen}), and designate whether the function is always/never
called conditionally in a loop (\kcode{inbranch}/\kcode{notinbranch}).
The latter is for optimizing performance.
Also, the \kcode{simd} construct has been combined with the worksharing loop
constructs (\kcode{for simd} and \kcode{do simd}) to enable simultaneous thread
execution in different SIMD units.
%Hence, the \code{simd} construct can be
%used alone on a loop to direct vectorization (SIMD execution), or in
%combination with a parallel loop construct to include thread parallelism
%(a parallel loop sequentially followed by a \code{simd} construct,
%or a combined construct such as \code{parallel do simd} or
%\code{parallel for simd}).
%===== Examples Sections =====
\input{SIMD/SIMD}
\input{SIMD/linear_modifier}

125
Chap_affinity.tex Normal file
View File

@ -0,0 +1,125 @@
\cchapter{OpenMP Affinity}{affinity}
\label{chap:openmp_affinity}
OpenMP Affinity consists of a \kcode{proc_bind} policy (thread affinity policy) and a specification of
places (``location units'' or \plc{processors} that may be cores, hardware
threads, sockets, etc.).
OpenMP Affinity enables users to bind computations to specific places.
The placement will hold for the duration of the parallel region.
However, the runtime is free to migrate the OpenMP threads
to different cores (hardware threads, sockets, etc.) prescribed within a given place,
if two or more cores (hardware threads, sockets, etc.) have been assigned to a given place.
Often the binding can be managed without resorting to explicitly setting places.
Without the specification of places in the \kcode{OMP_PLACES} variable,
the OpenMP runtime will distribute and bind threads using the entire range of processors for
the OpenMP program, according to the \kcode{OMP_PROC_BIND} environment variable
or the \kcode{proc_bind} clause. When places are specified, the OpenMP runtime
binds threads to the places according to a default distribution policy, or
the policy specified in the \kcode{OMP_PROC_BIND} environment variable or the
\kcode{proc_bind} clause.
In the OpenMP Specifications document a processor refers to an execution unit that
is enabled for an OpenMP thread to use. A processor is a core when there is
no SMT (Simultaneous Multi-Threading) support or SMT is disabled. When
SMT is enabled, a processor is a hardware thread (HW-thread). (This is the
usual case; but actually, the execution unit is implementation defined.) Processor
numbers are numbered sequentially from 0 to the number of cores less one (without SMT), or
0 to the number of HW-threads less one (with SMT). OpenMP places use the processor number to designate
binding locations (unless an ``abstract name'' is used).
The processors available to a process may be a subset of the system's
processors. This restriction may be the result of a
wrapper process controlling the execution (such as \plc{numactl} on Linux systems),
compiler options, library-specific environment variables, or default
kernel settings. For instance, the execution of multiple MPI processes,
launched on a single compute node, will each have a subset of processors as
determined by the MPI launcher or set by MPI affinity environment
variables for the MPI library. %Forked threads within an MPI process
%(for a hybrid execution of MPI and OpenMP code) inherit the valid
%processor set for execution from the parent process (the initial task region)
%when a parallel region forks threads. The binding policy set in
%\code{OMP\_PROC\_BIND} or the \code{proc\_bind} clause will be applied to
%the subset of processors available to \plc{the particular} MPI process.
%Also, setting an explicit list of processor numbers in the \code{OMP\_PLACES}
%variable before an MPI launch (which involves more than one MPI process) will
%result in unspecified behavior (and doesn't make sense) because the set of
%processors in the places list must not contain processors outside the subset
%of processors for an MPI process. A separate \code{OMP\_PLACES} variable must
%be set for each MPI process, and is usually accomplished by launching a script
%which sets \code{OMP\_PLACES} specifically for the MPI process.
Threads of a team are positioned onto places in a compact manner, a
scattered distribution, or onto the primary thread's place, by setting the
\kcode{OMP_PROC_BIND} environment variable or the \kcode{proc_bind} clause to
\kcode{close}, \kcode{spread}, or \kcode{primary} (\kcode{master} has been deprecated), respectively. When
\kcode{OMP_PROC_BIND} is set to FALSE no binding is enforced; and
when the value is TRUE, the binding is implementation defined to
a set of places in the \kcode{OMP_PLACES} variable or to places
defined by the implementation if the \kcode{OMP_PLACES} variable
is not set.
The \kcode{OMP_PLACES} variable can also be set to an abstract name
(\kcode{threads}, \kcode{cores}, \kcode{sockets}) to specify that a place is
either a single hardware thread, a core, or a socket, respectively.
This description of the \kcode{OMP_PLACES} is most useful when the
number of threads is equal to the number of hardware threads, cores
or sockets. It can also be used with a \kcode{close} or \kcode{spread}
distribution policy when the equality doesn't hold.
% We need an example of using sockets, cores and threads:
% case 1 threads:
% Hyper-Threads on (2 hardware threads per core)
% 1 socket x 4 cores x 2 HW-threads
%
% export OMP_NUM_THREADS=4
% export OMP_PLACES=threads
%
% core # 0 1 2 3
% processor # 0,1 2,3 4,5 6,7
% thread # 0 * _ _ _ _ _ _ _ #mask for thread 0
% thread # 1 _ _ * _ _ _ _ _ #mask for thread 1
% thread # 2 _ _ _ _ * _ _ _ #mask for thread 2
% thread # 3 _ _ _ _ _ _ * _ #mask for thread 3
% case 2 cores:
%
% Hyper-Threads on (2 hardware threads per core)
% 1 socket x 4 cores x 2 HW-threads
%
% export OMP_NUM_THREADS=4
% export OMP_PLACES=cores
%
% core # 0 1 2 3
% processor # 0,1 2,3 4,5 6,7
% thread # 0 * * _ _ _ _ _ _ #mask for thread 0
% thread # 1 _ _ * * _ _ _ _ #mask for thread 1
% thread # 2 _ _ _ _ * * _ _ #mask for thread 2
% thread # 3 _ _ _ _ _ _ * * #mask for thread 3
% case 3 sockets:
%
% No Hyper-Threads
% 3 sockets x 4 cores
%
% export OMP_NUM_THREADS=3
% export OMP_PLACES=sockets
%
% socket # 0 1 2
% processor # 0,1,2,3 4,5,6,7 8,9,10,11
% thread # 0 * * * * _ _ _ _ _ _ _ _ #mask for thread 0
% thread # 1 _ _ _ _ * * * * _ _ _ _ #mask for thread 1
% thread # 2 _ _ _ _ _ _ _ _ * * * * #mask for thread 2
%===== Examples Sections =====
\input{affinity/affinity}
\input{affinity/task_affinity}
\input{affinity/affinity_display}
\input{affinity/affinity_query}

95
Chap_data_environment.tex Normal file
View File

@ -0,0 +1,95 @@
\cchapter{Data Environment}{data_environment}
\label{chap:data_environment}
The OpenMP \plc{data environment} contains data attributes of variables and
objects. Many constructs (such as \kcode{parallel}, \kcode{simd}, \kcode{task})
accept clauses to control \plc{data-sharing} attributes
of referenced variables in the construct, where \plc{data-sharing} applies to
whether the attribute of the variable is \plc{shared},
is \plc{private} storage, or has special operational characteristics
(as found in the \kcode{firstprivate}, \kcode{lastprivate}, \kcode{linear}, or \kcode{reduction} clause).
The data environment for a device (distinguished as a \plc{device data environment})
is controlled on the host by \plc{data-mapping} attributes, which determine the
relationship of the data on the host, the \plc{original} data, and the data on the
device, the \plc{corresponding} data.
\bigskip
DATA-SHARING ATTRIBUTES
Data-sharing attributes of variables can be classified as being \plc{predetermined},
\plc{explicitly determined} or \plc{implicitly determined}.
Certain variables and objects have predetermined attributes.
A commonly found case is the loop iteration variable in associated loops
of a \kcode{for} or \kcode{do} construct. It has a private data-sharing attribute.
Variables with predetermined data-sharing attributes cannot be listed in a data-sharing clause; but there are some
exceptions (mainly concerning loop iteration variables).
Variables with explicitly determined data-sharing attributes are those that are
referenced in a given construct and are listed in a data-sharing attribute
clause on the construct. Some of the common data-sharing clauses are:
\kcode{shared}, \kcode{private}, \kcode{firstprivate}, \kcode{lastprivate},
\kcode{linear}, and \kcode{reduction}. % Are these all of them?
Variables with implicitly determined data-sharing attributes are those
that are referenced in a given construct, do not have predetermined
data-sharing attributes, and are not listed in a data-sharing
attribute clause of an enclosing construct.
For a complete list of variables and objects with predetermined and
implicitly determined attributes, please refer to the
\docref{Data-sharing Attribute Rules for Variables Referenced in a Construct}
subsection of the OpenMP Specifications document.
\bigskip
DATA-MAPPING ATTRIBUTES
The \kcode{map} clause on a device construct explicitly specifies how the list items in
the clause are mapped from the encountering task's data environment (on the host)
to the corresponding item in the device data environment (on the device).
The common \plc{list items} are arrays, array sections, scalars, pointers, and
structure elements (members).
Procedures and global variables have predetermined data mapping if they appear
within the list or block of a \kcode{declare target} directive. Also, a C/C++ pointer
is mapped as a zero-length array section, as is a C++ variable that is a reference to a pointer.
% Waiting for response from Eric on this.
Without explicit mapping, non-scalar and non-pointer variables within the scope of the \kcode{target}
construct are implicitly mapped with a \plc{map-type} of \kcode{tofrom}.
Without explicit mapping, scalar variables within the scope of the \kcode{target}
construct are not mapped, but have an implicit firstprivate data-sharing
attribute. (That is, the value of the original variable is given to a private
variable of the same name on the device.) This behavior can be changed with
the \kcode{defaultmap} clause.
The \kcode{map} clause can appear on \kcode{target}, \kcode{target data} and
\kcode{target enter/exit data} constructs. The operations of creation and
removal of device storage as well as assignment of the original list item
values to the corresponding list items may be complicated when the list
item appears on multiple constructs or when the host and device storage
is shared. In these cases the item's reference count, the number of times
it has been referenced (incremented by 1 on entry and decremented by 1 on exit) in nested (structured)
map regions and/or accumulative (unstructured) mappings, determines the operation.
Details of the \kcode{map} clause and reference count operation are specified
in the \docref{\kcode{map} Clause} subsection of the OpenMP Specifications document.
%===== Examples Sections =====
\input{data_environment/threadprivate}
\input{data_environment/default_none}
\input{data_environment/private}
\input{data_environment/fort_loopvar}
\input{data_environment/fort_sp_common}
\input{data_environment/fort_sa_private}
\input{data_environment/fort_shared_var}
\input{data_environment/carrays_fpriv}
\input{data_environment/lastprivate}
\input{data_environment/reduction}
\input{data_environment/udr}
\input{data_environment/induction}
\input{data_environment/scan}
\input{data_environment/copyin}
\input{data_environment/copyprivate}
\input{data_environment/cpp_reference}
\input{data_environment/associate}

79
Chap_devices.tex Normal file
View File

@ -0,0 +1,79 @@
\cchapter{Devices}{devices}
\label{chap:devices}
The \kcode{target} construct consists of a \kcode{target} directive
and an execution region. The \kcode{target} region is executed on
the default device or the device specified in the \kcode{device}
clause.
In OpenMP version 4.0, by default, all variables within the lexical
scope of the construct are copied \plc{to} and \plc{from} the
device, unless the device is the host, or the data exists on the
device from a previously executed data-type construct that
has created space on the device and possibly copied host
data to the device storage.
The constructs that explicitly
create storage, transfer data, and free storage on the device
are categorized as structured and unstructured. The
\kcode{target data} construct is structured. It creates
a data region around \kcode{target} constructs, and is
convenient for providing persistent data throughout multiple
\kcode{target} regions. The \kcode{target enter data} and
\kcode{target exit data} constructs are unstructured, because
they can occur anywhere and do not support a ``structure''
(a region) for enclosing \kcode{target} constructs, as does the
\kcode{target data} construct.
The \kcode{map} clause is used on \kcode{target}
constructs and the data-type constructs to map host data. It
specifies the device storage and data movement \plc{to} and \plc{from}
the device, and controls the storage duration.
There is an important change in the OpenMP 4.5 specification
that alters the data model for scalar variables and C/C++ pointer variables.
The default behavior for scalar variables and C/C++ pointer variables
in a 4.5 compliant code is \kcode{firstprivate}. Example
codes that have been updated to reflect this new behavior are
annotated with a description that describes changes required
for correct execution. Often it is a simple matter of mapping
the variable as \kcode{tofrom} to obtain the intended 4.0 behavior.
In OpenMP version 4.5 the mechanism for target
execution is specified as occurring through a \plc{target task}.
When the \kcode{target} construct is encountered a new
target task is generated. The target task
completes after the \kcode{target} region has executed and all data
transfers have finished.
This new specification does not affect the execution of
pre-4.5 code; it is a necessary element for asynchronous
execution of the \kcode{target} region when using the new \kcode{nowait}
clause introduced in OpenMP 4.5.
%===== Examples Sections =====
\input{devices/target}
\input{devices/target_defaultmap}
\input{devices/target_pointer_mapping}
\input{devices/target_structure_mapping}
\input{devices/target_fort_allocatable_array_mapping}
\input{devices/array_sections}
\input{devices/usm}
\input{devices/C++_virtual_functions}
\input{devices/array_shaping}
\input{devices/target_mapper}
\input{devices/target_data}
\input{devices/target_unstructured_data}
\input{devices/target_update}
\input{devices/declare_target}
\input{devices/lambda_expressions}
\input{devices/teams}
\input{devices/async_target_depend}
\input{devices/async_target_with_tasks}
\input{devices/async_target_nowait}
\input{devices/async_target_nowait_depend}
\input{devices/async_target_nowait_arg}
\input{devices/device}
\input{devices/device_env_traits}

71
Chap_directives.tex Normal file
View File

@ -0,0 +1,71 @@
\cchapter{OpenMP Directive Syntax}{directives}
\label{chap:directive_syntax}
\index{directive syntax}
OpenMP \plc{directives} use base-language mechanisms to specify OpenMP program behavior.
In C/C++ code, the directives are formed with
either pragmas or attributes.
Fortran directives are formed with comments in free form and fixed form sources (codes).
All of these mechanisms allow the compilation to ignore the OpenMP directives if
OpenMP is not supported or enabled.
The OpenMP directive is a combination of the base-language mechanism and a \plc{directive-specification},
as shown below. The \plc{directive-specification} consists
of the \plc{directive-name}, which may in rare cases have arguments,
followed by optional \plc{clauses}. Full details of the syntax can be found in the OpenMP Specification.
Illustrations of the syntax are given in the examples.
The formats for combining a base-language mechanism and a \plc{directive-specification} are:
C/C++ pragmas
\begin{indentedcodelist}
#pragma omp \plc{directive-specification}
\end{indentedcodelist}
C/C++ attribute specifiers
\begin{indentedcodelist}
[[omp :: directive( \plc{directive-specification} )]]
[[omp :: decl( \plc{directive-specification} )]]
\end{indentedcodelist}
C++ attribute specifiers
\begin{indentedcodelist}
[[using omp : directive( \plc{directive-specification} )]]
[[using omp : decl( \plc{directive-specification} )]]
\end{indentedcodelist}
where the \kcode{decl} attribute may alternatively be used for
declarative directives.
Fortran comments
\begin{indentedcodelist}
!$omp \plc{directive-specification}
\end{indentedcodelist}
where \scode{c$omp} and \scode{*$omp} may be used in Fortran fixed form sources.
Most OpenMP directives accept clauses that alter the semantics of the directive in some way,
and some directives also accept parenthesized arguments that follow the directive name.
A clause may just be a keyword (e.g., \kcode{untied}) or it may also accept argument lists
(e.g., \kcode{shared(\ucode{x,y,z})}) and/or optional modifiers (e.g., \kcode{tofrom} in
\kcode{map(tofrom: \ucode{x,y,z})}).
Clause modifiers may be ``simple'' or ``complex'' -- a complex modifier consists of a
keyword followed by one or more parameters, bracketed by parentheses, while a simple
modifier does not. An example of a complex modifier is the \kcode{iterator} modifier,
as in \kcode{map(iterator(\ucode{i=0:n}), tofrom: \ucode{p[i]})}, or the \kcode{step} modifier, as in
\kcode{linear(\ucode{x}: ref, step(\ucode{4}))}.
In the preceding examples, \kcode{tofrom} and \kcode{ref} are simple modifiers.
For Fortran, a declarative directive (such as \kcode{declare reduction})
must appear after any \bcode{USE}, \bcode{IMPORT}, and \bcode{IMPLICIT} statements
in the specification part.
%===== Examples Sections =====
\input{directives/pragmas}
\input{directives/attributes}
\input{directives/fixed_format_comments}
\input{directives/free_format_comments}

View File

@ -1,5 +1,5 @@
% This is the introduction for the OpenMP Examples document.
% This is an included file. See the master file (openmp-examples.tex) for more information.
% This is an included file. See the main file (openmp-examples.tex) for more information.
%
% When editing this file:
%
@ -32,45 +32,42 @@
% This is a \plc{var-name}.
%
\chapter*{Introduction}
\cchapter{Introduction}{introduction}
\label{chap:introduction}
This collection of programming examples supplements the OpenMP API for Shared
Memory Parallelization specifications, and is not part of the formal specifications. It
assumes familiarity with the OpenMP specifications, and shares the typographical
conventions used in that document.
\notestart
\noteheader This first release of the OpenMP Examples reflects the OpenMP Version 4.0
specifications. Additional examples are being developed and will be published in future
releases of this document.
\noteend
The OpenMP API specification provides a model for parallel programming that is
portable across shared memory architectures from different vendors. Compilers from
numerous vendors support the OpenMP API.
The directives, library routines, and environment variables demonstrated in this
document allow users to create and manage parallel programs while permitting
portability. The directives extend the C, C++ and Fortran base languages with single
program multiple data (SPMD) constructs, tasking constructs, device constructs,
worksharing constructs, and synchronization constructs, and they provide support for
portability. The directives extend the C, C++ and Fortran base languages with \plc{single
program multiple data} (SPMD) constructs, \plc{tasking} constructs, \plc{device} constructs,
\plc{worksharing} constructs, and \plc{synchronization} constructs, and they provide support for
sharing and privatizing data. The functionality to control the runtime environment is
provided by library routines and environment variables. Compilers that support the
OpenMP API often include a command line option to the compiler that activates and
allows interpretation of all OpenMP directives.
The latest source codes for OpenMP Examples can be downloaded from the \code{sources}
directory at
\href{https://github.com/OpenMP/Examples}{https://github.com/OpenMP/Examples}.
The codes for this OpenMP \VER{} Examples document have the tag \plc{v\VER}.
%\href{https://github.com/OpenMP/Examples/tree/master/sources}{https://github.com/OpenMP/Examples/sources}.
The documents and source codes for OpenMP Examples can be downloaded from
\href{\examplesrepo}{\examplesrepo}.
Each directory holds the contents of a chapter and has a \plc{sources} subdirectory of its codes.
This OpenMP Examples \VER{} document and its codes are tagged as
\examplestree{\VER}{\plc{v\VER}}.
Complete information about the OpenMP API and a list of the compilers that support
the OpenMP API can be found at the OpenMP.org web site
\code{http://www.openmp.org}
\scode{https://www.openmp.org}
\clearpage
\input{introduction/Examples}
% This is the end of introduction.tex of the OpenMP Examples document.

View File

@ -0,0 +1,27 @@
\cchapter{Loop Transformations}{loop_transformations}
\label{chap:loop_transformations}
To obtain better performance on a platform, code may need to be restructured
relative to the way it is written (which is often for best readability).
User-directed loop transformations accomplish this goal by providing a means
to separate code semantics and its optimization.
A loop transformation construct states that a transformation operation is to be
performed on a set of nested loops. This directive approach can target specific loops
for transformation, rather than applying more time-consuming general compiler
heuristic methods with compiler options that may not be able to discover
optimal transformations.
Loop transformations can be augmented by preprocessor support or OpenMP \kcode{metadirective}
directives, to select optimal dimension and size parameters for specific platforms,
facilitating a single code base for multiple platforms.
Moreover, directive-based transformations make experimenting easier,
because specific hot spots can be targeted by transformation directives.
%===== Examples Sections =====
\input{loop_transformations/tile}
\input{loop_transformations/partial_tile}
\input{loop_transformations/unroll}
\input{loop_transformations/apply}

137
Chap_memory_model.tex Normal file
View File

@ -0,0 +1,137 @@
\cchapter{Memory Model}{memory_model}
\label{chap:memory_model}
OpenMP provides a shared-memory model that allows all threads on a given
device shared access to \emph{memory}. For a given OpenMP region that may be
executed by more than one thread or SIMD lane, variables in memory may be
\plc{shared} or \plc{private} with respect to those threads or SIMD lanes. A
variable's data-sharing attribute indicates whether it is shared (the
\plc{shared} attribute) or private (the \plc{private}, \plc{firstprivate},
\plc{lastprivate}, \plc{linear}, and \plc{reduction} attributes) in the data
environment of an OpenMP region. While private variables in an OpenMP region
are new copies of the original variable (with the same name) that may then be
concurrently accessed or modified by their respective threads or SIMD lanes, a
shared variable in an OpenMP region is the same as the variable of the same
name in the enclosing region. Concurrent accesses or modifications to a
shared variable may therefore require synchronization to avoid data races.
OpenMP's memory model also includes a \emph{temporary view} of memory that is
associated with each thread. Two different threads may see different values for
a given variable in their respective temporary views. Threads may employ flush
operations for the purposes of making their temporary view of a variable
consistent with the value of the variable in memory. The effect of a given
flush operation is characterized by its flush properties -- some combination of
\plc{strong}, \plc{release}, and \plc{acquire} -- and, for \plc{strong}
flushes, a \plc{flush-set}.
A \plc{strong} flush will force consistency between the temporary view and the
memory for all variables in its \plc{flush-set}. Furthermore, all strong flushes in a
program that have intersecting flush-sets will execute in some total order, and
within a thread strong flushes may not be reordered with respect to other
memory operations on variables in its flush-set. \plc{Release} and
\plc{acquire} flushes operate in pairs. A release flush may ``synchronize''
with an acquire flush, and when it does so the local memory operations that
precede the release flush will appear to have been completed before the local
memory operations on the same variables that follow the acquire flush.
Flush operations arise from explicit \kcode{flush} directives, implicit
\kcode{flush} directives, and also from the execution of \kcode{atomic}
constructs. The \kcode{flush} directive forces a consistent view of local
variables of the thread executing the \kcode{flush}. When a list is supplied on
the directive, only the items (variables) in the list are guaranteed to be
flushed. Implied flushes exist at prescribed locations of certain constructs.
For the complete list of these locations and associated constructs, please
refer to the \docref{\kcode{flush} Construct} section of the OpenMP Specifications
document.
In this chapter, examples illustrate how race conditions may arise for accesses
to variables with a \plc{shared} data-sharing attribute when flush operations
are not properly employed. A race condition can exist when two or more threads
are involved in accessing a variable and at least one of the accesses modifies
the variable. In particular, a data race will arise when conflicting accesses
do not have a well-defined \emph{completion order}. The existence of data
races in OpenMP programs results in undefined behavior, and so they should
generally be avoided for programs to be correct. The completion order of
accesses to a shared variable is guaranteed in OpenMP through a set of memory
consistency rules that are described in the \docref{OpenMP Memory Consistency}
section of the OpenMP Specifications document.
%This chapter also includes examples that exhibit non-sequentially consistent
%(\emph{non-SC}) behavior. Sequential consistency (\emph{SC}) is the desirable
%property that the results of a multi-threaded program are as if all operations
%are performed in some total order, consistent with the program order of
%operations performed by each thread. OpenMP guarantees that a correct program
%(i.e. a program that does not have a data race) will exhibit SC behavior
%so long as the only \code{atomic} constructs it uses are SC atomic directives.
% The following table lists constructs in which implied flushes exist, and the
% location of their execution.
%
% %\begin{table}[hb]
% \begin{center}
% %\caption {Execution Location for Implicit Flushes. }
% \begin{tabular}{ | p{0.6\linewidth} | l | }
% \hline
% \code{CONSTRUCT} & \makecell{\code{EXECUTION} \\ \code{LOCATION}} \\
% \hline
% \code{parallel} & upon entry and exit \\
% \hline
% \makecell[l]{worksharing \\ \hspace{1.5em}\code{for}, \code{do}
% \\ \hspace{1.5em}\code{sections}
% \\ \hspace{1.5em}\code{single}
% \\ \hspace{1.5em}\code{workshare} }
% & upon exit \\
% \hline
% \code{critical} & upon entry and exit \\
% \hline
% \code{target} & upon entry and exit \\
% \hline
% \code{barrier} & during \\
% \hline
% \code{atomic} operation with \plc{seq\_cst} clause & upon entry and exit \\
% \hline
% \code{ordered}* & upon entry and exit \\
% \hline
% \code{cancel}** and \code{cancellation point}** & during \\
% \hline
% \code{target data} & upon entry and exit \\
% \hline
% \code{target update} + \code{to} clause,
% \code{target enter data} & on entry \\
% \hline
% \code{target update} + \code{from} clause,
% \code{target exit data} & on exit \\
% \hline
% \code{omp\_set\_lock} & during \\
% \hline
% \makecell[l]{ \code{omp\_set/unset\_lock}, \code{omp\_test\_lock}***
% \\ \code{omp\_set/unset/test\_nest\_lock}*** }
% & during \\
% \hline
% task scheduling point & \makecell[l]{immediately \\ before and after} \\
% \hline
% \end{tabular}
% %\caption {Execution Location for Implicit Flushes. }
%
% \end{center}
% %\end{table}
%
% * without clauses and with \code{threads} or \code{depend} clauses \newline
% ** when \plc{cancel-var} ICV is \plc{true} (cancellation is turned on) and cancellation is activated \newline
% *** if the region causes the lock to be set or unset
%
% A flush with a list is implied for non-sequentially consistent \code{atomic} operations
% (\code{atomic} directive without a \code{seq\_cst} clause), where the list item is the
% specific storage location accessed atomically (specified as the \plc{x} variable
% in \plc{atomic Construct} subsection of the OpenMP Specifications document).
% Examples 1-3 show the difficulty of synchronizing threads through \code{flush} and \code{atomic} directives.
%===== Examples Sections =====
\input{memory_model/mem_model}
\input{memory_model/allocators}
\input{memory_model/fort_race}

19
Chap_ompt_interface.tex Normal file
View File

@ -0,0 +1,19 @@
\cchapter{OMPT Interface}{ompt_interface}
\label{chap:ompt_interface}
OMPT defines mechanisms and an API for interfacing with tools in the OpenMP program.
The OMPT API provides the following functionality:
\begin{itemize}
\addtolength{\itemindent}{1cm}
\item examines the state associated with an OpenMP thread
\item interprets the call stack of an OpenMP thread
\item receives notification about OpenMP events
\item traces activity on OpenMP target devices
\item assesses implementation-dependent details
\item controls a tool from an OpenMP application
\end{itemize}
The following sections will illustrate basic mechanisms and operations of the OMPT API.
\input{ompt_interface/ompt_start}

130
Chap_parallel_execution.tex Normal file
View File

@ -0,0 +1,130 @@
\cchapter{Parallel Execution}{parallel_execution}
\label{chap:parallel_execution}
A single thread, the \plc{initial thread}, begins sequential execution of
an OpenMP enabled program, as if the whole program is in an implicit parallel
region consisting of an implicit task executed by the \plc{initial thread}.
A \kcode{parallel} construct encloses code,
forming a parallel region. An \plc{initial thread} encountering a \kcode{parallel}
region forks (creates) a team of threads at the beginning of the
\kcode{parallel} region, and joins them (removes from execution) at the
end of the region. The initial thread becomes the primary thread of the team in a
\kcode{parallel} region with a \plc{thread} number equal to zero; the other
threads are numbered from 1 to the number of threads minus 1.
A team may be comprised of just a single thread.
Each \plc{thread} of a team is assigned an implicit task consisting of code within the
\kcode{parallel} region. The task that creates a \kcode{parallel} region is suspended while the
tasks of the team are executed. A thread is tied to its task; that is,
only the thread assigned to the task can execute that task. After completion
of the \kcode{parallel} region, the primary thread resumes execution of the generating task.
%After the \code{parallel} region the primary thread becomes the initial
%thread again, and continues to execute the \plc{sequential part}.
Any task within a \kcode{parallel} region is allowed to encounter another
\kcode{parallel} region to form a nested \kcode{parallel} region. The
parallelism of a nested \kcode{parallel} region (whether it forks additional
threads, or is executed serially by the encountering task) can be controlled by the
\kcode{OMP_NESTED} environment variable or the \kcode{omp_set_nested()}
API routine with arguments indicating true or false.
The number of threads of a \kcode{parallel} region can be set by the \kcode{OMP_NUM_THREADS}
environment variable, the \kcode{omp_set_num_threads()} routine, or on the \kcode{parallel}
directive with the \kcode{num_threads}
clause. The routine overrides the environment variable, and the clause overrides all.
Use the \kcode{OMP_DYNAMIC}
or the \kcode{omp_set_dynamic()} function to specify that the OpenMP
implementation dynamically adjust the number of threads for
\kcode{parallel} regions. The default setting for dynamic adjustment is implementation
defined. When dynamic adjustment is on and the number of threads is specified,
the number of threads becomes an upper limit for the number of threads to be
provided by the OpenMP runtime.
%\pagebreak
\bigskip
WORKSHARING CONSTRUCTS
A worksharing construct distributes the execution of the associated region
among the members of the team that encounter it. There is an
implied barrier at the end of the worksharing region
(there is no barrier at the beginning).
\newpage
The worksharing constructs are:
\begin{compactitem}
\item loop constructs: {\kcode{for} and \kcode{do} }
\item \kcode{sections}
\item \kcode{single}
\item \kcode{workshare}
\end{compactitem}
The \kcode{for} and \kcode{do} constructs (loop constructs) create a region
consisting of a loop. A loop controlled by a loop construct is called
an \plc{associated} loop. Nested loops can form a single region when the
\kcode{collapse} clause (with an integer argument) designates the number of
\plc{associated} loops to be executed in parallel, by forming a
``single iteration space'' for the specified number of nested loops.
The \kcode{ordered} clause can also control multiple associated loops.
An associated loop must adhere to a ``canonical form'' (specified in the
\docref{Canonical Loop Form} of the OpenMP Specifications document) which allows the
iteration count (of all associated loops) to be computed before the
(outermost) loop is executed. %[58:27-29].
Most common loops comply with the canonical form, including C++ iterators.
A \kcode{single} construct forms a region in which only one thread (any one
of the team) executes the region.
The other threads wait at the implied
barrier at the end, unless the \kcode{nowait} clause is specified.
The \kcode{sections} construct forms a region that contains one or more
structured blocks. Each block of a \kcode{sections} directive is
constructed with a \kcode{section} construct, and executed once by
one of the threads (any one) in the team. (If only one block is
formed in the region, the \kcode{section} construct, which is used to
separate blocks, is not required.)
The other threads wait at the implied
barrier at the end, unless the \kcode{nowait} clause is specified.
The \kcode{workshare} construct is a Fortran feature that consists of a
region with a single structured block (section of code). Statements in the
\kcode{workshare} region are divided into units of work, and executed (once)
by threads of the team.
\bigskip
MASKED CONSTRUCT
The \kcode{masked} construct is not a worksharing construct. The \kcode{masked} region is
executed only by the primary thread. There is no implicit barrier (and flush)
at the end of the \kcode{masked} region; hence the other threads of the team continue
execution of code statements beyond the \kcode{masked} region.
The \kcode{master} construct, which has been deprecated in OpenMP 5.1, has identical semantics
to the \kcode{masked} construct with no \kcode{filter} clause.
%===== Examples Sections =====
\input{parallel_execution/ploop}
\input{parallel_execution/parallel}
\input{parallel_execution/host_teams}
\input{parallel_execution/nthrs_nesting}
\input{parallel_execution/nthrs_dynamic}
\input{parallel_execution/fort_do}
\input{parallel_execution/nowait}
\input{parallel_execution/collapse}
\input{parallel_execution/linear_in_loop}
\input{parallel_execution/psections}
\input{parallel_execution/fpriv_sections}
\input{parallel_execution/single}
\input{parallel_execution/workshare}
\input{parallel_execution/masked}
\input{parallel_execution/loop}
\input{parallel_execution/pra_iterator}
\input{parallel_execution/set_dynamic_nthrs}
\input{parallel_execution/get_nthrs}

116
Chap_program_control.tex Normal file
View File

@ -0,0 +1,116 @@
\cchapter{Program Control}{program_control}
\label{chap:program_control}
Basic concepts and mechanisms for directing and controlling a program's compilation and execution
are provided in this introduction and illustrated in subsequent examples.
\bigskip
CONDITIONAL COMPILATION and EXECUTION
Conditional compilation can be performed with conventional \bcode{\#ifdef} directives
in C, C++, and Fortran, and additionally with OpenMP sentinel (\scode{!$}) in Fortran.
The \kcode{if} clause on some directives
can direct the runtime to ignore or alter the behavior of the construct.
Of course, the base-language \bcode{if} statements can be used to control the execution
of stand-alone directives (such as \kcode{flush}, \kcode{barrier}, \kcode{taskwait},
and \kcode{taskyield}).
However, the directives must appear in a block structure, and not as a substatement.
The \kcode{metadirective} and \kcode{declare variant} directives provide conditional
selection of directives and routines for compilation (and use), respectively.
The \kcode{assume} and \kcode{requires} directives provide invariants
for optimizing compilation, and essential features for compilation
and correct execution, respectively.
\bigskip
CANCELLATION
Cancellation (termination) of the normal sequence of execution for the threads in an OpenMP region can
be accomplished with the \kcode{cancel} construct. The construct uses a
\plc{construct-type-clause} to set the region-type to activate for the cancellation.
That is, inclusion of one of the \plc{construct-type-clause} names \kcode{parallel}, \kcode{for},
\kcode{do}, \kcode{sections} or \kcode{taskgroup} on the directive line
activates the corresponding region.
The \kcode{cancel} construct is activated by the first encountering thread, and it
continues execution at the end of the named region.
The \kcode{cancel} construct is also a cancellation point for any other thread of the team
to also continue execution at the end of the named region.
Also, once the specified region has been activated for cancellation any thread that encounters
a \kcode{cancellation point} construct with the same named region (\plc{construct-type-clause}),
continues execution at the end of the region.
For an activated \kcode{cancel taskgroup} construct, the tasks that
belong to the taskgroup set of the innermost enclosing taskgroup region will be canceled.
A task that encounters a \kcode{cancel taskgroup} construct continues execution at the end of its
task region. Any task of the taskgroup that has already begun execution will run to completion,
unless it encounters a \kcode{cancellation point}; tasks that have not begun execution may be
discarded as completed tasks.
\pagebreak
CONTROL VARIABLES
Internal control variables (ICVs) are used by implementations to hold values which control the execution
of OpenMP regions. Control (and hence the ICVs) may be set as implementation defaults,
or set and adjusted through environment variables, clauses, and API functions.
%Many of the ICV control values are accessible through API function calls.
Initial ICV values are reported by the runtime
if the \kcode{OMP_DISPLAY_ENV} environment variable has been set to \vcode{TRUE} or \vcode{VERBOSE}.
%As an example, the \plc{nthreads-var} is the ICV that holds the number of threads
%to be used in a \code{parallel} region. It can be set with the \code{OMP\_NUM\_THREADS} environment variable,
%the \code{omp\_set\_num\_threads()} API function, or the \code{num\_threads} clause. The default \plc{nthreads-var}
%value is implementation defined. All of the ICVs are presented in the \plc{Internal Control Variables} section
%of the \plc{Directives} chapter of the OpenMP Specifications document. Within the same document section, override
%relationships and scoping information can be found for applying user specifications and understanding the
%extent of the control.
\bigskip
NESTED CONSTRUCTS
Certain combinations of nested constructs are permitted, giving rise to \plc{combined} constructs
consisting of two or more directives. These can be used when the two (or several) constructs would be used
immediately in succession (closely nested). A combined construct can use the clauses of the component
constructs without restrictions.
A \plc{composite} construct is a combined construct which has one or more clauses with an (often obviously)
modified or restricted meaning, relative to when the constructs are uncombined. %%[appear separately (singly).
%The combined \code{parallel do} and \code{parallel for} constructs are formed by combining the \code{parallel}
%construct with one of the loops constructs \code{do} or \code{for}. The
%\code{parallel do SIMD} and \code{parallel for SIMD} constructs are composite constructs (composed from
%the parallel loop constructs and the \code{SIMD} construct), because the \code{collapse} clause must
%explicitly address the ordering of loop chunking \plc{and} SIMD ``combined'' execution.
Certain nestings are forbidden, and often the reasoning is obvious. For example, worksharing constructs cannot be nested, and
the \kcode{barrier} construct cannot be nested inside a worksharing construct, or a \kcode{critical} construct.
Also, \kcode{target} constructs cannot be nested, unless the nested target is a reverse offload.
The \kcode{parallel} construct can be nested, as well as the \kcode{task} construct.
The parallel execution in the nested \kcode{parallel} construct(s) is controlled by the
\kcode{OMP_MAX_ACTIVE_LEVELS} environment variable, and the \kcode{omp_set_max_active_levels} routine.
Use the \kcode{omp_get_max_active_levels} routine to determine the maximum levels provided by an implementation.
As of OpenMP 5.0, use of the \kcode{OMP_NESTED} environment variable and the \kcode{omp_set_nested} routine
has been deprecated.
More details on nesting can be found in the \docref{Nesting of Regions} of the \docref{Directives}
chapter in the OpenMP Specifications document.
%===== Examples Sections =====
\input{program_control/assumption}
\input{program_control/cond_comp}
\input{program_control/icv}
\input{program_control/standalone}
\input{program_control/cancellation}
\input{program_control/requires}
\input{program_control/context_based_variants}
\input{program_control/dispatch}
\input{program_control/nested_loop}
\input{program_control/nesting_restrict}
\input{program_control/target_offload}
\input{program_control/pause_resource}
\input{program_control/reproducible}
\input{program_control/interop}
\input{program_control/utilities}

101
Chap_synchronization.tex Normal file
View File

@ -0,0 +1,101 @@
\cchapter{Synchronization}{synchronization}
\label{chap:synchronization}
The \kcode{barrier} construct is a stand-alone directive that requires all threads
of a team (within a contention group) to execute the barrier and complete
execution of all tasks within the region, before continuing past the barrier.
The \kcode{critical} construct is a directive that contains a structured block.
The construct allows only a single thread at a time to execute the structured block (region).
Multiple \kcode{critical} regions may exist in a parallel region, and may
act cooperatively (only one thread at a time in all \kcode{critical} regions),
or separately (only one thread at a time in each \kcode{critical} region when
a unique name is supplied on each \kcode{critical} construct).
An optional (lock) \kcode{hint} clause may be specified on a named \kcode{critical}
construct to provide the OpenMP runtime guidance in selecting a locking
mechanism.
On a finer scale the \kcode{atomic} construct allows only a single thread at
a time to have atomic access to a storage location involving a single read,
write, update or capture statement, and a limited number of combinations
when specifying the \kcode{capture} \plc{atomic-clause} clause. The
\plc{atomic-clause} clause is required for some expression statements, but is
not required for \kcode{update} statements. The \plc{memory-order} clause can be
used to specify the degree of memory ordering enforced by an \kcode{atomic}
construct. From weakest to strongest, they are \kcode{relaxed} (the default),
\plc{acquire} and/or \plc{release} clauses (specified with \kcode{acquire}, \kcode{release},
or \kcode{acq_rel}), and \kcode{seq_cst}. Please see the details in the
\docref{atomic Construct} subsection of the \docref{Directives} chapter in the OpenMP
Specifications document.
% The following three sentences were stolen from the spec.
The \kcode{ordered} construct specifies a structured block in a loop,
simd, or loop SIMD region that will be executed in the order of the loop
iterations. The \kcode{ordered} construct sequentializes and orders the execution
of \kcode{ordered} regions while allowing code outside the region to run in parallel.
Since OpenMP 4.5 the \kcode{ordered} construct can also be a stand-alone
directive that specifies cross-iteration dependences in a \plc{doacross} loop nest.
The \kcode{depend} clause uses a \kcode{sink} \plc{dependence-type}, along with an
iteration vector argument (\plc{vec}) to indicate the iteration that satisfies the
dependence. The \kcode{depend} clause with a \kcode{source}
\plc{dependence-type} specifies dependence satisfaction.
The \kcode{flush} directive is a stand-alone construct for enforcing consistency
between a thread's view of memory and the view of memory for other threads (see
the Memory Model chapter of this document for more details). When the construct
is used with an explicit variable list, a \plc{strong flush} that forces a
thread's temporary view of memory to be consistent with the actual memory is
applied to all listed variables. When the construct is used without an explicit
variable list and without a \plc{memory-order} clause, a strong flush is
applied to all locally thread-visible data as defined by the base language, and
additionally the construct provides both acquire and release memory ordering
semantics. When an explicit variable list is not present and a
\plc{memory-order} clause is present, the construct provides acquire and/or
release memory ordering semantics according to the \plc{memory-order} clause,
but no strong flush is performed. A resulting strong flush that applies to a
set of variables effectively ensures that no memory (load or store)
operation for the affected variables may be reordered across the \kcode{flush}
directive.
General-purpose routines provide mutual exclusion semantics through locks,
represented by lock variables.
The semantics allows a task to \plc{set}, and hence
\plc{own} a lock, until it is \plc{unset} by the task that set it. A
\plc{nestable} lock can be set multiple times by a task, and is used
when code requires nested control of locks. A \plc{simple lock} can
only be set once by the owning task. There are specific calls for the two
types of locks, and the variable of a specific lock type cannot be used by the
other lock type.
Any explicit task will observe the synchronization prescribed in a
\kcode{barrier} construct and an implied barrier. Also, additional synchronizations
are available for tasks. All children of a task will wait at a \kcode{taskwait} (for
their siblings to complete). A \kcode{taskgroup} construct creates a region in which the
current task is suspended at the end of the region until all sibling tasks,
and their descendants, have completed.
Scheduling constraints on task execution can be prescribed by the \kcode{depend}
clause to enforce dependence on previously generated tasks.
More details on controlling task executions can be found in the \docref{Tasking} Chapter
in the OpenMP Specifications document. %(DO REF. RIGHT.)
%===== Examples Sections =====
\input{synchronization/critical}
\input{synchronization/worksharing_critical}
\input{synchronization/barrier_regions}
\input{synchronization/atomic}
\input{synchronization/atomic_cas}
\input{synchronization/atomic_restrict}
\input{synchronization/atomic_hint}
\input{synchronization/acquire_release}
\input{synchronization/ordered}
\input{synchronization/depobj}
\input{synchronization/doacross}
\input{synchronization/locks}
\input{synchronization/init_lock}
\input{synchronization/init_lock_with_hint}
\input{synchronization/lock_owner}
\input{synchronization/simple_lock}
\input{synchronization/nestable_lock}

63
Chap_tasking.tex Normal file
View File

@ -0,0 +1,63 @@
\cchapter{Tasking}{tasking}
\label{chap:tasking}
Tasking constructs provide units of work to a thread for execution.
Worksharing constructs do this, too (e.g. \kcode{for}, \kcode{do},
\kcode{sections}, and \kcode{single} constructs);
but the work units are tightly controlled by an iteration limit and limited
scheduling, or a limited number of \kcode{sections} or \kcode{single} regions.
Worksharing was designed
with ``data parallel'' computing in mind. Tasking was designed for
``task parallel'' computing and often involves non-locality or irregularity
in memory access.
The \kcode{task} construct can be used to execute work chunks: in a while loop;
while traversing nodes in a list; at nodes in a tree graph;
or in a normal loop (with a \kcode{taskloop} construct).
Unlike the statically scheduled loop iterations of worksharing, a task is
often enqueued, and then dequeued for execution by any of the threads of the
team within a parallel region. The generation of tasks can be from a single
generating thread (creating sibling tasks), or from multiple generators
in recursive graph tree traversals.
%(creating a parent-descendents hierarchy of tasks, see example 4 and 7 below).
A \kcode{taskloop} construct
bundles iterations of an associated loop into tasks, and provides
controls similar to those found in the \kcode{task} construct.
Sibling tasks are synchronized by the \kcode{taskwait} construct, and tasks
and their descendent tasks can be synchronized by containing them in
a \kcode{taskgroup} region. Ordered execution is accomplished by specifying
dependences with a \kcode{depend} clause. Also, priorities can be
specified as hints to the scheduler through a \kcode{priority} clause.
Various clauses can be used to manage and optimize task generation,
as well as reduce the overhead of execution and to relinquish
control of threads for work balance and forward progress.
Once a thread starts executing a task, it is the designated thread
for executing the task to completion, even though it may leave the
execution at a scheduling point and return later. The thread is \plc{tied}
to the task. Scheduling points can be introduced with the \kcode{taskyield}
construct. With an \kcode{untied} clause any other thread is allowed to continue
the task. An \kcode{if} clause with an expression that evaluates to \plc{false}
results in an \plc{undeferred} task, which instructs the runtime to suspend
the generating task until the undeferred task completes its execution.
By including the data environment of the generating task into the generated task with the
\kcode{mergeable} and \kcode{final} clauses, task generation overhead can be reduced.
A complete list of the tasking constructs and details of their clauses
can be found in the \docref{Tasking Constructs} chapter of the OpenMP Specifications.
%in the \docref{OpenMP Application Programming Interface} section.
%===== Examples Sections =====
\input{tasking/tasking}
\input{tasking/task_priority}
\input{tasking/task_dep}
\input{tasking/task_detach}
\input{tasking/taskgroup}
\input{tasking/taskyield}
\input{tasking/taskloop}
\input{tasking/parallel_masked_taskloop}
\input{tasking/taskloop_dep}

234
Contributions.md Normal file
View File

@ -0,0 +1,234 @@
# Contributing
The usual process for adding new examples, making changes or adding corrections
is to submit an issue for discussion and initial evaluation of changes or example additions.
When there is a consensus at a meeting about the contribution,
the issue will be brought forward for voting at the OpenMP Language
Committee meetings and you will be asked to submit a pull request.
Of course, if your contribution is an obvious correction, clarification, or note, you
may want to submit a pull request directly.
-----------------------------------------------------------
## The OpenMP Examples document
The OpenMP Examples document is in LaTeX format.
Please see the main LaTeX file, `openmp-examples.tex`, for more information.
## Maintainer
[OpenMP Examples Subcommittee](http://twiki.openmp.org/twiki/bin/view/OpenMPLang/OpenMPExamplesSubCommittee)
For a brief revision history, see `Changes.log` in the repo.
## Git procedure
* Fork your own branch of the OpenMP [examples-internal repo](https://github.com/OpenMP/examples-internal)
* Clone your fork locally
* If you are working on generic or old-version updates, create a branch off main.
* If you are working on an example for a release candidate for version #.#, create a branch off work_#.#.
1) `git clone --branch <main|work_#.#> https://github.com/<my_account>/examples-internal`
2) `git checkout -b <branch_name>`
3) ... `add`, `commit`
4) `git push -u origin <branch_name>`
5) `make` or `make diff` will create a full-document pdf or just a pdf with differences (do this at any point).
* `git status` and `git branch -a` are your friends
* Submit an issue for your work (usually with a diff pdf), and then you will be asked to submit a pull request
* Create an issue by selecting the [issue tab](https://github.com/OpenMP/examples-internal/issues) and clicking on `new issue`.
* Use this MarkDown Cheatsheet for [issue formatting](https://wordpress.com/support/markdown-quick-reference/)
* More MarkDown details are available [here](https://markdown-it.github.io)
* You can cut and paste markdown formatted text in a [reader](https://dillinger.io) to see formatting effects.
* Forced spaces are available in Markdown. On a Mac it is "option+space".
* Polling is available. Go to [gh-poll](https://app.gh-polls.com/). Type an option on each line, then click `copy markdown`, and paste the contents into the issue. (Use preview to check your poll, and then submit it.)
* Create a pull request
## Processing source code
* Prepare source code (C/C++ and Fortran) and a text description (use similar styles found in recent examples)
* Determine the *example* name `<ename>`, *sequence* identifier `<seq-id>` and *compiler* suffix `<csuffix>` for the example
* The syntax is: `<ename>.<seq-id>.<csuffix>` (e.g. `affinity_display.1.f90`)
* The example name may be a Section name (e.g. affinity), or a Subsection name (affinity_display)
* If you are creating a new Chapter, it may be the chapter name.
* New examples are usually added at the end of a Section or Subsection. Number it as the next number in the sequence numbers for examples in that Section or Subsection.
* The compiler suffix `<csuffix>` is `c`, `cpp`, `f`, and `f90` for C, C++ and Fortran (fixed/free form) codes.
* Insert the code in the sources directory for each chapter, and include the following metadata:
* Metadata Tags for example sources:
```
@@name: <ename>.<seq-no>
@@type: C|C++|F-fixed|F-free
@@operation: view|compile|link|run
@@expect: success|ct-error|rt-error|unspecified
@@version: [pre_]omp_<verno>
@@env: <environment_variables>
@@depend: <source_code_name>
```
* **name**
- is the name of an example
* **type**
- is the source code type, which can be translated into or from proper file extension (C:c,C++:cpp,F-fixed:f,F-free:f90)
* **operation**
- indicates how the source code is treated. Possible values are:
- `view` - code for illustration only, not compilable;
- `compile` - incomplete program, such as function or subroutine;
- `link` - complete program, but no verification value;
- `run` - complete program with verification value.
* **expect**
- indicates some expected result for testing purpose.
- `success` means no issue;
- `ct-error` applies to the result of code compilation;
- `rt-error` is for a case where compilation may be successful, but the code
contains potential runtime issues (including race condition);
- `unspecified` could result from a non-conforming code or is for code
that is viewable only.
* **version**
- indicates that the example uses features in a specific OpenMP version, such as "`omp_5.0`".
The prefix `pre_` indicates that the example uses features prior to a specific version, such as "`pre_omp_3.0`".
* **env**
- specifies any environment variables needed to run the code.
This tag is optional and can be repeated.
* **depend**
- specifies a source code file on which the current code depends.
This tag is optional and can be repeated.
* For **env** and **depend**, make sure to specify
a proper skipping number `<s>` in the LaTeX macros described below
to match with the number of `env` and `depend` tags.
## Process for text
* Create or update the description text in a Section/Subsection file under each chapter directory, usually `<chap_directory>/<ename>.tex`
* If adding a new Subsection, just include it in the appropriate subsection file (`<subsection>.tex`)
* If adding a new Section, create a `<section>.tex` file and add an entry in the corresponding chapter file, such as `Chap_affinity.tex`
* If adding a new Chapter, create a `Chap_<chap_name>.tex` file with introductory text, and add a new `<section>.tex` file with text and links to the code. Update `Makefile` and `openmp-examples.tex` to include the new chapter file.
* Commit your changes into your fork of examples-internal
* Submit your issue at [OpenMP Examples internal repo](https://github.com/openmp/examples-internal/issues), and include a PDF when ready.
* Examples subcommittee members can view [meeting schedule and notes](http://twiki.openmp.org/twiki/bin/view/OpenMPLang/ExamplesSchedules)
* Shepherd your issue to acceptance (discussed at weekly Examples meeting and in issue comments)
* When it is in a ready state, you should then submit a pull request.
* It will be reviewed and voted on, and changes will be requested.
* Once the last changes are made, it will be verified and merged into an appropriate branch (either the `main` branch or a working branch).
## LaTeX macros for examples
The following describes LaTeX macros defined specifically for examples.
* Source code with language h-rules
* Source code without language h-rules
* Language h-rules
* Macros for keywords in text description
* Other macros
* See `openmp.sty` for more information
### Source code with language h-rules
```
\cexample[<verno>]{<ename>}{<seq-no>}[<s>] % for C/C++ examples
\cppexample[<verno>]{<ename>}{<seq-no>}[<s>] % for C++ examples
\fexample[<verno>]{<ename>}{<seq-no>}[<s>] % for fixed-form Fortran examples
\ffreeexample[<verno>]{<ename>}{<seq-no>}[<s>] % for free-form Fortran examples
```
### Source code without language h-rules
```
\cnexample[<verno>]{<ename>}{<seq-no>}[<s>]
\cppnexample[<verno>]{<ename>}{<seq-no>}[<s>]
\fnexample[<verno>]{<ename>}{<seq-no>}[<s>]
\ffreenexample[<verno>]{<ename>}{<seq-no>}[<s>]
\srcnexample[<verno>]{<ename>}{<seq-no>}{<ext>}[<s>]
```
Optional `<verno>` can be supplied in a macro to include a specific OpenMP
version in the example header. This option also suggests one additional
tag (`@@version`) line is included in the corresponding source code.
If this is not the case (i.e., no `@@version` tag line), one needs to
prefix `<verno>` with an underscore '\_' symbol in the macro.
The exception is macro `\srcnexample`, for which the corresponding
source code might not contain any `@@` metadata tags. The `ext` argument
to this macro is the file extension (such as `h`, `hpp`, `inc`).
The `<s>` option to each macro allows finer control of any additional lines
to be skipped due to addition of new `@@` tags, such as `@@env`.
The default value for `<s>` is 0.
### Language h-rules
```
\cspecificstart, \cspecificend
\cppspecificstart, \cppspecificend
\ccppspecificstart, \ccppspecificend
\fortranspecificstart, \fortranspecificend
\begin{cspecific}[s] ... \end{cspecific}
\begin{cppspecific}[s] ... \end{cppspecific}
\begin{ccppspecific}[s] ... \end{ccppspecific}
\begin{fortranspecific}[s] ... \end{fortranspecific}
\topmarker{Lang}
```
Use of the structured `\begin{} .. \end{}` environments is the preferred
way of specifying language-dependent text over the unstructured approach
of using `\*specificstart` and `\*specificend`.
The option `[s]` to each of the environments can specify a vertical shift
for the beginning rule, such as when followed by a section header.
The macro `\topmarker` puts a dashed blue line floater at top of a page for
"Lang (cont.)" where `Lang` can be `C/C++`, `C++`, `Fortran`.
### Macros for keywords in text description
A partial list:
- `\kcode{}` - for OpenMP keywords, such as directives, clauses, environment variables, API routines. Support direct use of '_' (underscore) and ' ' (space)
- `\scode{}` - OpenMP specifier with special chars, such as '`$`' in "`!$omp`"
- `\bcode{}` - base language keywords (such as `ASSOCIATE` in Fortran)
- `\vcode{}` - values of a keyword, such as `TRUE`, `FALSE`, `VERBOSE`
- `\plc{}` - OpenMP concept, such as ICV names; `\splc{}` - escape '_' (underscore)
- `\example{}` - example names, such as `\example{taskloop_reduction.1}`
- `\docref{}` - chapter or section name of a document, such as the spec
- `\ucode{}` - program variables, procedure names, or expressions in example codes. Support direct use of '_' (underscore) and ' ' (space).
- `\pout{}` - program outputs
Examples:
- `\kcode{declare reduction}` for **declare reduction**
- `\scode{!$omp}` sentinel, however, `\kcode{\#pragma omp}`
- `\kcode{map(iterator(\ucode{i=0:n}), tofrom: \ucode{p[i]})}` for **map(iterator(**_i=0:n_**), tofrom:** _p[i]_**)**
- Fortran `\bcode{IMPLICIT NONE}` statement
- The `\vcode{VERBOSE}` value for `\kcode{OMP_DISPLAY_ENV}`
- OpenMP `\plc{directives}`, the `\plc{num-threads}` ICV
- This is an example name `\example{taskloop_reduction.1}`
- `(\ucode{x,y,z})` argument for procedure `\ucode{a_proc_name}`
- structure constructor `\ucode{point($\ldots$)}`
- This is a code output `"\pout{x = 1}"`
### Other macros
```
\cchapter{<Chapter Name>}{<chap_directory>}
\hexentry[ext1]{<example_name>}[ext2]{<earlier_tag>}
\hexmentry[ext1]{<example_name>}[ext2]{<earlier_tag>}{<prior_name>}
\examplesref{<verno>}
\examplesblob{<verno/file>}
```
The `\cchapter` macro is used for starting a chapter with proper page spacing.
`<Chapter Name>` is the name of a chapter and `<chap_directory>` is the name
of the chapter directory. All section and subsection files for the chapter
should be placed under `<chap_directory>`. The corresponding example sources
should be placed under the `sources` directory inside `<chap_directory>`.
A previously-defined macro `\sinput{<section_file>}` to import a section
file from `<chap_directory>` is no longer supported. Please use
`\input{<chap_directory>/<section_file>}` explicitly.
The two macros `\hexentry` and `\hexmentry` are defined for simplifying
entries in the feature deprecation and update tables. Option `[ext1]` is
the file extension with a default value of `c` and option `[ext2]` is
the file extension for the associated second file if present.
`<earlier_tag>` is the version tag of the corresponding example
in the earlier version. `\hexentry` assumes no name change for an example
in different versions; `\hexmentry` can be used to specify a prior name
if it is different.
The two macros `\examplesref` and `\examplesblob` are for referencing
a specific version of or a file in the github Examples repository.
## License
For copyright information, please see [omp_copyright.txt](omp_copyright.txt).

282
Deprecated_Features.tex Normal file
View File

@ -0,0 +1,282 @@
\cchapter{Feature Deprecations and Updates in Examples}{deprecated_features}
\label{chap:deprecated_features}
\label{sec:deprecated_features}
\index{deprecated features}
\newcommand\tabpcont[1]{\multicolumn{2}{l}{\small\slshape table continued #1 page}}
\newcommand\tabpheader{\textbf{Version} & \textbf{Deprecated Feature} &
\textbf{Replacement}}
\newcommand\tabuheader{\textbf{Example Name} & \textbf{Earlier Version} &
\textbf{Feature Updated}}
\newcommand\dpftable[1]{
\renewcommand{\arraystretch}{1.0}
\tablefirsthead{%
\hline\\[-2ex]
\tabuheader\\[2pt]
\hline\\[-2ex]
}
\tablehead{%
\tabpcont{from previous}\\[2pt]
\hline\\[-2ex]
\tabuheader\\[2pt]
\hline\\[-2ex]
}
\tabletail{%
\hline\\[-2.5ex]
\tabpcont{on next}\\
}
\tablelasttail{\hline\\[-1ex]}
\tablecaption{Updated Examples for Features Deprecated in Version #1\label{tab:Updated Examples #1}}
}
Deprecation of features began in OpenMP 5.0.
Examples that use a deprecated feature have been updated with an equivalent
replacement feature.
Table~\ref{tab:Deprecated Features} summarizes deprecated features and
their replacements in each version. Affected examples are updated
accordingly and listed in Section~\ref{sec:Updated Examples}.
\nolinenumbers
\renewcommand{\arraystretch}{1.4}
\tablefirsthead{%
\hline
\tabpheader\\
\hline\\[-3.5ex]
}
\tablehead{%
\tabpcont{from previous}\\
\hline
\tabpheader\\
\hline\\[-3ex]
}
\tabletail{%
\hline\\[-4ex]
\tabpcont{on next}\\
}
\tablelasttail{\hline\\[-2ex]}
\tablecaption{Deprecated Features and Their Replacements\label{tab:Deprecated Features}}
\begin{supertabular}{p{0.4in} p{2.3in} p{2.2in}}
6.0 & \kcode{declare reduction(}\plc{reduction-id}: \plc{typename-list}: \plc{combiner}\kcode{)}
& \kcode{declare reduction(}\plc{reduction-id}: \plc{typename-list}\kcode{)} \kcode{combiner(\plc{combiner-exp})} \\
\hline
5.2 & \kcode{default} clause on metadirectives
& \kcode{otherwise} clause \\
5.2 & delimited \kcode{declare target} directive for C/C++
& \kcode{begin declare target} directive \\
5.2 & \kcode{to} clause on \kcode{declare target} directive
& \kcode{enter} clause \\
5.2 & non-argument \kcode{destroy} clause on \kcode{depobj} construct
& \kcode{destroy(\plc{argument})} \\
5.2 & \kcode{allocate} directive for Fortran \bcode{ALLOCATE} statements
& \kcode{allocators} directive \\
5.2 & \kcode{depend} clause on \kcode{ordered} construct
& \kcode{doacross} clause \\
5.2 & \kcode{linear(\plc{modifier(list): linear-step})} clause
& \kcode{linear(\plc{list:} step(\plc{linear-step})\plc{, modifier})} clause \\
\hline
5.1 & \kcode{master} construct
& \kcode{masked} construct \\
5.1 & \kcode{master} affinity policy
& \kcode{primary} affinity policy \\
\hline
5.0 & \kcode{omp_lock_hint_*} constants
& \kcode{omp_sync_hint_*} constants \\[2pt]
\end{supertabular}
\linenumbers
These replacements appear in examples that otherwise illustrate earlier features.
When using a compiler that is compliant with a version prior to
the indicated version, the earlier form of an example for a previous
version is listed as a reference.
\newpage
\section{Updated Examples for Different Versions}
\label{sec:Updated Examples}
The following tables list the updated examples for different versions as
a result of feature deprecation. The \emph{Earlier Version} column of
the tables shows the version tag of the earlier version. It also shows
the prior name of an example when it has been renamed.
Table~\ref{tab:Updated Examples 6.0} lists the updated examples for
features deprecated in OpenMP 6.0
in the Examples Document Version
\href{https://github.com/OpenMP/Examples/tree/v6.0}{6.0}.
The \emph{Earlier Version} column of the table lists the earlier version
tags of the examples that can be found in
the Examples Document Version
\href{https://github.com/OpenMP/Examples/tree/v5.2}{5.2}.
\index{clauses!combiner@\kcode{combiner}}
\index{combiner clause@\kcode{combiner} clause}
\nolinenumbers
\dpftable{6.0}
\begin{supertabular}{p{1.7in} p{1.1in} p{2.2in}}
\hexentry{udr.1}[f90]{4.0} &
\plc{combiner} expression in \kcode{declare} \\
\hexentry{udr.2}[f90]{4.0} &
\kcode{reduction} directive changed to use \\
\hexentry{udr.3}[f90]{4.0} & \kcode{combiner} clause \\
\hexentry[f90]{udr.4}{4.0} & \\
\hexentry[cpp]{udr.5}{4.0} & \\
\hexentry[cpp]{udr.6}{4.0} & \\[2pt]
\end{supertabular}
\linenumbers
Table~\ref{tab:Updated Examples 5.2} lists the updated examples for
features deprecated in OpenMP 5.2
in the Examples Document Version \examplesref{5.2}.
The \emph{Earlier Version} column of the table lists the earlier version
tags of the examples that can be found in
the Examples Document Version \examplesref{5.1}.
\index{clauses!default@\kcode{default}}
\index{clauses!otherwise@\kcode{otherwise}}
\index{clauses!to@\kcode{to}}
\index{clauses!enter@\kcode{enter}}
\index{clauses!depend@\kcode{depend}}
\index{clauses!doacross@\kcode{doacross}}
\index{clauses!linear@\kcode{linear}}
\index{clauses!destroy@\kcode{destroy}}
\index{default clause@\kcode{default} clause}
\index{otherwise clause@\kcode{otherwise} clause}
\index{to clause@\kcode{to} clause}
\index{enter clause@\kcode{enter} clause}
\index{depend clause@\kcode{depend} clause}
\index{doacross clause@\kcode{doacross} clause}
\index{linear clause@\kcode{linear} clause}
\index{destroy clause@\kcode{destroy} clause}
\index{directives!begin declare target@\kcode{begin declare target}}
\index{begin declare target directive@\kcode{begin declare target} directive}
\index{allocate directive@\kcode{allocate} directive}
\index{allocators directive@\kcode{allocators} directive}
\nolinenumbers
\dpftable{5.2}
\begin{supertabular}{p{1.7in} p{1.2in} p{2.1in}}
\hexentry{error.1}[f90]{5.1} &
\kcode{default} clause on metadirectives \\
\hexentry{metadirective.1}[f90]{5.0} &
replaced with \kcode{otherwise} clause \\
\hexentry{metadirective.2}[f90]{5.0} & \\
\hexentry{metadirective.3}[f90]{5.0} & \\
\hexentry{metadirective.4}[f90]{5.1} & \\
\hexentry{target_ptr_map.4}{5.1} & \\
\hexentry{target_ptr_map.5}[f90]{5.1} & \\[2pt]
\hline\\[-2ex]
\hexentry[f90]{array_shaping.1}{5.0} &
\kcode{to} clause on \kcode{declare target} \\
\hexentry{target_reverse_offload.7}{5.0} &
directive replaced with \kcode{enter} clause \\
\hexentry{target_task_reduction.1}[f90]{5.1} & \\
\hexentry{target_task_reduction.2a}[f90]{5.0} & \\
\hexentry{target_task_reduction.2b}[f90]{5.1} &\\[2pt]
\hline\\[-2ex]
\hexentry{array_shaping.1}{5.0} &
delimited \kcode{declare target} \\
\hexentry{async_target.1}{4.0} &
directive replaced with \\
\hexentry{async_target.2}{4.0} &
\kcode{begin declare target} \\
\hexentry{declare_target.1}{4.0} &
directive for C/C++ \\
\hexentry[cpp]{declare_target.2c}{4.0} & \\
\hexentry{declare_target.3}{4.0} & \\
\hexentry{declare_target.4}{4.0} & \\
\hexentry{declare_target.5}{4.0} & \\
\hexentry{declare_target.6}{4.0} & \\
\hexentry{declare_variant.1}{5.0} & \\
\hexentry{device.1}{4.0} & \\
\hexentry{metadirective.3}{5.0} & \\
\hexentry{target_ptr_map.2}{5.0} & \\
\hexentry{target_ptr_map.3a}{5.0} & \\
\hexentry{target_ptr_map.3b}{5.0} & \\
\hexentry{target_struct_map.1}{5.0} & \\
\hexentry[cpp]{target_struct_map.2}{5.0} & \\
\hexentry{target_struct_map.3}{5.0} & \\
\hexentry{target_struct_map.4}{5.0} & \\[2pt]
\hline\\[-2ex]
\hexentry{doacross.1}[f90]{4.5} &
\kcode{depend} clause on \kcode{ordered} \\
\hexentry{doacross.2}[f90]{4.5} &
construct replaced with \kcode{doacross} \\
\hexentry{doacross.3}[f90]{4.5} &
clause \\
\hexentry{doacross.4}[f90]{4.5} & \\[2pt]
\hline\\[-2ex]
\hexentry[cpp]{linear_modifier.1}[f90]{4.5} &
modifier syntax change for \kcode{linear} \\
\hexentry[cpp]{linear_modifier.2}[f90]{4.5} &
clause on \kcode{declare simd} directive \\
\hexentry{linear_modifier.3}[f90]{4.5} & \\[2pt]
\hline\\[-2ex]
\hexentry[f90]{allocators.1}{5.0} &
\kcode{allocate} directive replaced with \kcode{allocators} directive
for Fortran \bcode{ALLOCATE} statements \\[2pt]
\hline\\[-2ex]
\hexentry{depobj.1}[f90]{5.0} &
argument added to \kcode{destroy} clause on \kcode{depobj}
construct \\[2pt]
\end{supertabular}
\linenumbers
\newpage
Table~\ref{tab:Updated Examples 5.1} lists the updated examples for
features deprecated in OpenMP 5.1
in the Examples Document Version \examplesref{5.1}.
The \emph{Earlier Version} column of the table lists the earlier version
tags and prior names of the examples that can be found in
the Examples Document Version \examplesref{5.0.1}.
\index{affinity!master policy@\kcode{master} policy}
\index{affinity!primary policy@\kcode{primary} policy}
\index{constructs!master@\kcode{master}}
\index{constructs!masked@\kcode{masked}}
\index{master construct@\kcode{master} construct}
\index{masked construct@\kcode{masked} construct}
\nolinenumbers
\dpftable{5.1}
\begin{supertabular}{p{1.8in} p{1.4in} p{1.8in}}
\hexentry{affinity.5}[f]{4.0} &
\kcode{master} affinity policy replaced with \kcode{primary} policy \\[2pt]
\hline\\[-2ex]
\hexentry{async_target.3}[f90]{5.0} &
\kcode{master} construct replaced \\
\hexentry{cancellation.2}[f90]{4.0} &
with \kcode{masked} construct \\
\hexentry{copyprivate.2}[f]{3.0} & \\
\hexentry[f]{fort_sa_private.5}{3.0} & \\
\hexentry{lock_owner.1}[f]{3.0} & \\
\hexmentry{masked.1}[f]{3.0}{master.1} & \\
\hexmentry{parallel_masked_taskloop.1}[f90]{5.0}{parallel_master_taskloop.1} &\\
\hexentry{reduction.6}[f]{3.0} & \\
\hexentry{target_task_reduction.1}[f90]{5.0} & \\
\hexentry{target_task_reduction.2b}[f90]{5.0} & \\
\hexentry{taskloop_simd_reduction.1}[f90]{5.0} & \\
\hexentry{task_detach.1}[f90]{5.0} & \\[2pt]
\end{supertabular}
\linenumbers
Table~\ref{tab:Updated Examples 5.0} lists the updated examples for
features deprecated in OpenMP 5.0
in the Examples Document Version \examplesref{5.1}.
The \emph{Earlier Version} column of the table lists the earlier version
tags of the examples that can be found in
the Examples Document Version \examplesref{5.0.1}.
\nolinenumbers
\dpftable{5.0}
\begin{supertabular}{p{1.6in} p{1.3in} p{2.1in}}
\hexentry{critical.2}[f]{4.5} &
\kcode{omp_lock_hint_*} constants \\
\hexentry[cpp]{init_lock_with_hint.1}[f]{4.5} &
replaced with \kcode{omp_sync_hint_*} constants \\[2pt]
\end{supertabular}
\linenumbers

View File

@ -1,9 +0,0 @@
\chapter*{Examples}
\label{chap:examples}
The following are examples of the OpenMP API directives, constructs, and routines.
\ccppspecificstart
A statement following a directive is compound only when necessary, and a
non-compound statement is indented with respect to a directive preceding it.
\ccppspecificend

View File

@ -1,109 +0,0 @@
\pagebreak
\chapter{SIMD Constructs}
\label{chap:SIMD}
The following examples illustrate the use of SIMD constructs for vectorization.
Compilers may not vectorize loops when they are complex or possibly have
dependencies, even though the programmer is certain the loop will execute
correctly as a vectorized loop. The \code{simd} construct assures the compiler
that the loop can be vectorized.
\cexample{SIMD}{1c}
\fexample{SIMD}{1f}
When a function can be inlined within a loop the compiler has an opportunity to
vectorize the loop. By guaranteeing SIMD behavior of a function's operations,
characterizing the arguments of the function and privatizing temporary
variables of the loop, the compiler can often create faster, vector code for
the loop. In the examples below the \code{declare} \code{simd} construct is
used on the \plc{add1} and \plc{add2} functions to enable creation of their
corresponding SIMD function versions for execution within the associated SIMD
loop. The functions characterize two different approaches of accessing data
within the function: by a single variable and as an element in a data array,
respectively. The \plc{add3} C function uses dereferencing.
The \code{declare} \code{simd} constructs also illustrate the use of
\code{uniform} and \code{linear} clauses. The \code{uniform(fact)} clause
indicates that the variable \plc{fact} is invariant across the SIMD lanes. In
the \plc{add2} function \plc{a} and \plc{b} are included in the \code{uniform}
list because the C pointer and the Fortran array references are constant. The
\plc{i} index used in the \plc{add2} function is included in a \code{linear}
clause with a constant-linear-step of 1, to guarantee a unity increment of the
associated loop. In the \code{declare} \code{simd} construct for the \plc{add3}
C function the \code{linear(a,b:1)} clause instructs the compiler to generate
unit-stride loads across the SIMD lanes; otherwise, costly \emph{gather}
instructions would be generated for the unknown sequence of access of the
pointer dereferences.
In the \code{simd} constructs for the loops the \code{private(tmp)} clause is
necessary to assure that each vector operation has its own \plc{tmp}
variable.
\cexample{SIMD}{2c}
\fexample{SIMD}{2f}
A thread that encounters a SIMD construct executes a vectorized code of the
iterations. Similar to the concerns of a worksharing loop, a loop vectorized
with a SIMD construct must assure that temporary and reduction variables are
privatized and declared as reductions with clauses. The example below
illustrates the use of \code{private} and \code{reduction} clauses in a SIMD
construct.
\cexample{SIMD}{3c}
\fexample{SIMD}{3f}
A \code{safelen(N)} clause in a \code{simd} construct assures the compiler that
there are no loop-carried dependencies for vectors of size \plc{N} or below. If
the \code{safelen} clause is not specified, then the default safelen value is
the number of loop iterations.
The \code{safelen(16)} clause in the example below guarantees that the vector
code is safe for vectors up to and including size 16. In the loop, \plc{m} can
be 16 or greater, for correct code execution. If the value of \plc{m} is less
than 16, the behavior is undefined.
\cexample{SIMD}{4c}
\fexample{SIMD}{4f}
The following SIMD construct instructs the compiler to collapse the \plc{i} and
\plc{j} loops into a single SIMD loop in which SIMD chunks are executed by
threads of the team. Within the workshared loop chunks of a thread, the SIMD
chunks are executed in the lanes of the vector units.
\cexample{SIMD}{5c}
\fexample{SIMD}{5f}
The following examples illustrate the use of the \code{declare} \code{simd}
construct with the \code{inbranch} and \code{notinbranch} clauses. The
\code{notinbranch} clause informs the compiler that the function \plc{foo} is
never called conditionally in the SIMD loop of the function \plc{myaddint}. On
the other hand, the \code{inbranch} clause for the function \plc{goo} indicates that
the function is always called conditionally in the SIMD loop inside
the function \plc{myaddfloat}.
\cexample{SIMD}{6c}
\fexample{SIMD}{6f}
In the code below, the function \plc{fib()} is called in the main program and
also recursively called in the function \plc{fib()} within an \code{if}
condition. The compiler creates a masked vector version and a non-masked vector
version for the function \plc{fib()} while retaining the original scalar
version of the \plc{fib()} function.
\cexample{SIMD}{7c}
\fexample{SIMD}{7f}

View File

@ -1,35 +0,0 @@
\pagebreak
\chapter{Array Sections in Device Constructs}
\label{chap:array_sections}
The following examples show the usage of array sections in \code{map} clauses
on \code{target} and \code{target} \code{data} constructs.
This example shows the invalid usage of two separate sections of the same array
inside of a \code{target} construct.
\cexample{array_sections}{1c}
\fexample{array_sections}{1f}
This example shows the invalid usage of two separate sections of the same array
inside of a \code{target} construct.
\cexample{array_sections}{2c}
\fexample{array_sections}{2f}
This example shows the valid usage of two separate sections of the same array inside
of a \code{target} construct.
\cexample{array_sections}{3c}
\fexample{array_sections}{3f}
This example shows the valid usage of a wholly contained array section of an already
mapped array section inside of a \code{target} construct.
\cexample{array_sections}{4c}
\fexample{array_sections}{4f}

View File

@ -1,32 +0,0 @@
\pagebreak
\chapter{Fortran \code{ASSOCIATE} Construct}
\fortranspecificstart
\label{chap:associate}
The following is an invalid example of specifying an associate name on a data-sharing attribute
clause. The constraint in the Data Sharing Attribute Rules section in the OpenMP
4.0 API Specifications states that an associate name preserves the association
with the selector established at the \code{ASSOCIATE} statement. The associate
name \plc{b} is associated with the shared variable \plc{a}. With the predetermined data-sharing
attribute rule, the associate name \plc{b} is not allowed to be specified on the \code{private}
clause.
\fnexample{associate}{1f}
In the next example, within the \code{parallel} construct, the association name \plc{thread\_id}
is associated with the private copy of \plc{i}. The print statement should output the
unique thread number.
\fnexample{associate}{2f}
The following example illustrates the effect of specifying a selector name on a data-sharing
attribute clause. The associate name \plc{u} is associated with \plc{v} and the variable \plc{v}
is specified on the \code{private} clause of the \code{parallel} construct.
The construct association is established prior to the \code{parallel} region.
The association between \plc{u} and the original \plc{v} is retained (see the Data Sharing
Attribute Rules section in the OpenMP 4.0 API Specifications). Inside the \code{parallel}
region, \plc{v} has the value of -1 and \plc{u} has the value of the original \plc{v}.
\fnexample{associate}{3f}
\fortranspecificend

View File

@ -1,54 +0,0 @@
\pagebreak
\chapter{Asynchronous Execution of a \code{target} Region Using Tasks}
\label{chap:async_target}
The following example shows how the \code{task} and \code{target} constructs
are used to execute multiple \code{target} regions asynchronously. The task that
encounters the \code{task} construct generates an explicit task that contains
a \code{target} region. The thread executing the explicit task encounters a task
scheduling point while waiting for the execution of the \code{target} region
to complete, allowing the thread to switch back to the execution of the encountering
task or one of the previously generated explicit tasks.
\cexample{async_target}{1c}
The Fortran version has an interface block that contains the \code{declare} \code{target}.
An identical statement exists in the function declaration (not shown here).
\fexample{async_target}{1f}
The following example shows how the \code{task} and \code{target} constructs
are used to execute multiple \code{target} regions asynchronously. The task dependence
ensures that the storage is allocated and initialized on the device before it is
accessed.
\cexample{async_target}{2c}
The Fortran example below is similar to the C version above. Instead of pointers, though, it uses
the convenience of Fortran allocatable arrays on the device. An allocatable array has the
same behavior in a \code{map} clause as a C pointer, in this case.
If there is no shape specified for an allocatable array in a \code{map} clause, only the array descriptor
(also called a dope vector) is mapped. That is, device space is created for the descriptor, and it
is initially populated with host values. In this case, the \plc{v1} and \plc{v2} arrays will be in a
non-associated state on the device. When space for \plc{v1} and \plc{v2} is allocated on the device
the addresses to the space will be included in their descriptors.
At the end of the first \code{target} region, the descriptor (of an unshaped specification of an allocatable
array in a \code{map} clause) is returned with the raw device address of the allocated space.
The content of the array is not returned. In the example the data in arrays \plc{v1} and \plc{v2}
are not returned. In the second \code{target} directive, the \plc{v1} and \plc{v2} descriptors are
re-created on the device with the descriptive information; and references to the
vectors point to the correct local storage, of the space that was not freed in the first \code{target}
directive. At the end of the second \code{target} region, the data in array \plc{p} is copied back
to the host since \plc{p} is not an allocatable array.
A \code{depend} clause is used in the \code{task} directive to provide a wait at the beginning of the second
\code{target} region, to ensure that there is no race condition with \plc{v1} and \plc{v2} in the two tasks.
It would be noncompliant to use \plc{v1} and/or \plc{v2} in lieu of \plc{N} in the \code{depend} clauses,
because the use of non-allocated allocatable arrays as list items in the first \code{depend} clause would
lead to unspecified behavior.
\fexample{async_target}{2f}

View File

@ -1,44 +0,0 @@
\pagebreak
\chapter{The \code{atomic} Construct}
\label{chap:atomic}
The following example avoids race conditions (simultaneous updates of an element
of \plc{x} by multiple threads) by using the \code{atomic} construct.
The advantage of using the \code{atomic} construct in this example is that it
allows updates of two different elements of \plc{x} to occur in parallel. If
a \code{critical} construct were used instead, then all updates to elements of
\plc{x} would be executed serially (though not in any guaranteed order).
Note that the \code{atomic} directive applies only to the statement immediately
following it. As a result, elements of \plc{y} are not updated atomically in
this example.
\cexample{atomic}{1c}
\fexample{atomic}{1f}
The following example illustrates the \code{read} and \code{write} clauses
for the \code{atomic} directive. These clauses ensure that the given variable
is read or written, respectively, as a whole. Otherwise, some other thread might
read or write part of the variable while the current thread was reading or writing
another part of the variable. Note that most hardware provides atomic reads and
writes for some set of properly aligned variables of specific sizes, but not necessarily
for all the variable types supported by the OpenMP API.
\cexample{atomic}{2c}
\fexample{atomic}{2f}
The following example illustrates the \code{capture} clause for the \code{atomic}
directive. In this case the value of a variable is captured, and then the variable
is incremented. These operations occur atomically. This particular example could
be implemented using the fetch-and-add instruction available on many kinds of hardware.
The example also shows a way to implement a spin lock using the \code{capture}
and \code{read} clauses.
\cexample{atomic}{3c}
\fexample{atomic}{3f}

View File

@ -1,25 +0,0 @@
\pagebreak
\chapter{Restrictions on the \code{atomic} Construct}
\label{chap:atomic_restrict}
The following non-conforming examples illustrate the restrictions on the \code{atomic}
construct.
\cexample{atomic_restrict}{1c}
\fexample{atomic_restrict}{1f}
\cexample{atomic_restrict}{2c}
\fortranspecificstart
The following example is non-conforming because \code{I} and \code{R} reference
the same location but have different types.
\fnexample{atomic_restrict}{2f}
Although the following example might work on some implementations, this is also
non-conforming:
\fnexample{atomic_restrict}{3f}
\fortranspecificend

View File

@ -1,24 +0,0 @@
\pagebreak
\chapter{Binding of \code{barrier} Regions}
\label{chap:barrier_regions}
The binding rules call for a \code{barrier} region to bind to the closest enclosing
\code{parallel} region.
In the following example, the call from the main program to \plc{sub2} is conforming
because the \code{barrier} region (in \plc{sub3}) binds to the \code{parallel}
region in \plc{sub2}. The call from the main program to \plc{sub1} is conforming
because the \code{barrier} region binds to the \code{parallel} region in subroutine
\plc{sub2}.
The call from the main program to \plc{sub3} is conforming because the \code{barrier}
region binds to the implicit inactive \code{parallel} region enclosing the sequential
part. Also note that the \code{barrier} region in \plc{sub3} when called from
\plc{sub2} only synchronizes the team of threads in the enclosing \code{parallel}
region and not all the threads created in \plc{sub1}.
\cexample{barrier_regions}{1c}
\fexample{barrier_regions}{1f}

View File

@ -1,42 +0,0 @@
\pagebreak
\chapter{Cancellation Constructs}
\label{chap:cancellation}
The following example shows how the \code{cancel} directive can be used to terminate
an OpenMP region. Although the \code{cancel} construct terminates the OpenMP
worksharing region, programmers must still track the exception through the pointer
\plc{ex} and issue a cancellation for the \code{parallel} region if an exception has
been raised. The master thread checks the exception pointer to make sure that the
exception is properly handled in the sequential part. If cancellation of the \code{parallel}
region has been requested, some threads might have executed \code{phase\_1()}.
However, it is guaranteed that none of the threads executed \code{phase\_2()}.
\cexample{cancellation}{1c}
The following example illustrates the use of the \code{cancel} construct in error
handling. If there is an error condition from the \code{allocate} statement,
the cancellation is activated. The encountering thread sets the shared variable
\code{err} and other threads of the binding thread set proceed to the end of
the worksharing construct after the cancellation has been activated.
\fexample{cancellation}{1f}
The following example shows how to cancel a parallel search on a binary tree as
soon as the search value has been detected. The code creates a task to descend
into the child nodes of the current tree node. If the search value has been found,
the code remembers the tree node with the found value through an \code{atomic}
write to the result variable and then cancels execution of all search tasks. The
function \code{search\_tree\_parallel} groups all search tasks into a single
task group to control the effect of the \code{cancel taskgroup} directive. The
\plc{level} argument is used to create undeferred tasks after the first ten
levels of the tree.
\cexample{cancellation}{2c}
The following is the equivalent parallel search example in Fortran.
\fexample{cancellation}{2f}

View File

@ -1,37 +0,0 @@
\pagebreak
\chapter{C/C++ Arrays in a \code{firstprivate} Clause}
\ccppspecificstart
\label{chap:carrays_fpriv}
The following example illustrates the size and value of list items of array or
pointer type in a \code{firstprivate} clause. The size of new list items is
based on the type of the corresponding original list item, as determined by the
base language.
In this example:
\begin{compactitem}
\item The type of \code{A} is array of two arrays of two ints.
\item The type of \code{B} is adjusted to pointer to array of \code{n}
ints, because it is a function parameter.
\item The type of \code{C} is adjusted to pointer to int, because
it is a function parameter.
\item The type of \code{D} is array of two arrays of two ints.
\item The type of \code{E} is array of \code{n} arrays of \code{n}
ints.
\end{compactitem}
Note that \code{B} and \code{E} involve variable length array types.
The new items of array type are initialized as if each integer element of the original
array is assigned to the corresponding element of the new array. Those of pointer
type are initialized as if by assignment from the original item to the new item.
\cnexample{carrays_fpriv}{1c}
\ccppspecificend

View File

@ -1,78 +0,0 @@
\pagebreak
\chapter{The \code{collapse} Clause}
\label{chap:collapse}
In the following example, the \code{k} and \code{j} loops are associated with
the loop construct. So the iterations of the \code{k} and \code{j} loops are
collapsed into one loop with a larger iteration space, and that loop is then divided
among the threads in the current team. Since the \code{i} loop is not associated
with the loop construct, it is not collapsed, and the \code{i} loop is executed
sequentially in its entirety in every iteration of the collapsed \code{k} and
\code{j} loop.
The variable \code{j} can be omitted from the \code{private} clause when the
\code{collapse} clause is used since it is implicitly private. However, if the
\code{collapse} clause is omitted then \code{j} will be shared if it is omitted
from the \code{private} clause. In either case, \code{k} is implicitly private
and could be omitted from the \code{private} clause.
\cexample{collapse}{1c}
\fexample{collapse}{1f}
In the next example, the \code{k} and \code{j} loops are associated with the
loop construct. So the iterations of the \code{k} and \code{j} loops are collapsed
into one loop with a larger iteration space, and that loop is then divided among
the threads in the current team.
The sequential execution of the iterations in the \code{k} and \code{j} loops
determines the order of the iterations in the collapsed iteration space. This implies
that in the sequentially last iteration of the collapsed iteration space, \code{k}
will have the value \code{2} and \code{j} will have the value \code{3}. Since
\code{klast} and \code{jlast} are \code{lastprivate}, their values are assigned
by the sequentially last iteration of the collapsed \code{k} and \code{j} loop.
This example prints: \code{2 3}.
\cexample{collapse}{2c}
\fexample{collapse}{2f}
The next example illustrates the interaction of the \code{collapse} and \code{ordered}
clauses.
In the example, the loop construct has both a \code{collapse} clause and an \code{ordered}
clause. The \code{collapse} clause causes the iterations of the \code{k} and
\code{j} loops to be collapsed into one loop with a larger iteration space, and
that loop is divided among the threads in the current team. An \code{ordered}
clause is added to the loop construct, because an ordered region binds to the loop
region arising from the loop construct.
According to Section 2.12.8 of the OpenMP 4.0 specification,
a thread must not execute more than one ordered region that binds
to the same loop region. So the \code{collapse} clause is required for the example
to be conforming. With the \code{collapse} clause, the iterations of the \code{k}
and \code{j} loops are collapsed into one loop, and therefore only one ordered
region will bind to the collapsed \code{k} and \code{j} loop. Without the \code{collapse}
clause, there would be two ordered regions that bind to each iteration of the \code{k}
loop (one arising from the first iteration of the \code{j} loop, and the other
arising from the second iteration of the \code{j} loop).
The code prints
\code{0 1 1}
\\
\code{0 1 2}
\\
\code{0 2 1}
\\
\code{1 2 2}
\\
\code{1 3 1}
\\
\code{1 3 2}
\cexample{collapse}{3c}
\fexample{collapse}{3f}

View File

@ -1,21 +0,0 @@
\pagebreak
\chapter{Conditional Compilation}
\label{chap:cond_comp}
\ccppspecificstart
The following example illustrates the use of conditional compilation using the
OpenMP macro \code{\_OPENMP}. With OpenMP compilation, the \code{\_OPENMP}
macro becomes defined.
\cnexample{cond_comp}{1c}
\ccppspecificend
\fortranspecificstart
The following example illustrates the use of the conditional compilation sentinel.
With OpenMP compilation, the conditional compilation sentinel \code{!\$} is recognized
and treated as two spaces. In fixed form source, statements guarded by the sentinel
must start after column 6.
\fnexample{cond_comp}{1f}
\fortranspecificend

View File

@ -1,13 +0,0 @@
\pagebreak
\chapter{The \code{copyin} Clause}
\label{chap:copyin}
The \code{copyin} clause is used to initialize threadprivate data upon entry
to a \code{parallel} region. The value of the threadprivate variable in the master
thread is copied to the threadprivate variable of each other team member.
\cexample{copyin}{1c}
\fexample{copyin}{1f}

View File

@ -1,51 +0,0 @@
\pagebreak
\chapter{The \code{copyprivate} Clause}
\label{chap:copyprivate}
The \code{copyprivate} clause can be used to broadcast values acquired by a single
thread directly to all instances of the private variables in the other threads.
In this example, if the routine is called from the sequential part, its behavior
is not affected by the presence of the directives. If it is called from a \code{parallel}
region, then the actual arguments with which \code{a} and \code{b} are associated
must be private.
The thread that executes the structured block associated with the \code{single}
construct broadcasts the values of the private variables \code{a}, \code{b},
\code{x}, and
\code{y} from its implicit task's data environment to the data environments
of the other implicit tasks in the thread team. The broadcast completes before
any of the threads have left the barrier at the end of the construct.
\cexample{copyprivate}{1c}
\fexample{copyprivate}{1f}
In this example, assume that the input must be performed by the master thread.
Since the \code{master} construct does not support the \code{copyprivate} clause,
it cannot broadcast the input value that is read. However, \code{copyprivate}
is used to broadcast an address where the input value is stored.
\cexample{copyprivate}{2c}
\fexample{copyprivate}{2f}
Suppose that the number of lock variables required within a \code{parallel} region
cannot easily be determined prior to entering it. The \code{copyprivate} clause
can be used to provide access to shared lock variables that are allocated within
that \code{parallel} region.
\cexample{copyprivate}{3c}
\fortranspecificstart
\fnexample{copyprivate}{3f}
Note that the effect of the \code{copyprivate} clause on a variable with the
\code{allocatable} attribute is different than on a variable with the \code{pointer}
attribute. The value of \code{A} is copied (as if by intrinsic assignment) and
the pointer \code{B} is copied (as if by pointer assignment) to the corresponding
list items in the other implicit tasks belonging to the \code{parallel} region.
\fnexample{copyprivate}{4f}
\fortranspecificend

View File

@ -1,16 +0,0 @@
\pagebreak
\chapter{The \code{critical} Construct}
\label{chap:critical}
The following example includes several \code{critical} constructs. The example
illustrates a queuing model in which a task is dequeued and worked on. To guard
against multiple threads dequeuing the same task, the dequeuing operation must
be in a \code{critical} region. Because the two queues in this example are independent,
they are protected by \code{critical} constructs with different names, \plc{xaxis}
and \plc{yaxis}.
\cexample{critical}{1c}
\fexample{critical}{1f}

View File

@ -1,113 +0,0 @@
\pagebreak
\chapter{\code{declare} \code{target} Construct}
\label{chap:declare_target}
\section{\code{declare} \code{target} and \code{end} \code{declare} \code{target} for a Function}
The following example shows how the \code{declare} \code{target} directive
is used to indicate that the corresponding call inside a \code{target} region
is to a \code{fib} function that can execute on the default target device.
A version of the function is also available on the host device. When the \code{if}
clause conditional expression on the \code{target} construct evaluates to \plc{false},
the \code{target} region (thus \code{fib}) will execute on the host device.
For C/C++ codes the declaration of the function \code{fib} appears between the \code{declare}
\code{target} and \code{end} \code{declare} \code{target} directives.
\cexample{declare_target}{1c}
The Fortran \code{fib} subroutine contains a \code{declare} \code{target} declaration
to indicate to the compiler to create a device executable version of the procedure.
The subroutine name has not been included on the \code{declare} \code{target}
directive and is, therefore, implicitly assumed.
The program uses the \code{module\_fib} module, which presents an explicit interface to
the compiler with the \code{declare} \code{target} declarations for processing
the \code{fib} call.
\fexample{declare_target}{1f}
The next Fortran example shows the use of an external subroutine. Without an explicit
interface (through module use or an interface block) the \code{declare} \code{target}
declarations within an external subroutine are unknown to the main program unit;
therefore, a \code{declare} \code{target} must be provided within the program
scope for the compiler to determine that a target binary should be available.
\fexample{declare_target}{2f}
\section{\code{declare} \code{target} Construct for Class Type}
\cppspecificstart
The following example shows how the \code{declare} \code{target} and \code{end}
\code{declare} \code{target} directives are used to enclose the declaration
of a variable \plc{varY} with a class type \code{typeY}. The member function \code{typeY::foo()} cannot
be accessed on a target device because its declaration did not appear between \code{declare}
\code{target} and \code{end} \code{declare} \code{target} directives.
\cnexample{declare_target}{2c}
\cppspecificend
\section{\code{declare} \code{target} and \code{end} \code{declare} \code{target} for Variables}
The following examples show how the \code{declare} \code{target} and \code{end}
\code{declare} \code{target} directives are used to indicate that global variables
are mapped to the implicit device data environment of each target device.
In the following example, the declarations of the variables \plc{p}, \plc{v1}, and \plc{v2} appear
between \code{declare} \code{target} and \code{end} \code{declare} \code{target}
directives indicating that the variables are mapped to the implicit device data
environment of each target device. The \code{target} \code{update} directive
is then used to manage the consistency of the variables \plc{p}, \plc{v1}, and \plc{v2} between the
data environment of the encountering host device task and the implicit device data
environment of the default target device.
\cexample{declare_target}{3c}
The Fortran version of the above C code uses a different syntax. Fortran modules
use a list syntax on the \code{declare} \code{target} directive to declare
mapped variables.
\fexample{declare_target}{3f}
The following example also indicates that the function \code{Pfun()} is available on the
target device, as well as the variable \plc{Q}, which is mapped to the implicit device
data environment of each target device. The \code{target} \code{update} directive
is then used to manage the consistency of the variable \plc{Q} between the data environment
of the encountering host device task and the implicit device data environment of
the default target device.
In the following example, the function and variable declarations appear between
the \code{declare} \code{target} and \code{end} \code{declare} \code{target}
directives.
\cexample{declare_target}{4c}
The Fortran version of the above C code uses a different syntax. In Fortran modules
a list syntax on the \code{declare} \code{target} directive is used to declare
mapped variables and procedures. The \plc{N} and \plc{Q} variables are declared as a comma
separated list. When the \code{declare} \code{target} directive is used to
declare just the procedure, the procedure name need not be listed -- it is implicitly
assumed, as illustrated in the \code{Pfun()} function.
\fexample{declare_target}{4f}
\section{\code{declare} \code{target} and \code{end} \code{declare} \code{target} with \code{declare} \code{simd}}
The following example shows how the \code{declare} \code{target} and \code{end}
\code{declare} \code{target} directives are used to indicate that a function
is available on a target device. The \code{declare} \code{simd} directive indicates
that there is a SIMD version of the function \code{P()} that is available on the target
device as well as one that is available on the host device.
\cexample{declare_target}{5c}
The Fortran version of the above C code uses a different syntax. Fortran modules
use a list syntax of the \code{declare} \code{target} declaration for the mapping.
Here the \plc{N} and \plc{Q} variables are declared in the list form as a comma separated list.
The function declaration does not use a list and implicitly assumes the function
name. In this Fortran example row and column indices are reversed relative to the
C/C++ example, as is usual for codes optimized for memory access.
\fexample{declare_target}{5f}

View File

@ -1,19 +0,0 @@
\pagebreak
\chapter{The \code{default(none)} Clause}
\label{chap:default_none}
The following example distinguishes the variables that are affected by the \code{default(none)}
clause from those that are not.
\ccppspecificstart
Beginning with OpenMP 4.0, variables with \code{const}-qualified type and no mutable member
are no longer predetermined shared. Thus, these variables (variable \plc{c} in the example)
need to be explicitly listed
in data-sharing attribute clauses when the \code{default(none)} clause is specified.
\cnexample{default_none}{1c}
\ccppspecificend
\fexample{default_none}{1f}

View File

@ -1,35 +0,0 @@
\pagebreak
\chapter{Device Routines}
\label{chap:device}
\section{\code{omp\_is\_initial\_device} Routine}
The following example shows how the \code{omp\_is\_initial\_device} runtime library routine
can be used to query if a code is executing on the initial host device or on a
target device. The example then sets the number of threads in the \code{parallel}
region based on where the code is executing.
\cexample{device}{1c}
\fexample{device}{1f}
\section{\code{omp\_get\_num\_devices} Routine}
The following example shows how the \code{omp\_get\_num\_devices} runtime library routine
can be used to determine the number of devices.
\cexample{device}{2c}
\fexample{device}{2f}
\section{\code{omp\_set\_default\_device} and \\
\code{omp\_get\_default\_device} Routines}
The following example shows how the \code{omp\_set\_default\_device} and \code{omp\_get\_default\_device}
runtime library routines can be used to set the default device and determine the
default device, respectively.
\cexample{device}{3c}
\fexample{device}{3f}

View File

@ -1,12 +0,0 @@
\pagebreak
\chapter{The \code{flush} Construct without a List}
\label{chap:flush_nolist}
The following example distinguishes the shared variables affected by a \code{flush}
construct with no list from the shared objects that are not affected:
\cexample{flush_nolist}{1c}
\fexample{flush_nolist}{1f}

View File

@ -1,19 +0,0 @@
\pagebreak
\chapter{Fortran Restrictions on the \code{do} Construct}
\label{chap:fort_do}
\fortranspecificstart
If an \code{end do} directive follows a \plc{do-construct} in which several
\code{DO} statements share a \code{DO} termination statement, then a \code{do}
directive can only be specified for the outermost of these \code{DO} statements.
The following example contains correct usages of loop constructs:
\fnexample{fort_do}{1f}
The following example is non-conforming because the matching \code{do} directive
for the \code{end do} does not precede the outermost loop:
\fnexample{fort_do}{2f}
\fortranspecificend

View File

@ -1,23 +0,0 @@
\pagebreak
\chapter{Fortran Private Loop Iteration Variables}
\label{chap:fort_loopvar}
\fortranspecificstart
In general, loop iteration variables will be private when used in the \plc{do-loop}
of a \code{do} or \code{parallel do} construct or in sequential loops in a
\code{parallel} construct (see Section 2.7.1 and Section 2.14.1 of
the OpenMP 4.0 specification). In the following example of a sequential
loop in a \code{parallel} construct the loop iteration variable \plc{I} will
be private.
\fnexample{fort_loopvar}{1f}
In exceptional cases, loop iteration variables can be made shared, as in the following
example:
\fnexample{fort_loopvar}{2f}
Note however that the use of shared loop iteration variables can easily lead to
race conditions.
\fortranspecificend

View File

@ -1,23 +0,0 @@
\pagebreak
\chapter{Fortran Restrictions on Storage Association with the \code{private} Clause}
\fortranspecificstart
\label{chap:fort_sa_private}
The following non-conforming examples illustrate the implications of the \code{private}
clause rules with regard to storage association.
\fnexample{fort_sa_private}{1f}
\fnexample{fort_sa_private}{2f}
% blue line floater at top of this page for "Fortran, cont."
\begin{figure}[t!]
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
\end{figure}
\fnexample{fort_sa_private}{3f}
\fnexample{fort_sa_private}{4f}
\fnexample{fort_sa_private}{5f}
\fortranspecificend

View File

@ -1,38 +0,0 @@
\pagebreak
\chapter{Fortran Restrictions on \code{shared} and \code{private} Clauses with Common Blocks}
\fortranspecificstart
\label{chap:fort_sp_common}
When a named common block is specified in a \code{private}, \code{firstprivate},
or \code{lastprivate} clause of a construct, none of its members may be declared
in another data-sharing attribute clause on that construct. The following examples
illustrate this point.
The following example is conforming:
\fnexample{fort_sp_common}{1f}
The following example is also conforming:
\fnexample{fort_sp_common}{2f}
% blue line floater at top of this page for "Fortran, cont."
\begin{figure}[t!]
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
\end{figure}
The following example is conforming:
\fnexample{fort_sp_common}{3f}
The following example is non-conforming because \code{x} is a constituent element
of \code{c}:
\fnexample{fort_sp_common}{4f}
The following example is non-conforming because a common block may not be declared
both shared and private:
\fnexample{fort_sp_common}{5f}
\fortranspecificend

View File

@ -1,18 +0,0 @@
\pagebreak
\chapter{The \code{firstprivate} Clause and the \code{sections} Construct}
\label{chap:fpriv_sections}
In the following example of the \code{sections} construct the \code{firstprivate}
clause is used to initialize the private copy of \code{section\_count} of each
thread. The problem is that the \code{section} constructs modify \code{section\_count},
which breaks the independence of the \code{section} constructs. When different
threads execute each section, both sections will print the value 1. When the same
thread executes the two sections, one section will print the value 1 and the other
will print the value 2. Since the order of execution of the two sections in this
case is unspecified, it is unspecified which section prints which value.
\cexample{fpriv_sections}{1c}
\fexample{fpriv_sections}{1f}

View File

@ -1,21 +0,0 @@
\pagebreak
\chapter{The \code{omp\_get\_num\_threads} Routine}
\label{chap:get_nthrs}
In the following example, the \code{omp\_get\_num\_threads} call returns 1 in
the sequential part of the code, so \code{np} will always be equal to 1. To determine
the number of threads that will be deployed for the \code{parallel} region, the
call should be inside the \code{parallel} region.
\cexample{get_nthrs}{1c}
\fexample{get_nthrs}{1f}
The following example shows how to rewrite this program without including a query
for the number of threads:
\cexample{get_nthrs}{2c}
\fexample{get_nthrs}{2f}

View File

@ -1,56 +0,0 @@
\pagebreak
\chapter{Internal Control Variables (ICVs)}
\label{chap:icv}
According to Section 2.3 of the OpenMP 4.0 specification, an OpenMP implementation must act as if there are ICVs that control
the behavior of the program. This example illustrates two ICVs, \plc{nthreads-var}
and \plc{max-active-levels-var}. The \plc{nthreads-var} ICV controls the
number of threads requested for encountered parallel regions; there is one copy
of this ICV per task. The \plc{max-active-levels-var} ICV controls the maximum
number of nested active parallel regions; there is one copy of this ICV for the
whole program.
In the following example, the \plc{nest-var}, \plc{max-active-levels-var},
\plc{dyn-var}, and \plc{nthreads-var} ICVs are modified through calls to
the runtime library routines \code{omp\_set\_nested},\\ \code{omp\_set\_max\_active\_levels},
\code{omp\_set\_dynamic}, and \code{omp\_set\_num\_threads}, respectively. These ICVs
affect the operation of \code{parallel} regions. Each implicit task generated
by a \code{parallel} region has its own copy of the \plc{nest-var}, \plc{dyn-var},
and \plc{nthreads-var} ICVs.
In the following example, the new value of \plc{nthreads-var} applies only to
the implicit tasks that execute the call to \code{omp\_set\_num\_threads}. There
is one copy of the \plc{max-active-levels-var} ICV for the whole program and
its value is the same for all tasks. This example assumes that nested parallelism
is supported.
The outer \code{parallel} region creates a team of two threads; each of the threads
will execute one of the two implicit tasks generated by the outer \code{parallel}
region.
Each implicit task generated by the outer \code{parallel} region calls \code{omp\_set\_num\_threads(3)},
assigning the value 3 to its respective copy of \plc{nthreads-var}. Then each
implicit task encounters an inner \code{parallel} region that creates a team
of three threads; each of the threads will execute one of the three implicit tasks
generated by that inner \code{parallel} region.
Since the outer \code{parallel} region is executed by 2 threads, and the inner
by 3, there will be a total of 6 implicit tasks generated by the two inner \code{parallel}
regions.
Each implicit task generated by an inner \code{parallel} region will execute
the call to\\ \code{omp\_set\_num\_threads(4)}, assigning the value 4 to its respective
copy of \plc{nthreads-var}.
The print statement in the outer \code{parallel} region is executed by only one
of the threads in the team. So it will be executed only once.
The print statement in an inner \code{parallel} region is also executed by only
one of the threads in the team. Since we have a total of two inner \code{parallel}
regions, the print statement will be executed twice -- once per inner \code{parallel}
region.
\cexample{icv}{1c}
\fexample{icv}{1f}

View File

@ -1,11 +0,0 @@
\pagebreak
\chapter{The \code{omp\_init\_lock} Routine}
\label{chap:init_lock}
The following example demonstrates how to initialize an array of locks in a \code{parallel}
region by using \code{omp\_init\_lock}.
\cexample{init_lock}{1c}
\fexample{init_lock}{1f}

View File

@ -1,14 +0,0 @@
\pagebreak
\chapter{The \code{lastprivate} Clause}
\label{chap:lastprivate}
Correct execution sometimes depends on the value that the last iteration of a loop
assigns to a variable. Such programs must list all such variables in a \code{lastprivate}
clause so that the values of the variables are the same as when the loop is executed
sequentially.
\cexample{lastprivate}{1c}
\fexample{lastprivate}{1f}

View File

@ -1,23 +0,0 @@
\pagebreak
\chapter{Ownership of Locks}
\label{chap:lock_owner}
Ownership of locks has changed since OpenMP 2.5. In OpenMP 2.5, locks are owned
by threads; so a lock released by the \code{omp\_unset\_lock} routine must be
owned by the same thread executing the routine. Beginning with OpenMP 3.0, locks are owned
by task regions; so a lock released by the \code{omp\_unset\_lock} routine in
a task region must be owned by the same task region.
This change in ownership requires extra care when using locks. The following program
is conforming in OpenMP 2.5 because the thread that releases the lock \code{lck}
in the parallel region is the same thread that acquired the lock in the sequential
part of the program (the master thread of the parallel region and the initial thread are
the same). However, it is not conforming beginning with OpenMP 3.0, because the task
region that releases the lock \code{lck} is different from the task region that
acquires the lock.
\cexample{lock_owner}{1c}
\fexample{lock_owner}{1f}

View File

@ -1,13 +0,0 @@
\pagebreak
\chapter{The \code{master} Construct}
\label{chap:master}
The following example demonstrates the \code{master} construct. In the example, the master
keeps track of how many iterations have been executed and prints out a progress
report. The other threads skip the master region without waiting.
\cexample{master}{1c}
\fexample{master}{1f}

View File

@ -1,38 +0,0 @@
\pagebreak
\chapter{The OpenMP Memory Model}
\label{chap:mem_model}
In the following example, at Print 1, the value of \plc{x} could be either 2
or 5, depending on the timing of the threads, and the implementation of the assignment
to \plc{x}. There are two reasons that the value at Print 1 might not be 5.
First, Print 1 might be executed before the assignment to \plc{x} is executed.
Second, even if Print 1 is executed after the assignment, the value 5 is not guaranteed
to be seen by thread 1 because a flush may not have been executed by thread 0 since
the assignment.
The barrier after Print 1 contains implicit flushes on all threads, as well as
a thread synchronization, so the programmer is guaranteed that the value 5 will
be printed by both Print 2 and Print 3.
\cexample{mem_model}{1c}
\fexample{mem_model}{1f}
The following example demonstrates why synchronization is difficult to perform
correctly through variables. The value of \plc{flag} is undefined in both prints on thread
1 and the value of \plc{data} is only well-defined in the second print.
\cexample{mem_model}{2c}
\fexample{mem_model}{2f}
The next example demonstrates why synchronization is difficult to perform correctly
through variables. Because the \plc{write}(1)-\plc{flush}(1)-\plc{flush}(2)-\plc{read}(2)
sequence cannot be guaranteed in the example, the statements on thread 0 and thread
1 may execute in either order.
\cexample{mem_model}{3c}
\fexample{mem_model}{3f}

View File

@ -1,18 +0,0 @@
\pagebreak
\chapter{Nested Loop Constructs}
\label{chap:nested_loop}
The following example of loop construct nesting is conforming because the inner
and outer loop regions bind to different \code{parallel} regions:
\cexample{nested_loop}{1c}
\fexample{nested_loop}{1f}
The following variation of the preceding example is also conforming:
\cexample{nested_loop}{2c}
\fexample{nested_loop}{2f}

View File

@ -1,52 +0,0 @@
\pagebreak
\chapter{Restrictions on Nesting of Regions}
\label{chap:nesting_restrict}
The examples in this section illustrate the region nesting rules.
The following example is non-conforming because the inner and outer loop regions
are closely nested:
\cexample{nesting_restrict}{1c}
\fexample{nesting_restrict}{1f}
The following orphaned version of the preceding example is also non-conforming:
\cexample{nesting_restrict}{2c}
\fexample{nesting_restrict}{2f}
The following example is non-conforming because the loop and \code{single} regions
are closely nested:
\cexample{nesting_restrict}{3c}
\fexample{nesting_restrict}{3f}
The following example is non-conforming because a \code{barrier} region cannot
be closely nested inside a loop region:
\cexample{nesting_restrict}{4c}
\fexample{nesting_restrict}{4f}
The following example is non-conforming because the \code{barrier} region cannot
be closely nested inside the \code{critical} region. If this were permitted,
it would result in deadlock due to the fact that only one thread at a time can
enter the \code{critical} region:
\cexample{nesting_restrict}{5c}
\fexample{nesting_restrict}{5f}
The following example is non-conforming because the \code{barrier} region cannot
be closely nested inside the \code{single} region. If this were permitted, it
would result in deadlock due to the fact that only one thread executes the \code{single}
region:
\cexample{nesting_restrict}{6c}
\fexample{nesting_restrict}{6f}

View File

@ -1,28 +0,0 @@
\pagebreak
\chapter{The \code{nowait} Clause}
\label{chap:nowait}
If there are multiple independent loops within a \code{parallel} region, you
can use the \code{nowait} clause to avoid the implied barrier at the end of the
loop construct, as follows:
\cexample{nowait}{1c}
\fexample{nowait}{1f}
In the following example, static scheduling distributes the same logical iteration
numbers to the threads that execute the three loop regions. This allows the \code{nowait}
clause to be used, even though there is a data dependence between the loops. The
dependence is satisfied as long as the same thread executes the same logical iteration
numbers in each loop.
Note that the iteration count of the loops must be the same. The example satisfies
this requirement, since the iteration space of the first two loops is from \code{0}
to \code{n-1} (from \code{1} to \code{N} in the Fortran version), while the
iteration space of the last loop is from \code{1} to \code{n} (\code{2} to
\code{N+1} in the Fortran version).
\cexample{nowait}{2c}
\fexample{nowait}{2f}

View File

@ -1,30 +0,0 @@
\pagebreak
\chapter{Interaction Between the \code{num\_threads} Clause and \code{omp\_set\_dynamic}}
\label{chap:nthrs_dynamic}
The following example demonstrates the \code{num\_threads} clause and the effect
of the \\
\code{omp\_set\_dynamic} routine on it.
The call to the \code{omp\_set\_dynamic} routine with argument \code{0} in
C/C++, or \code{.FALSE.} in Fortran, disables the dynamic adjustment of the number
of threads in OpenMP implementations that support it. In this case, 10 threads
are provided. Note that in case of an error the OpenMP implementation is free to
abort the program or to supply any number of threads available.
\cexample{nthrs_dynamic}{1c}
\fexample{nthrs_dynamic}{1f}
The call to the \code{omp\_set\_dynamic} routine with a non-zero argument in
C/C++, or \code{.TRUE.} in Fortran, allows the OpenMP implementation to choose
any number of threads between 1 and 10.
\cexample{nthrs_dynamic}{2c}
\fexample{nthrs_dynamic}{2f}
It is good practice to set the \plc{dyn-var} ICV explicitly by calling the \code{omp\_set\_dynamic}
routine, as its default setting is implementation defined.

View File

@ -1,12 +0,0 @@
\pagebreak
\chapter{Controlling the Number of Threads on Multiple Nesting Levels}
\label{chap:nthrs_nesting}
The following examples demonstrate how to use the \code{OMP\_NUM\_THREADS} environment
variable to control the number of threads on multiple nesting levels:
\cexample{nthrs_nesting}{1c}
\fexample{nthrs_nesting}{1f}

View File

@ -1,28 +0,0 @@
\pagebreak
\chapter{The \code{ordered} Clause and the \code{ordered} Construct}
\label{chap:ordered}
Ordered constructs are useful for sequentially ordering the output from work that
is done in parallel. The following program prints out the indices in sequential
order:
\cexample{ordered}{1c}
\fexample{ordered}{1f}
It is possible to have multiple \code{ordered} constructs within a loop region
with the \code{ordered} clause specified. The first example is non-conforming
because all iterations execute two \code{ordered} regions. An iteration of a
loop must not execute more than one \code{ordered} region:
\cexample{ordered}{2c}
\fexample{ordered}{2f}
The following is a conforming example with more than one \code{ordered} construct.
Each iteration will execute only one \code{ordered} region:
\cexample{ordered}{3c}
\fexample{ordered}{3f}

View File

@ -1,12 +0,0 @@
\pagebreak
\chapter{The \code{parallel} Construct}
\label{chap:parallel}
The \code{parallel} construct can be used in coarse-grain parallel programs.
In the following example, each thread in the \code{parallel} region decides what
part of the global array \plc{x} to work on, based on the thread number:
\cexample{parallel}{1c}
\fexample{parallel}{1f}

View File

@ -1,11 +0,0 @@
\chapter{A Simple Parallel Loop}
\label{chap:ploop}
The following example demonstrates how to parallelize a simple loop using the parallel
loop construct. The loop iteration variable is private by default, so it is not
necessary to specify it explicitly in a \code{private} clause.
\cexample{ploop}{1c}
\fexample{ploop}{1f}

View File

@ -1,11 +0,0 @@
\pagebreak
\chapter{Parallel Random Access Iterator Loop}
\cppspecificstart
\label{chap:pra_iterator}
The following example shows a parallel random access iterator loop.
\cnexample{pra_iterator}{1c}
\cppspecificend

View File

@ -1,31 +0,0 @@
\pagebreak
\chapter{The \code{private} Clause}
\label{chap:private}
In the following example, the values of original list items \plc{i} and \plc{j}
are retained on exit from the \code{parallel} region, while the private list
items \plc{i} and \plc{j} are modified within the \code{parallel} construct.
\cexample{private}{1c}
\fexample{private}{1f}
In the following example, all uses of the variable \plc{a} within the loop construct
in the routine \plc{f} refer to a private list item \plc{a}, while it is
unspecified whether references to \plc{a} in the routine \plc{g} are to a
private list item or the original list item.
\cexample{private}{2c}
\fexample{private}{2f}
The following example demonstrates that a list item that appears in a \code{private}
clause in a \code{parallel} construct may also appear in a \code{private}
clause in an enclosed worksharing construct, which results in an additional private
copy.
\cexample{private}{3c}
\fexample{private}{3f}

View File

@ -1,13 +0,0 @@
\pagebreak
\chapter{The \code{parallel} \code{sections} Construct}
\label{chap:psections}
In the following example routines \code{XAXIS}, \code{YAXIS}, and \code{ZAXIS} can
be executed concurrently. The first \code{section} directive is optional. Note
that all \code{section} directives need to appear in the \code{parallel sections}
construct.
\cexample{psections}{1c}
\fexample{psections}{1f}

View File

@ -1,60 +0,0 @@
\pagebreak
\chapter{The \code{reduction} Clause}
\label{chap:reduction}
The following example demonstrates the \code{reduction} clause; note that some
reductions can be expressed in the loop in several ways, as shown for the \code{max}
and \code{min} reductions below:
\cexample{reduction}{1c}
\fexample{reduction}{1f}
A common implementation of the preceding example is to treat it as if it had been
written as follows:
\cexample{reduction}{2c}
\fortranspecificstart
\fnexample{reduction}{2f}
The following program is non-conforming because the reduction is on the
\emph{intrinsic procedure name} \code{MAX} but that name has been redefined to be the variable
named \code{MAX}.
% blue line floater at top of this page for "Fortran, cont."
\begin{figure}[t!]
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
\end{figure}
\fnexample{reduction}{3f}
The following conforming program performs the reduction using the
\emph{intrinsic procedure name} \code{MAX} even though the intrinsic \code{MAX} has been renamed
to \code{REN}.
\fnexample{reduction}{4f}
The following conforming program performs the reduction using the
\emph{intrinsic procedure name} \code{MAX} even though the intrinsic \code{MAX} has been renamed
to \code{MIN}.
\fnexample{reduction}{5f}
\fortranspecificend
The following example is non-conforming because the initialization (\code{a =
0}) of the original list item \code{a} is not synchronized with the update of
\code{a} as a result of the reduction computation in the \code{for} loop. Therefore,
the example may print an incorrect value for \code{a}.
To avoid this problem, the initialization of the original list item \code{a}
should complete before any update of \code{a} as a result of the \code{reduction}
clause. This can be achieved by adding an explicit barrier after the assignment
\code{a = 0}, or by enclosing the assignment \code{a = 0} in a \code{single}
directive (which has an implied barrier), or by initializing \code{a} before
the start of the \code{parallel} region.
\cexample{reduction}{3c}
\fexample{reduction}{6f}

View File

@ -1,24 +0,0 @@
\pagebreak
\chapter{The \code{omp\_set\_dynamic} and \\
\code{omp\_set\_num\_threads} Routines}
\label{chap:set_dynamic_nthrs}
Some programs rely on a fixed, prespecified number of threads to execute correctly.
Because the default setting for the dynamic adjustment of the number of threads
is implementation defined, such programs can choose to turn off the dynamic threads
capability and set the number of threads explicitly to ensure portability. The
following example shows how to do this using \code{omp\_set\_dynamic} and \code{omp\_set\_num\_threads}.
In this example, the program executes correctly only if it is executed by 16 threads.
If the implementation is not capable of supporting 16 threads, the behavior of
this example is implementation defined. Note that the number of threads executing
a \code{parallel} region remains constant during the region, regardless of the
dynamic threads setting. The dynamic threads mechanism determines the number of
threads to use at the start of the \code{parallel} region and keeps it constant
for the duration of the region.
\cexample{set_dynamic_nthrs}{1c}
\fexample{set_dynamic_nthrs}{1f}

View File

@ -1,19 +0,0 @@
\pagebreak
\chapter{Simple Lock Routines}
\label{chap:simple_lock}
In the following example, the lock routines cause the threads to be idle while
waiting for entry to the first critical section, but to do other work while waiting
for entry to the second. The \code{omp\_set\_lock} function blocks, but the \code{omp\_test\_lock}
function does not, allowing the work in \code{skip} to be done.
Note that the argument to the lock routines should have type \code{omp\_lock\_t},
and that there is no need to flush it.
\cexample{simple_lock}{1c}
Note that there is no need to flush the lock variable.
\fexample{simple_lock}{1f}

View File

@ -1,18 +0,0 @@
\pagebreak
\chapter{The \code{single} Construct}
\label{chap:single}
The following example demonstrates the \code{single} construct. In the example,
only one thread prints each of the progress messages. All other threads will skip
the \code{single} region and stop at the barrier at the end of the \code{single}
construct until all threads in the team have reached the barrier. If other threads
can proceed without waiting for the thread executing the \code{single} region,
a \code{nowait} clause can be specified, as is done in the third \code{single}
construct in this example. The user must not make any assumptions as to which thread
will execute a \code{single} region.
\cexample{single}{1c}
\fexample{single}{1f}

View File

@ -1,31 +0,0 @@
\pagebreak
\chapter{Placement of \code{flush}, \code{barrier}, \code{taskwait}
and \code{taskyield} Directives}
\label{chap:standalone}
The following example is non-conforming, because the \code{flush}, \code{barrier},
\code{taskwait}, and \code{taskyield} directives are stand-alone directives
and cannot be the immediate substatement of an \code{if} statement.
\cexample{standalone}{1c}
The following example is non-conforming, because the \code{flush}, \code{barrier},
\code{taskwait}, and \code{taskyield} directives are stand-alone directives
and cannot be the action statement of an \code{if} statement or a labeled branch
target.
\fexample{standalone}{1f}
The following version of the above example is conforming because the \code{flush},
\code{barrier}, \code{taskwait}, and \code{taskyield} directives are enclosed
in a compound statement.
\cexample{standalone}{2c}
The following example is conforming because the \code{flush}, \code{barrier},
\code{taskwait}, and \code{taskyield} directives are enclosed in an \code{if}
construct or follow the labeled branch target.
\fexample{standalone}{2f}

View File

@ -1,96 +0,0 @@
\pagebreak
\chapter{\code{target} Construct}
\label{chap:target}
\section{\code{target} Construct on \code{parallel} Construct}
The following example shows how the \code{target} construct offloads a code
region to a target device. The variables \plc{p}, \plc{v1}, \plc{v2}, and \plc{N} are implicitly mapped
to the target device.
\cexample{target}{1c}
\fexample{target}{1f}
\section{\code{target} Construct with \code{map} Clause}
The following example shows how the \code{target} construct offloads a code
region to a target device. The variables \plc{p}, \plc{v1} and \plc{v2} are explicitly mapped to the
target device using the \code{map} clause. The variable \plc{N} is implicitly mapped to
the target device.
\cexample{target}{2c}
\fexample{target}{2f}
\section{\code{map} Clause with \code{to}/\code{from} map-types}
The following example shows how the \code{target} construct offloads a code region
to a target device. In the \code{map} clause, the \code{to} and \code{from}
map-types define the mapping between the original (host) data and the target (device)
data. The \code{to} map-type specifies that the data will only be read on the
device, and the \code{from} map-type specifies that the data will only be written
to on the device. By specifying a guaranteed access on the device, data transfers
can be reduced for the \code{target} region.
The \code{to} map-type indicates that at the start of the \code{target} region
the variables \plc{v1} and \plc{v2} are initialized with the values of the corresponding variables
on the host device, and at the end of the \code{target} region the variables
\plc{v1} and \plc{v2} are not assigned to their corresponding variables on the host device.
The \code{from} map-type indicates that at the start of the \code{target} region
the variable \plc{p} is not initialized with the value of the corresponding variable
on the host device, and at the end of the \code{target} region the variable \plc{p}
is assigned to the corresponding variable on the host device.
\cexample{target}{3c}
The \code{to} and \code{from} map-types allow programmers to optimize data
motion. Since data for the \plc{v} arrays are not returned, and data for the \plc{p} array
are not transferred to the device, only one-half of the data is moved, compared
to the default behavior of an implicit mapping.
\fexample{target}{3f}
\section{\code{map} Clause with Array Sections}
The following example shows how the \code{target} construct offloads a code region
to a target device. In the \code{map} clause, map-types are used to optimize
the mapping of variables to the target device. Because variables \plc{p}, \plc{v1} and \plc{v2} are
pointers, array section notation must be used to map the arrays. The notation \code{:N}
is equivalent to \code{0:N}.
\cexample{target}{4c}
In C, the length of the pointed-to array must be specified. In Fortran the extent
of the array is known and the length need not be specified. A section of the array
can be specified with the usual Fortran syntax, as shown in the following example.
The value 1 is assumed for the lower bound for array section \plc{v2(:N)}.
\fexample{target}{4f}
A more realistic situation in which an assumed-size array is passed to \code{vec\_mult}
requires that the length of the arrays be specified, because the compiler does
not know the size of the storage. A section of the array must be specified with
the usual Fortran syntax, as shown in the following example. The value 1 is assumed
for the lower bound for array section \plc{v2(:N)}.
\fexample{target}{4bf}
\section{\code{target} Construct with \code{if} Clause}
The following example shows how the \code{target} construct offloads a code region
to a target device.
The \code{if} clause on the \code{target} construct indicates that if the variable
\plc{N} is smaller than a given threshold, then the \code{target} region will be executed
by the host device.
The \code{if} clause on the \code{parallel} construct indicates that if the
variable \plc{N} is smaller than a second threshold then the \code{parallel} region
is inactive.
\cexample{target}{5c}
\fexample{target}{5f}

View File

@ -1,175 +0,0 @@
\pagebreak
\chapter{\code{target} \code{data} Construct}
\label{chap:target_data}
\section{Simple \code{target} \code{data} Construct}
This example shows how the \code{target} \code{data} construct maps variables
to a device data environment. The \code{target} \code{data} construct creates
a new device data environment and maps the variables \plc{v1}, \plc{v2}, and \plc{p} to the new device
data environment. The \code{target} construct enclosed in the \code{target}
\code{data} region creates a new device data environment, which inherits the
variables \plc{v1}, \plc{v2}, and \plc{p} from the enclosing device data environment. The variable
\plc{N} is mapped into the new device data environment from the encountering task's data
environment.
\cexample{target_data}{1c}
The Fortran code passes a reference and specifies the extent of the arrays in the
declaration. No length information is necessary in the map clause, as is required
with C/C++ pointers.
\fexample{target_data}{1f}
\section{\code{target} \code{data} Region Enclosing Multiple \code{target} Regions}
The following examples show how the \code{target} \code{data} construct maps
variables to a device data environment of a \code{target} region. The \code{target}
\code{data} construct creates a device data environment and encloses \code{target}
regions, which have their own device data environments. The device data environment
of the \code{target} \code{data} region is inherited by the device data environment
of an enclosed \code{target} region. The \code{target} \code{data} construct
is used to create variables that will persist throughout the \code{target} \code{data}
region.
In the following example the variables \plc{v1} and \plc{v2} are mapped at each \code{target}
construct. Instead of mapping the variable \plc{p} twice, once at each \code{target}
construct, \plc{p} is mapped once by the \code{target} \code{data} construct.
\cexample{target_data}{2c}
The Fortran code uses reference and specifies the extent of the \plc{p}, \plc{v1} and \plc{v2} arrays.
No length information is necessary in the \code{map} clause, as is required with
C/C++ pointers. The arrays \plc{v1} and \plc{v2} are mapped at each \code{target} construct.
Instead of mapping the array \plc{p} twice, once at each target construct, \plc{p} is mapped
once by the \code{target} \code{data} construct.
\fexample{target_data}{2f}
In the following example, the variable \plc{tmp} defaults to \code{tofrom} map-type
and is mapped at each \code{target} construct. The array \plc{Q} is mapped once at
the enclosing \code{target} \code{data} region instead of at each \code{target}
construct.
\cexample{target_data}{3c}
In the following example the arrays \plc{v1} and \plc{v2} are mapped at each \code{target}
construct. Instead of mapping the array \plc{Q} twice, once at each \code{target} construct,
\plc{Q} is mapped once by the \code{target} \code{data} construct. Note, the \plc{tmp}
variable is implicitly remapped for each \code{target} region, mapping the value
from the device to the host at the end of the first \code{target} region, and
from the host to the device for the second \code{target} region.
\fexample{target_data}{3f}
\section{\code{target} \code{data} Construct with Orphaned Call}
The following two examples show how the \code{target} \code{data} construct
maps variables to a device data environment. The \code{target} \code{data}
construct's device data environment encloses the \code{target} construct's device
data environment in the function \code{vec\_mult()}.
When the type of the variable appearing in an array section is pointer, the pointer
variable and the storage location of the corresponding array section are mapped
to the device data environment. The pointer variable is treated as if it had appeared
in a \code{map} clause with a map-type of \code{alloc}. The array section's
storage location is mapped according to the map-type in the \code{map} clause
(the default map-type is \code{tofrom}).
The \code{target} construct's device data environment inherits the storage locations
of the array sections \plc{v1[0:N]}, \plc{v2[:N]}, and \plc{p0[0:N]} from the enclosing \code{target} \code{data}
construct's device data environment. Neither initialization nor assignment is performed
for the array sections in the new device data environment.
The pointer variables \plc{p1}, \plc{v3}, and \plc{v4} are mapped into the target construct's device
data environment with an implicit map-type of \code{alloc} and they are assigned the address
of the storage location associated with their corresponding array sections. Note
that the following pairs of array section storage locations are equivalent (\plc{p0[:N]},
\plc{p1[:N]}), (\plc{v1[:N]},\plc{v3[:N]}), and (\plc{v2[:N]},\plc{v4[:N]}).
\cexample{target_data}{4c}
The Fortran code maps the pointers and storage in an identical manner (same extent,
but uses indices from 1 to \plc{N}).
The \code{target} construct's device data environment inherits the storage locations
of the arrays \plc{v1}, \plc{v2} and \plc{p0} from the enclosing \code{target} \code{data} construct's
device data environment. However, in Fortran the associated data of the pointer
is known, and the shape is not required.
The pointer variables \plc{p1}, \plc{v3}, and \plc{v4} are mapped into the \code{target} construct's
device data environment with an implicit map-type of \code{alloc} and they are
assigned the address of the storage location associated with their corresponding
array sections. Note that the following pairs of array storage locations are equivalent
(\plc{p0},\plc{p1}), (\plc{v1},\plc{v3}), and (\plc{v2},\plc{v4}).
\fexample{target_data}{4f}
In the following example, the variables \plc{p1}, \plc{v3}, and \plc{v4} are references to the pointer
variables \plc{p0}, \plc{v1} and \plc{v2} respectively. The \code{target} construct's device data
environment inherits the pointer variables \plc{p0}, \plc{v1}, and \plc{v2} from the enclosing \code{target}
\code{data} construct's device data environment. Thus, \plc{p1}, \plc{v3}, and \plc{v4} are already
present in the device data environment.
\cexample{target_data}{5c}
In the following example, the usual Fortran approach is used for dynamic memory.
The \plc{p0}, \plc{v1}, and \plc{v2} arrays are allocated in the main program and passed as references
from one routine to another. In \code{vec\_mult}, \plc{p1}, \plc{v3} and \plc{v4} are references to the
\plc{p0}, \plc{v1}, and \plc{v2} arrays, respectively. The \code{target} construct's device data
environment inherits the arrays \plc{p0}, \plc{v1}, and \plc{v2} from the enclosing target data construct's
device data environment. Thus, \plc{p1}, \plc{v3}, and \plc{v4} are already present in the device
data environment.
\fexample{target_data}{5f}
\section{\code{target} \code{data} Construct with \code{if} Clause}
The following two examples show how the \code{target} \code{data} construct
maps variables to a device data environment.
In the following example, the \code{if} clause on the \code{target} \code{data} construct
indicates that if the variable \plc{N} is smaller than a given threshold, then the \code{target}
\code{data} construct will not create a device data environment.
The \code{target} constructs enclosed in the \code{target} \code{data} region
must also use an \code{if} clause on the same condition, otherwise the pointer
variable \plc{p} is implicitly mapped with a map-type of \code{tofrom}, but the storage
location for the array section \plc{p[0:N]} will not be mapped in the device data environments
of the \code{target} constructs.
\cexample{target_data}{6c}
The \code{if} clauses work the same way for the following Fortran code. The \code{target}
constructs enclosed in the \code{target} \code{data} region should also use
an \code{if} clause with the same condition, so that the \code{target} \code{data}
region and the \code{target} region are either both created for the device, or
are both ignored.
\fexample{target_data}{6f}
In the following example, when the \code{if} clause conditional expression on
the \code{target} construct evaluates to \plc{false}, the target region will
execute on the host device. However, the \code{target} \code{data} construct
created an enclosing device data environment that mapped \plc{p[0:N]} to a device data
environment on the default device. At the end of the \code{target} \code{data}
region the array section \plc{p[0:N]} will be assigned from the device data environment
to the corresponding variable in the data environment of the task that encountered
the \code{target} \code{data} construct, resulting in undefined values in \plc{p[0:N]}.
\cexample{target_data}{7c}
The \code{if} clauses work the same way for the following Fortran code. When
the \code{if} clause conditional expression on the \code{target} construct
evaluates to \plc{false}, the \code{target} region will execute on the host
device. However, the \code{target} \code{data} construct created an enclosing
device data environment that mapped the \plc{p} array (and \plc{v1} and \plc{v2}) to a device data
environment on the default target device. At the end of the \code{target} \code{data}
region the \plc{p} array will be assigned from the device data environment to the corresponding
variable in the data environment of the task that encountered the \code{target}
\code{data} construct, resulting in undefined values in \plc{p}.
\fexample{target_data}{7f}

View File

@ -1,53 +0,0 @@
\pagebreak
\chapter{\code{target} \code{update} Construct}
\label{chap:target_update}
\section{Simple \code{target} \code{data} and \code{target} \code{update} Constructs}
The following example shows how the \code{target} \code{update} construct updates
variables in a device data environment.
The \code{target} \code{data} construct maps array sections \plc{v1[:N]} and \plc{v2[:N]}
(arrays \plc{v1} and \plc{v2} in the Fortran code) into a device data environment.
The task executing on the host device encounters the first \code{target} region
and waits for the completion of the region.
After the execution of the first \code{target} region, the task executing on
the host device then assigns new values to \plc{v1[:N]} and \plc{v2[:N]} (\plc{v1} and \plc{v2} arrays
in Fortran code) in the task's data environment by calling the function \code{init\_again()}.
The \code{target} \code{update} construct assigns the new values of \plc{v1} and
\plc{v2} from the task's data environment to the corresponding mapped array sections
in the device data environment of the \code{target} \code{data} construct.
The task executing on the host device then encounters the second \code{target}
region and waits for the completion of the region.
The second \code{target} region uses the updated values of \plc{v1[:N]} and \plc{v2[:N]}.
\cexample{target_update}{1c}
\fexample{target_update}{1f}
\section{\code{target} \code{update} Construct with \code{if} Clause}
The following example shows how the \code{target} \code{update} construct updates
variables in a device data environment.
The \code{target} \code{data} construct maps array sections \plc{v1[:N]} and \plc{v2[:N]}
(arrays \plc{v1} and \plc{v2} in the Fortran code) into a device data environment. In between
the two \code{target} regions, the task executing on the host device conditionally
assigns new values to \plc{v1} and \plc{v2} in the task's data environment. The function \code{maybe\_init\_again()}
returns \plc{true} if new data is written.
When the conditional expression (the return value of \code{maybe\_init\_again()}) in the
\code{if} clause is \plc{true}, the \code{target} \code{update} construct
assigns the new values of \plc{v1} and \plc{v2} from the task's data environment to the corresponding
mapped array sections in the \code{target} \code{data} construct's device data
environment.
\cexample{target_update}{2c}
\fexample{target_update}{2f}

View File

@ -1,72 +0,0 @@
\pagebreak
\chapter{Task Dependences}
\label{chap:task_dep}
\section{Flow Dependence}
In this example we show a simple flow dependence expressed using the \code{depend}
clause on the \code{task} construct.
\cexample{task_dep}{1c}
\fexample{task_dep}{1f}
The program will always print \texttt{"}x = 2\texttt{"}, because the \code{depend}
clauses enforce the ordering of the tasks. If the \code{depend} clauses had been
omitted, then the tasks could execute in any order and the program
would have a race condition.
\section{Anti-dependence}
In this example we show an anti-dependence expressed using the \code{depend}
clause on the \code{task} construct.
\cexample{task_dep}{2c}
\fexample{task_dep}{2f}
The program will always print \texttt{"}x = 1\texttt{"}, because the \code{depend}
clauses enforce the ordering of the tasks. If the \code{depend} clauses had been
omitted, then the tasks could execute in any order and the program would have a
race condition.
\section{Output Dependence}
In this example we show an output dependence expressed using the \code{depend}
clause on the \code{task} construct.
\cexample{task_dep}{3c}
\fexample{task_dep}{3f}
The program will always print \texttt{"}x = 2\texttt{"}, because the \code{depend}
clauses enforce the ordering of the tasks. If the \code{depend} clauses had been
omitted, then the tasks could execute in any order and the program would have a
race condition.
\section{Concurrent Execution with Dependences}
In this example we show potentially concurrent execution of tasks using multiple
flow dependences expressed using the \code{depend} clause on the \code{task}
construct.
\cexample{task_dep}{4c}
\fexample{task_dep}{4f}
The last two tasks are dependent on the first task. However, there is no dependence
between the last two tasks, which may execute in any order (or concurrently if
more than one thread is available). Thus, the possible outputs are \texttt{"}x
+ 1 = 3. x + 2 = 4. \texttt{"} and \texttt{"}x + 2 = 4. x + 1 = 3. \texttt{"}.
If the \code{depend} clauses had been omitted, then all of the tasks could execute
in any order and the program would have a race condition.
\section{Matrix multiplication}
This example shows a task-based blocked matrix multiplication. Matrices are of
NxN elements, and the multiplication is implemented using blocks of BSxBS elements.
\cexample{task_dep}{5c}
\fexample{task_dep}{5f}

View File

@ -1,20 +0,0 @@
\pagebreak
\chapter{The \code{taskgroup} Construct}
\label{chap:taskgroup}
In this example, tasks are grouped and synchronized using the \code{taskgroup}
construct.
Initially, one task (the task executing the \code{start\_background\_work()}
call) is created in the \code{parallel} region, and later a parallel tree traversal
is started (the task executing the root of the recursive \code{compute\_tree()}
calls). While synchronizing tasks at the end of each tree traversal, using the
\code{taskgroup} construct ensures that the formerly started background task
does not participate in the synchronization, and is left free to execute in parallel.
This is opposed to the behavior of the \code{taskwait} construct, which would
include the background tasks in the synchronization.
\cexample{taskgroup}{1c}
\fexample{taskgroup}{1f}

View File

@ -1,190 +0,0 @@
\pagebreak
\chapter{The \code{task} and \code{taskwait} Constructs}
\label{chap:tasking}
The following example shows how to traverse a tree-like structure using explicit
tasks. Note that the \code{traverse} function should be called from within a
parallel region for the different specified tasks to be executed in parallel. Also
note that the tasks will be executed in no specified order because there are no
synchronization directives. Thus, assuming that the traversal will be done in post
order, as in the sequential code, is wrong.
\cexample{tasking}{1c}
\fexample{tasking}{1f}
In the next example, we force a postorder traversal of the tree by adding a \code{taskwait}
directive. Now, we can safely assume that the left and right sons have been executed
before we process the current node.
\cexample{tasking}{2c}
\fexample{tasking}{2f}
The following example demonstrates how to use the \code{task} construct to process
elements of a linked list in parallel. The thread executing the \code{single}
region generates all of the explicit tasks, which are then executed by the threads
in the current team. The pointer \plc{p} is \code{firstprivate} by default
on the \code{task} construct so it is not necessary to specify it in a \code{firstprivate}
clause.
\cexample{tasking}{3c}
\fexample{tasking}{3f}
The \code{fib()} function should be called from within a \code{parallel} region
for the different specified tasks to be executed in parallel. Also, only one thread
of the \code{parallel} region should call \code{fib()} unless multiple concurrent
Fibonacci computations are desired.
\cexample{tasking}{4c}
\fexample{tasking}{4f}
Note: There are more efficient algorithms for computing Fibonacci numbers. This
classic recursion algorithm is for illustrative purposes.
The following example demonstrates a way to generate a large number of tasks with
one thread and execute them with the threads in the team. While generating these
tasks, the implementation may reach its limit on unassigned tasks. If it does,
the implementation is allowed to cause the thread executing the task generating
loop to suspend its task at the task scheduling point in the \code{task} directive,
and start executing unassigned tasks. Once the number of unassigned tasks is sufficiently
low, the thread may resume execution of the task generating loop.
\cexample{tasking}{5c}
\pagebreak
\fexample{tasking}{5f}
The following example is the same as the previous one, except that the tasks are
generated in an untied task. While generating the tasks, the implementation may
reach its limit on unassigned tasks. If it does, the implementation is allowed
to cause the thread executing the task generating loop to suspend its task at the
task scheduling point in the \code{task} directive, and start executing unassigned
tasks. If that thread begins execution of a task that takes a long time to complete,
the other threads may complete all the other tasks before it is finished.
In this case, since the loop is in an untied task, any other thread is eligible
to resume the task generating loop. In the previous examples, the other threads
would be forced to idle until the generating thread finishes its long task, since
the task generating loop was in a tied task.
\cexample{tasking}{6c}
\fexample{tasking}{6f}
The following two examples demonstrate how the scheduling rules illustrated in
Section 2.11.3 of the OpenMP 4.0 specification affect the usage of
\code{threadprivate} variables in tasks. A \code{threadprivate}
variable can be modified by another task that is executed by the same thread. Thus,
the value of a \code{threadprivate} variable cannot be assumed to be unchanged
across a task scheduling point. In untied tasks, task scheduling points may be
added in any place by the implementation.
A task switch may occur at a task scheduling point. A single thread may execute
both of the task regions that modify \code{tp}. The parts of these task regions
in which \code{tp} is modified may be executed in any order so the resulting
value of \code{var} can be either 1 or 2.
\cexample{tasking}{7c}
\fexample{tasking}{7f}
In this example, scheduling constraints prohibit a thread in the team from executing
a new task that modifies \code{tp} while another such task region tied to the
same thread is suspended. Therefore, the value written will persist across the
task scheduling point.
\cexample{tasking}{8c}
\fexample{tasking}{8f}
The following two examples demonstrate how the scheduling rules illustrated in
Section 2.11.3 of the OpenMP 4.0 specification affect the usage of locks
and critical sections in tasks. If a lock is held
across a task scheduling point, no attempt should be made to acquire the same lock
in any code that may be interleaved. Otherwise, a deadlock is possible.
In the example below, suppose the thread executing task 1 defers task 2. When
it encounters the task scheduling point at task 3, it could suspend task 1 and
begin task 2 which will result in a deadlock when it tries to enter critical region
1.
\cexample{tasking}{9c}
\fexample{tasking}{9f}
In the following example, \code{lock} is held across a task scheduling point.
However, according to the scheduling restrictions, the executing thread can't
begin executing one of the non-descendant tasks that also acquires \code{lock} before
the task region is complete. Therefore, no deadlock is possible.
\cexample{tasking}{10c}
\fexample{tasking}{10f}
The following examples illustrate the use of the \code{mergeable} clause in the
\code{task} construct. In this first example, the \code{task} construct has
been annotated with the \code{mergeable} clause. The addition of this clause
allows the implementation to reuse the data environment (including the ICVs) of
the parent task for the task inside \code{foo} if the task is included or undeferred.
Thus, the result of the execution may differ depending on whether the task is merged
or not. Therefore the \code{mergeable} clause needs to be used with caution. In this example,
the use of the \code{mergeable} clause is safe. As \code{x} is a shared variable the
outcome does not depend on whether or not the task is merged (that is, the task
will always increment the same variable and will always compute the same value
for \code{x}).
\cexample{tasking}{11c}
\fexample{tasking}{11f}
This second example shows an incorrect use of the \code{mergeable} clause. In
this example, the created task will access different instances of the variable
\code{x} if the task is not merged, as \code{x} is \code{firstprivate}, but
it will access the same variable \code{x} if the task is merged. As a result,
the behavior of the program is unspecified and it can print two different values
for \code{x} depending on the decisions taken by the implementation.
\cexample{tasking}{12c}
\fexample{tasking}{12f}
The following example shows the use of the \code{final} clause and the \code{omp\_in\_final}
API call in a recursive binary search program. To reduce overhead, once a certain
depth of recursion is reached the program uses the \code{final} clause to create
only included tasks, which allow additional optimizations.
The use of the \code{omp\_in\_final} API call allows programmers to optimize
their code by specifying which parts of the program are not necessary when a task
can create only included tasks (that is, the code is inside a \code{final} task).
In this example, the use of a different state variable is not necessary once
the program reaches the part of the computation that is finalized, so copying from
the parent state to the new state is eliminated. The allocation of \code{new\_state}
in the stack could also be avoided but it would make this example less clear. The
\code{final} clause is most effective when used in conjunction with the \code{mergeable}
clause since all tasks created in a \code{final} task region are included tasks
that can be merged if the \code{mergeable} clause is present.
\cexample{tasking}{13c}
\fexample{tasking}{13f}
The following example illustrates the difference between the \code{if} and the
\code{final} clauses. The \code{if} clause has a local effect. In the first
nest of tasks, the one that has the \code{if} clause will be undeferred but
the task nested inside that task will not be affected by the \code{if} clause
and will be created as usual. Alternatively, the \code{final} clause affects
all \code{task} constructs in the \code{final} task region but not the \code{final}
task itself. In the second nest of tasks, the nested tasks will be created as included
tasks. Note also that the conditions for the \code{if} and \code{final} clauses
are usually the opposite.
\cexample{tasking}{14c}
\fexample{tasking}{14f}

View File

@ -1,14 +0,0 @@
\pagebreak
\chapter{The \code{taskyield} Construct}
\label{chap:taskyield}
The following example illustrates the use of the \code{taskyield} directive.
The tasks in the example compute something useful and then do some computation
that must be done in a critical region. By using \code{taskyield} when a task
cannot get access to the \code{critical} region the implementation can suspend
the current task and schedule some other task that can do something useful.
\cexample{taskyield}{1c}
\fexample{taskyield}{1f}

View File

@ -1,118 +0,0 @@
\pagebreak
\chapter{\code{teams} Constructs}
\label{chap:teams}
\section{\code{target} and \code{teams} Constructs with \code{omp\_get\_num\_teams}\\
and \code{omp\_get\_team\_num} Routines}
The following example shows how the \code{target} and \code{teams} constructs
are used to create a league of thread teams that execute a region. The \code{teams}
construct creates a league of at most two teams where the master thread of each
team executes the \code{teams} region.
The \code{omp\_get\_num\_teams} routine returns the number of teams executing in a \code{teams}
region. The \code{omp\_get\_team\_num} routine returns the team number, which is an integer
between 0 and one less than the value returned by \code{omp\_get\_num\_teams}. The following
example manually distributes a loop across two teams.
\cexample{teams}{1c}
\fexample{teams}{1f}
\section{\code{target}, \code{teams}, and \code{distribute} Constructs}
The following example shows how the \code{target}, \code{teams}, and \code{distribute}
constructs are used to execute a loop nest in a \code{target} region. The \code{teams}
construct creates a league and the master thread of each team executes the \code{teams}
region. The \code{distribute} construct schedules the subsequent loop iterations
across the master threads of each team.
The number of teams in the league is less than or equal to the variable \plc{num\_blocks}.
Each team in the league has a number of threads less than or equal to the variable
\plc{block\_threads}. The iterations in the outer loop are distributed among the master
threads of each team.
When a team's master thread encounters the parallel loop construct before the inner
loop, the other threads in its team are activated. The team executes the \code{parallel}
region and then workshares the execution of the loop.
Each master thread executing the \code{teams} region has a private copy of the
variable \plc{sum} that is created by the \code{reduction} clause on the \code{teams} construct.
The master thread and all threads in its team have a private copy of the variable
\plc{sum} that is created by the \code{reduction} clause on the parallel loop construct.
The second private \plc{sum} is reduced into the master thread's private copy of \plc{sum}
created by the \code{teams} construct. At the end of the \code{teams} region,
each master thread's private copy of \plc{sum} is reduced into the final \plc{sum} that is
implicitly mapped into the \code{target} region.
\cexample{teams}{2c}
\fexample{teams}{2f}
\section{\code{target} \code{teams} and Distribute Parallel Loop Constructs}
The following example shows how the \code{target} \code{teams} and distribute
parallel loop constructs are used to execute a \code{target} region. The \code{target}
\code{teams} construct creates a league of teams where the master thread of each
team executes the \code{teams} region.
The distribute parallel loop construct schedules the loop iterations across the
master threads of each team and then across the threads of each team.
\cexample{teams}{3c}
\fexample{teams}{3f}
\section{\code{target} \code{teams} and Distribute Parallel Loop
Constructs with Scheduling Clauses}
The following example shows how the \code{target} \code{teams} and distribute
parallel loop constructs are used to execute a \code{target} region. The \code{teams}
construct creates a league of at most eight teams where the master thread of each
team executes the \code{teams} region. The number of threads in each team is
less than or equal to 16.
The distribute parallel loop construct schedules the subsequent loop iterations
across the master threads of each team and then across the threads of each team.
The \code{dist\_schedule} clause on the distribute parallel loop construct indicates
that loop iterations are distributed to the master thread of each team in chunks
of 1024 iterations.
The \code{schedule} clause indicates that the 1024 iterations distributed to
a master thread are then assigned to the threads in its associated team in chunks
of 64 iterations.
\cexample{teams}{4c}
\fexample{teams}{4f}
\section{\code{target} \code{teams} and \code{distribute} \code{simd} Constructs}
The following example shows how the \code{target} \code{teams} and \code{distribute}
\code{simd} constructs are used to execute a loop in a \code{target} region.
The \code{target} \code{teams} construct creates a league of teams where the
master thread of each team executes the \code{teams} region.
The \code{distribute} \code{simd} construct schedules the loop iterations across
the master thread of each team and then uses SIMD parallelism to execute the iterations.
\cexample{teams}{5c}
\fexample{teams}{5f}
\section{\code{target} \code{teams} and Distribute Parallel Loop SIMD Constructs}
The following example shows how the \code{target} \code{teams} and the distribute
parallel loop SIMD constructs are used to execute a loop in a \code{target} \code{teams}
region. The \code{target} \code{teams} construct creates a league of teams
where the master thread of each team executes the \code{teams} region.
The distribute parallel loop SIMD construct schedules the loop iterations across
the master thread of each team and then across the threads of each team where each
thread uses SIMD parallelism.
\cexample{teams}{6c}
\fexample{teams}{6f}

View File

@ -1,106 +0,0 @@
\pagebreak
\chapter{The \code{threadprivate} Directive}
\label{chap:threadprivate}
The following examples demonstrate how to use the \code{threadprivate} directive
to give each thread a separate counter.
\cexample{threadprivate}{1c}
\fexample{threadprivate}{1f}
\ccppspecificstart
The following example uses \code{threadprivate} on a static variable:
\cnexample{threadprivate}{2c}
The following example demonstrates unspecified behavior for the initialization
of a \code{threadprivate} variable. A \code{threadprivate} variable is initialized
once at an unspecified point before its first reference. Because \code{a} is
constructed using the value of \code{x} (which is modified by the statement
\code{x++}), the value of \code{a.val} at the start of the \code{parallel}
region could be either 1 or 2. This problem is avoided for \code{b}, which uses
an auxiliary \code{const} variable and a copy-constructor.
\cnexample{threadprivate}{3c}
\ccppspecificend
The following examples show non-conforming uses and correct uses of the \code{threadprivate}
directive.
\fortranspecificstart
The following example is non-conforming because the common block is not declared
local to the subroutine that refers to it:
\fnexample{threadprivate}{2f}
The following example is also non-conforming because the common block is not declared
local to the subroutine that refers to it:
\fnexample{threadprivate}{3f}
The following example is a correct rewrite of the previous example:
% blue line floater at top of this page for "Fortran, cont."
\begin{figure}[t!]
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
\end{figure}
\fnexample{threadprivate}{4f}
The following is an example of the use of \code{threadprivate} for local variables:
\fnexample{threadprivate}{5f}
% blue line floater at top of this page for "Fortran, cont."
\begin{figure}[t!]
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
\end{figure}
The above program, if executed by two threads, will print one of the following
two sets of output:
\code{a = 11 12 13}
\\
\code{ptr = 4}
\\
\code{i = 15}
\code{A is not allocated}
\\
\code{ptr = 4}
\\
\code{i = 5}
or
\code{A is not allocated}
\\
\code{ptr = 4}
\\
\code{i = 15}
\code{a = 1 2 3}
\\
\code{ptr = 4}
\\
\code{i = 5}
The following is an example of the use of \code{threadprivate} for module variables:
\fnexample{threadprivate}{6f}
\fortranspecificend
\cppspecificstart
The following example illustrates initialization of \code{threadprivate} variables
for class-type \code{T}. \code{t1} is default constructed, \code{t2} is constructed
taking a constructor accepting one argument of integer type, \code{t3} is copy
constructed with argument \code{f()}:
\cnexample{threadprivate}{4c}
The following example illustrates the use of \code{threadprivate} for static
class members. The \code{threadprivate} directive for a static class member must
be placed inside the class definition.
\cnexample{threadprivate}{5c}
\cppspecificend

View File

@ -1,76 +0,0 @@
\pagebreak
\chapter{The \code{workshare} Construct}
\fortranspecificstart
\label{chap:workshare}
The following are examples of the \code{workshare} construct.
In the following example, \code{workshare} spreads work across the threads executing
the \code{parallel} region, and there is a barrier after the last statement.
Implementations must enforce Fortran execution rules inside of the \code{workshare}
block.
\fnexample{workshare}{1f}
In the following example, the barrier at the end of the first \code{workshare}
region is eliminated with a \code{nowait} clause. Threads doing \code{CC =
DD} immediately begin work on \code{EE = FF} when they are done with \code{CC
= DD}.
\fnexample{workshare}{2f}
% blue line floater at top of this page for "Fortran, cont."
\begin{figure}[t!]
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
\end{figure}
The following example shows the use of an \code{atomic} directive inside a \code{workshare}
construct. The computation of \code{SUM(AA)} is workshared, but the update to
\code{R} is atomic.
\fnexample{workshare}{3f}
Fortran \code{WHERE} and \code{FORALL} statements are \emph{compound statements},
made up of a \emph{control} part and a \emph{statement} part. When \code{workshare}
is applied to one of these compound statements, both the control and the statement
parts are workshared. The following example shows the use of a \code{WHERE} statement
in a \code{workshare} construct.
Each task gets worked on in order by the threads:
\code{AA = BB} then
\\
\code{CC = DD} then
\\
\code{EE .ne. 0} then
\\
\code{FF = 1 / EE} then
\\
\code{GG = HH}
\fnexample{workshare}{4f}
% blue line floater at top of this page for "Fortran, cont."
\begin{figure}[t!]
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
\end{figure}
In the following example, an assignment to a shared scalar variable is performed
by one thread in a \code{workshare} while all other threads in the team wait.
\fnexample{workshare}{5f}
The following example contains an assignment to a private scalar variable, which
is performed by one thread in a \code{workshare} while all other threads wait.
It is non-conforming because the private scalar variable is undefined after the
assignment statement.
\fnexample{workshare}{6f}
Fortran execution rules must be enforced inside a \code{workshare} construct.
In the following example, the same result is produced regardless of whether
the program fragment is executed sequentially or inside an OpenMP
program with multiple threads:
\fnexample{workshare}{7f}
\fortranspecificend

View File

@ -1,18 +0,0 @@
\pagebreak
\chapter{Worksharing Constructs Inside a \code{critical} Construct}
\label{chap:worksharing_critical}
The following example demonstrates using a worksharing construct inside a \code{critical}
construct. This example is conforming because the worksharing \code{single}
region is not closely nested inside the \code{critical} region. A single thread
executes the one and only section in the \code{sections} region, and executes
the \code{critical} region. The same thread encounters the nested \code{parallel}
region, creates a new team of threads, and becomes the master of the new team.
One of the threads in the new team enters the \code{single} region and increments
\code{i} by \code{1}. At the end of this example \code{i} is equal to \code{2}.
\cexample{worksharing_critical}{1c}
\fexample{worksharing_critical}{1f}

48
Foreword_Chapt.tex Normal file
View File

@ -0,0 +1,48 @@
\chapter*{Foreword}
\label{chap:foreword}
The OpenMP Examples document has been updated with new features
found in the OpenMP \SVER\ Specification.
In order to provide users with new feature examples concurrently
with the release of the OpenMP 6.0 Specification,
the 6.0 Examples document is being released early
with a caveat that some of the 6.0 features
(such as \kcode{workdistribute} construct, \kcode{taskgraph} construct,
\kcode{threadset} clause and free-agent threads) will be covered
in the next release of the document.
For a list of the new examples and updates in this release,
please refer to the Document Revision History of the Appendix on page~\pageref{chap:history}.
Text describing an example with a \SVER\ feature specifically states
that the feature support begins in the OpenMP \SVER\ Specification. Also,
an \kcode{\small{}omp_\SVER} keyword is included in the metadata of the source code.
These distinctions are presented to remind readers that a \SVER\ compliant
OpenMP implementation is necessary to use these features in codes.
%Examples for most of the \SVER\ features are included in this document,
%and
Incremental releases will become available as more feature examples
and updates are submitted and approved by the OpenMP Examples Subcommittee.
Examples are accepted for this document after discussions, revisions and reviews
in the Examples Subcommittee, and two reviews/discussions and two votes
in the OpenMP Language Committee.
Draft examples are often derived from case studies for new features in the language,
and are revised to illustrate the basic application of the features with code comments,
and a text description. We are grateful to the numerous members of the Language Committee
who took the time to prepare codes and descriptions, and shepherd them through
the acceptance process. We sincerely appreciate the Examples Subcommittee members, who
actively participated in and contributed to weekly meetings over the years.
\bigskip
Examples Subcommittee Co-chairs: \smallskip\linebreak
Henry Jin (\textsc{NASA} Ames Research Center) \linebreak
Swaroop Pophale (Oak Ridge National Laboratory)
\bigskip
\bigskip
Past Examples Subcommittee Co-chairs:
\begin{itemize}
\item Kent Milfeld (2014 - 2022)
\end{itemize}

View File

@ -1,11 +1,463 @@
\chapter{Document Revision History}
\cchapter{Document Revision History}{history}
\label{chap:history}
%=====================================
\section{Changes from 5.2.2 to 6.0}
\label{sec:history_522_to_60}
\begin{itemize}
\item General changes:
\begin{itemize}
\item Added a set of structured LaTeX environments for specifying
language-dependent text. This allows extracting language-specific
content of the Examples document. Refer to the content of
\examplesblob{v6.0/Contributions.md} for details.
\end{itemize}
\item Added the following examples for the 6.0 features:
\begin{itemize}
\item \kcode{omp::decl} attribute for declarative directives in C/C++
(\specref{sec:attributes})
\item \kcode{transparent} clause on the \kcode{task} construct to enable dependences
between non-sibling tasks (\specref{subsec:depend_trans_task})
\item Task dependences for \kcode{taskloop} construct
(\specref{sec:taskloop_depend})
\item \kcode{num_threads} clause that appears inside \kcode{target} region
(\specref{subsec:target_teams_num_teams})
\item \kcode{nowait} clause with argument on the \kcode{target} construct to control deferment
of target task (\specref{subsec:async_target_nowait_arg})
\item Traits for specifying devices (\specref{sec:device_env_traits})
\item \kcode{apply} clause with modifier argument to
support selective loop transformations
(\specref{sec:apply_clause})
\item Reduction on private variables in a \kcode{parallel} region
(\specref{subsec:priv_reduction})
\item \kcode{induction} clause (\specref{subsec:induction})
and user-defined induction (\specref{subsec:user-defined-induction})
\item \kcode{init_complete} clause for \kcode{scan} directive to
support initialization phase in scan operation
(\specref{sec:scan})
\item \kcode{assume} construct with \kcode{no_openmp} and \kcode{no_parallelism} clauses (\specref{sec:assumption})
\item \kcode{num_threads} clause with a list
(\specref{subsec:icv_nthreads})
\item \kcode{dispatch} construct to control variant substitution
for a procedure call (\specref{sec:dispatch})
\end{itemize}
\item Other changes:
\begin{itemize}
\item Changed attribute specifier as a directive form from C++ only to C/C++
(\specref{chap:directive_syntax})
\item Added missing \bcode{include <omp.h>} in Example \example{atomic.4.c}
and \bcode{use omp_lib} in Example \example{atomic.4.f90}
(\specref{sec:atomic_hint})
\item Fixed the function declaration order for variant functions in
Examples \example{selector_scoring.[12].c} and Fortran pointer
initialization in Example \example{selector_scoring.2.f90}
(\specref{subsec:context_selector_scoring})
\item Replaced the deprecated use of \plc{combiner-exp}
in \kcode{declare reduction} directive with \kcode{combiner} clause
(\specref{subsec:UDR} and \specref{sec:Updated Examples})
\item Fixed the initialization of Fortran pointers
in Example \example{cancellation.2.f90} and changed to
use \kcode{atomic write} for performing atomic writes
(\specref{sec:cancellation})
\item Added missing \kcode{declare target} directive for external procedure
called inside \kcode{target} region in Example
\example{requires.1.f90} (\specref{sec:requires})
\end{itemize}
\end{itemize}
%=====================================
\section{Changes from 5.2.1 to 5.2.2}
\label{sec:history_521_to_522}
\begin{itemize}
\item To improve the style of the document, a set of macros was introduced
and consistently used for language keywords, names, concepts, and user codes
in the text description of the document. Refer to the content of
\examplesblob{v5.2.2/Contributions.md}
for details.
\item Added the following examples:
\begin{itemize}
\item Orphaned and nested \kcode{loop} constructs (\specref{sec:loop})
\item \kcode{all} variable category for the \kcode{defaultmap} clause
(\specref{sec:defaultmap})
\item \kcode{target update} construct using a custom mapper
(\specref{subsec:target_update_mapper})
\item \kcode{indirect} clause for indirect procedure calls in a
\kcode{target} region (\specref{subsec:indirect})
\item \kcode{omp_target_memcpy_async} routine with depend object
(\specref{subsec:target_mem_and_device_ptrs})
\item Synchronization hint for atomic operation (\specref{sec:atomic_hint})
\item Implication of passing shared variable to a procedure
in Fortran (\specref{sec:fort_shared_var})
\item Assumption directives for providing additional information
about program properties (\specref{sec:assumption})
\item Mapping behavior of scalars, pointers, references (C++) and associate names
(Fortran) when unified shared memory is required
(\specref{sec:requires})
\item \kcode{begin declare variant} paired with \kcode{end declare variant}
example to show use of nested declare variant
directives (\specref{subsec:declare_variant})
\item Explicit scoring in context selectors
(\specref{subsec:context_selector_scoring})
\end{itemize}
\item Miscellaneous changes:
\begin{itemize}
\item Included a general statement in Introduction about the number of
threads used throughout the examples document (\specref{sec:examples})
\item Clarified the mapping of virtual functions in \kcode{target} regions
(\specref{sec:virtual_functions})
\item Added missing \kcode{declare target} directive for procedures
called inside \kcode{target} region in \example{Examples}
\example{declare_mapper.1.f90} (\specref{sec:declare_mapper}),
\example{target_reduction.*.f90} (\specref{subsec:target_reduction}),
and \example{target_task_reduction.*.f90}
(\specref{subsec:target_task_reduction})
\item Added missing \kcode{end target} directive in
\example{Example declare_mapper.3.f90}
(\specref{sec:declare_mapper})
\item Removed example for \kcode{flush} without a list from Synchronization
since the example is confusing and the use of \kcode{flush} is already
covered in other examples
(\specref{chap:synchronization})
\item \docref{declare variant Directive} and \docref{Metadirective} sections were moved to
subsections in the new \docref{Context-based Variant Selection} section,
with a section introduction on context selectors
(\specref{sec:context_based_variants})
\item Fixed a typo (`\kcode{for}' $\rightarrow$ `\kcode{do}') in
\example{Example metadirective.4.f90}
(\specref{subsec:metadirective})
\end{itemize}
\end{itemize}
%=====================================
\section{Changes from 5.2 to 5.2.1}
\label{sec:history_52_to_521}
\begin{itemize}
\item General changes:
\begin{itemize}
\item Updated source metadata tags for all examples to use an improved form
(see \examplesblob{v5.2.1/Contributions.md})
\item Explicitly included the version tag \verlabel[pre\_]{3.0} in those
examples that did not contain a version tag previously
\end{itemize}
\item Added the following examples for the 5.2 features:
\begin{itemize}
\item \kcode{uses_allocators} clause for the use of allocators in
\kcode{target} regions (\specref{sec:allocators})
\end{itemize}
\item Added the following examples for the 5.1 features:
\begin{itemize}
\item The \kcode{inoutset} dependence type (\specref{subsec:task_concurrent_depend})
\item Atomic compare and capture (\specref{sec:cas})
\end{itemize}
\item Added the following examples for the 5.0 features:
\begin{itemize}
\item \kcode{declare target} directive with \kcode{device_type(nohost)}
clause (\specref{subsec:declare_target_device_type})
\item \kcode{omp_pause_resource} and \kcode{omp_pause_resource_all}
routines (\specref{sec:pause_resource})
\end{itemize}
\item Miscellaneous fixes:
\begin{itemize}
\item Cast to implementation-defined enum type \kcode{omp_event_handle_t}
now uses \bcode{uintptr_t} (not \bcode{void *}) in
\example{Example task_detach.2.c}
(\specref{sec:task_detachment})
\item Moved Fortran \kcode{requires} directive into program main (\ucode{rev_off}),
the program unit, in \example{Example target_reverse_offload.7.f90}
(\specref{subsec:target_reverse_offload})
\item Fixed an inconsistent use of mapper in \example{Example target_mapper.3.f90}
(\specref{sec:declare_mapper})
\item Added a missing semicolon at end of \ucode{XOR1} class definition in
\example{Example declare_target.2a.cpp}
(\specref{subsec:declare_target_class})
\item Fixed the placement of \kcode{declare simd} directive in
\example{Examples linear_modifier.*.f90} (\specref{sec:linear_modifier})
and added a general statement about where a Fortran declarative
directive can appear (\specref{chap:directive_syntax})
\item Fixed mismatched argument list in \example{Example fort_sa_private.5.f}
(\specref{sec:fort_sa_private})
\item Moved the placement of \kcode{declare target enter}
directive after function declaration
(\specref{subsec:target_task_reduction})
\item Fixed an incorrect use of \kcode{omp_in_parallel} routine in
\example{Example metadirective.4}
(\specref{subsec:metadirective})
\item Fixed an incorrect value for \kcode{at} clause
(\specref{subsec:error})
\end{itemize}
\end{itemize}
%=====================================
\section{Changes from 5.1 to 5.2}
\label{sec:history_51_to_52}
\begin{itemize}
\item General changes:
\begin{itemize}
\item Included a description of the semantics for OpenMP directive syntax
(see \specref{chap:directive_syntax})
\item Reorganized the Introduction Chapter and moved the Feature
Deprecation Chapter to Appendix~\ref{chap:deprecated_features}
\item Included a list of examples that were updated for feature deprecation
and replacement in each version (see Appendix~\ref{sec:Updated Examples})
\item Added Index entries
\end{itemize}
\item Updated the examples for feature deprecation and replacement in OpenMP 5.2.
See Table~\ref{tab:Deprecated Features} and
Table~\ref{tab:Updated Examples 5.2} for details.
\item Added the following examples for the 5.2 features:
\begin{itemize}
\item Mapping class objects with virtual functions
(\specref{sec:virtual_functions})
\item \kcode{allocators} construct for Fortran \bcode{allocate} statement
(\specref{sec:allocators})
\item Behavior of reallocation of variables through OpenMP allocator in
Fortran (\specref{sec:allocators})
\end{itemize}
\item Added the following examples for the 5.1 features:
\begin{itemize}
\item Clarification of optional \kcode{end} directive for strictly structured
block in Fortran (\specref{sec:fortran_free_format_comments})
\item \kcode{filter} clause on \kcode{masked} construct (\specref{sec:masked})
\item \kcode{omp_all_memory} reserved locator for specifying task dependences
(\specref{subsec:depend_undefer_task})
\item Behavior of Fortran allocatable variables in \kcode{target} regions
(\specref{sec:fort_allocatable_array_mapping})
\item Device memory routines in Fortran
(\specref{subsec:target_mem_and_device_ptrs})
\item Partial tiles from \kcode{tile} construct
(\specref{sec:incomplete_tiles})
\item Fortran associate names and selectors in \kcode{target} region
(\specref{sec:associate_target})
\item \kcode{allocate} directive for variable declarations and
\kcode{allocate} clause on \kcode{task} constructs
(\specref{sec:allocators})
\item Controlling concurrency and reproducibility with \kcode{order} clause
(\specref{sec:reproducible_modifier})
\end{itemize}
\item Added other examples:
\begin{itemize}
\item Using lambda expressions with \kcode{target} constructs
(\specref{sec:lambda_expressions})
\item Target memory and device pointer routines
(\specref{subsec:target_mem_and_device_ptrs})
\item Examples to illustrate the ordering properties of
the \plc{flush} operation (\specref{sec:mem_model})
\item User selector in the \kcode{metadirective} directive
(\specref{subsec:metadirective})
\end{itemize}
\end{itemize}
%=====================================
\section{Changes from 5.0.1 to 5.1}
\label{sec:history_501_to_51}
\begin{itemize}
\item General changes:
\begin{itemize}
\item Replaced \kcode{master} construct example with equivalent \kcode{masked} construct example (\specref{sec:masked})
\item Primary thread is now used to describe thread number 0 in the current team
\item \kcode{primary} thread affinity policy is now used to specify that every
thread in the team is assigned to the same place as the primary thread (\specref{subsec:affinity_primary})
\item The \kcode{omp_lock_hint_*} constants have been renamed \kcode{omp_sync_hint_*} (\specref{sec:critical}, \specref{sec:locks})
\end{itemize}
\item Added the following new chapters:
\begin{itemize}
\item Deprecated Features (on page~\pageref{chap:deprecated_features})
\item Directive Syntax (\specref{chap:directive_syntax})
\item Loop Transformations (\specref{chap:loop_transformations})
\item OMPT Interface (\specref{chap:ompt_interface})
\end{itemize}
\item Added the following examples for the 5.1 features:
\begin{itemize}
\item OpenMP directives in C++ \plc{attribute} specifiers
(\specref{sec:attributes})
\item Directive syntax adjustment to allow Fortran \bcode{BLOCK} ...
\bcode{END BLOCK} as a structured block
(\specref{sec:fortran_free_format_comments})
\item \kcode{omp_target_is_accessible} API routine
(\specref{sec:pointer_mapping})
\item Fortran allocatable array mapping in \kcode{target} regions (\specref{sec:fort_allocatable_array_mapping})
\item \kcode{begin declare target} (with
\kcode{end declare target}) directive
(\specref{subsec:declare_target_class})
\item \kcode{tile} construct (\specref{sec:tile})
\item \kcode{unroll} construct (\specref{sec:unroll})
\item Reduction with the \kcode{scope} construct
(\specref{subsec:reduction_scope})
\item \kcode{metadirective} directive with dynamic \kcode{condition} selector
(\specref{subsec:metadirective})
\item \kcode{interop} construct (\specref{sec:interop})
\item Environment display with the \kcode{omp_display_env} routine
(\specref{subsec:display_env})
\item \kcode{error} directive (\specref{subsec:error})
\end{itemize}
\item Included additional examples for the 5.0 features:
\begin{itemize}
\item \kcode{collapse} clause for non-rectangular loop nest
(\specref{sec:collapse})
\item \kcode{detach} clause for tasks (\specref{sec:task_detachment})
\item Pointer attachment for a structure member (\specref{sec:structure_mapping})
\item Host and device pointer association with the \kcode{omp_target_associate_ptr} routine (\specref{sec:target_associate_ptr})
\item Sample code on activating the tool interface
(\specref{sec:ompt_start})
\end{itemize}
\item Added other examples:
\begin{itemize}
\item The \kcode{omp_get_wtime} routine (\specref{subsec:get_wtime})
\end{itemize}
\end{itemize}
%=====================================
\section{Changes from 5.0.0 to 5.0.1}
\label{sec:history_50_to_501}
\begin{itemize}
\item Added version tags \verlabel{\plc{x.y}} in example labels
and the corresponding source codes for all examples that feature
OpenMP 3.0 and later.
\item Included additional examples for the 5.0 features:
\begin{itemize}
\item Extension to the \kcode{defaultmap} clause
(\specref{sec:defaultmap})
\item Transferring noncontiguous data with the \kcode{target update} directive in Fortran (\specref{sec:array-shaping})
\item \kcode{conditional} modifier for the \kcode{lastprivate} clause (\specref{sec:lastprivate})
\item \kcode{task} modifier for the \kcode{reduction} clause (\specref{subsec:task_reduction})
\item Reduction on combined target constructs (\specref{subsec:target_reduction})
\item Task reduction with \kcode{target} constructs
(\specref{subsec:target_task_reduction})
\item \kcode{scan} directive for returning the \emph{prefix sum} of a reduction (\specref{sec:scan})
\end{itemize}
\item Included additional examples for the 4.x features:
\begin{itemize}
\item Dependence for undeferred tasks
(\specref{subsec:depend_undefer_task})
\item \kcode{ref}, \kcode{val}, \kcode{uval} modifiers for \kcode{linear} clause (\specref{sec:linear_modifier})
\end{itemize}
\item Clarified the description of pointer mapping and pointer attachment in
\specref{sec:pointer_mapping}.
\item Clarified the description of memory model examples
in \specref{sec:mem_model}.
\end{itemize}
\section{Changes from 4.5.0 to 5.0.0}
\label{sec:history_45_to_50}
\begin{itemize}
\item Added the following examples for the 5.0 features:
\begin{itemize}
\item Extended \kcode{teams} construct for host execution (\specref{sec:host_teams})
\item \kcode{loop} and \kcode{teams loop} constructs specify loop iterations that can execute concurrently
(\specref{sec:loop})
\item Task data affinity is indicated by \kcode{affinity} clause of \kcode{task} construct
(\specref{sec: task_affinity})
\item Display thread affinity with \kcode{OMP_DISPLAY_AFFINITY} environment variable or \kcode{omp_display_affinity()} API routine
(\specref{sec:affinity_display})
\item \kcode{taskwait} with dependences (\specref{subsec:taskwait_depend})
\item \kcode{mutexinoutset} task dependences (\specref{subsec:task_dep_mutexinoutset})
\item Multidependence Iterators (in \kcode{depend} clauses) (\specref{subsec:depend_iterator})
\item Combined constructs: \kcode{parallel master taskloop} and \kcode{parallel master taskloop simd}
(\specref{sec:parallel_masked_taskloop})
\item Reverse Offload through \kcode{ancestor} modifier of \kcode{device} clause (\specref{subsec:target_reverse_offload})
\item Pointer Mapping - behavior of mapped pointers (\specref{sec:pointer_mapping}) %Example_target_ptr_map*
\item Structure Mapping - behavior of mapped structures (\specref{sec:structure_mapping}) %Examples_target_structure_mapping.tex target_struct_map*
\item Array Shaping with the \plc{shape-operator} (\specref{sec:array-shaping})
\item The \kcode{declare mapper} directive (\specref{sec:declare_mapper})
\item Acquire and Release Semantics Synchronization: Memory ordering
clauses \kcode{acquire}, \kcode{release}, and \kcode{acq_rel} were added
to flush and atomic constructs
(\specref{sec:acquire_and_release_semantics})
\item \kcode{depobj} construct provides dependence objects for subsequent use in \kcode{depend} clauses
(\specref{sec:depobj})
\item \kcode{reduction} clause for \kcode{task} construct (\specref{subsec:task_reduction})
\item \kcode{reduction} clause for \kcode{taskloop} construct (\specref{subsec:taskloop_reduction})
\item \kcode{reduction} clause for \kcode{taskloop simd} construct (\specref{subsec:taskloop_reduction})
\item Memory Allocators for making OpenMP memory requests with traits (\specref{sec:allocators})
\item \kcode{requires} directive specifies required features of implementation (\specref{sec:requires})
\item \kcode{declare variant} directive - for function variants
(\specref{subsec:declare_variant})
\item \kcode{metadirective} directive - for directive variants
(\specref{subsec:metadirective})
\item \kcode{OMP_TARGET_OFFLOAD} Environment Variable - controls offload behavior (\specref{sec:target_offload})
\end{itemize}
\item Included the following additional examples for the 4.x features:
\begin{itemize}
\item more taskloop examples (\specref{sec:taskloop})
\item user-defined reduction (UDR) (\specref{subsec:UDR})
%NEW 5.0
%\item \code{target} \code{enter} and \code{exit} \code{data} unstructured data constructs (\specref{sec:target_enter_exit_data}) %Example_target_unstructured_data.* ?
\end{itemize}
\end{itemize}
\section{Changes from 4.0.2 to 4.5.0}
\begin{itemize}
\item Reorganized into chapters of major topics
\item Included file extensions in example labels to indicate source type
\item Applied the explicit \kcode{map(tofrom)} for scalar variables
in a number of examples to comply with
the change of the default behavior for scalar variables from
\kcode{map(tofrom)} to \kcode{firstprivate} in the 4.5 specification
\item Added the following new examples:
\begin{itemize}
\item \kcode{linear} clause in loop constructs (\specref{sec:linear_in_loop})
\item \kcode{priority} clause for \kcode{task} construct (\specref{sec:task_priority})
\item \kcode{taskloop} construct (\specref{sec:taskloop})
\item \plc{directive-name} modifier in multiple \kcode{if} clauses on
a combined construct (\specref{subsec:target_if})
\item unstructured data mapping (\specref{sec:target_enter_exit_data})
\item \kcode{link} clause for \kcode{declare target} directive
(\specref{subsec:declare_target_link})
\item asynchronous target execution with \kcode{nowait} clause (\specref{sec:async_target_exec_depend})
\item device memory routines and device pointers (\specref{subsec:target_mem_and_device_ptrs})
\item doacross loop nest (\specref{sec:doacross})
\item locks with hints (\specref{sec:locks})
\item C/C++ array reduction (\specref{subsec:reduction})
\item C++ reference types in data sharing clauses (\specref{sec:cpp_reference})
\end{itemize}
\end{itemize}
\section{Changes from 4.0.1 to 4.0.2}
\begin{itemize}
\item Names of examples were changed from numbers to mnemonics
\item Added SIMD examples (\specref{chap:SIMD})
\item Added SIMD examples (\specref{sec:SIMD})
\item Applied miscellaneous fixes in several source codes
\item Added the revision history
\end{itemize}
@ -14,27 +466,29 @@
Added the following new examples:
\begin{itemize}
\item the \code{proc\_bind} clause (\specref{chap:affinity})
\item the \code{taskgroup} construct (\specref{chap:taskgroup})
\item the \kcode{proc_bind} clause (\specref{sec:affinity})
\item the \kcode{taskgroup} construct (\specref{sec:taskgroup})
\end{itemize}
\section{Changes from 3.1 to 4.0}
Beginning with OpenMP 4.0, examples were placed in a separate document
from the specification document.
Version 4.0 added the following new examples:
\begin{itemize}
\item task dependences (\specref{chap:task_dep})
\item cancellation constructs (\specref{chap:cancellation})
\item \code{target} construct (\specref{chap:target})
\item \code{target} \code{data} construct (\specref{chap:target_data})
\item \code{target} \code{update} construct (\specref{chap:target_update})
\item \code{declare} \code{target} construct (\specref{chap:declare_target})
\item \code{teams} constructs (\specref{chap:teams})
\item asynchronous execution of a \code{target} region using tasks
(\specref{chap:async_target})
\item array sections in device constructs (\specref{chap:array_sections})
\item device runtime routines (\specref{chap:device})
\item Fortran ASSOCIATE construct (\specref{chap:associate})
\item Beginning with OpenMP 4.0, examples were placed in a separate document
from the specification document.
\item Version 4.0 added the following new examples:
\begin{itemize}
\item task dependences (\specref{sec:task_depend})
\item \kcode{target} construct (\specref{sec:target})
\item array sections in device constructs (\specref{sec:array_sections})
\item \kcode{target data} construct (\specref{sec:target_data})
\item \kcode{target update} construct (\specref{sec:target_update})
\item \kcode{declare target} directive (\specref{sec:declare_target})
\item \kcode{teams} constructs (\specref{sec:teams})
\item asynchronous execution of a \kcode{target} region using tasks (\specref{subsec:async_target_with_tasks})
\item device runtime routines (\specref{sec:device})
\item Fortran ASSOCIATE construct (\specref{sec:associate})
\item cancellation constructs (\specref{sec:cancellation})
\end{itemize}
\end{itemize}

179
Makefile
View File

@ -1,77 +1,41 @@
# Makefile for the OpenMP Examples document in LaTeX format.
# For more information, see the master document, openmp-examples.tex.
# For more information, see the main document, openmp-examples.tex.
SHELL=bash
include versioninfo
version=4.0.2
default: openmp-examples.pdf
diff: clean openmp-diff-abridged.pdf
release: VERSIONSTR="$(version_date)"
release: clean openmp-examples.pdf
book: BOOK_BUILD="\\def\\bookbuild{1}"
book: clean release
mv openmp-examples-${version}.pdf openmp-examples-${version}-book.pdf
ccpp-only: LANG_OPT="\\ccpptrue\\fortranfalse"
ccpp-only: clean release
fortran-only: LANG_OPT="\\ccppfalse\\fortrantrue"
fortran-only: clean release
CHAPTERS=Title_Page.tex \
Introduction_Chapt.tex \
Examples_Chapt.tex \
Examples_ploop.tex \
Examples_mem_model.tex \
Examples_cond_comp.tex \
Examples_icv.tex \
Examples_parallel.tex \
Examples_nthrs_nesting.tex \
Examples_nthrs_dynamic.tex \
Examples_affinity.tex \
Examples_fort_do.tex \
Examples_fort_loopvar.tex \
Examples_nowait.tex \
Examples_collapse.tex \
Examples_psections.tex \
Examples_fpriv_sections.tex \
Examples_single.tex \
Examples_tasking.tex \
Examples_task_dep.tex \
Examples_taskgroup.tex \
Examples_taskyield.tex \
Examples_workshare.tex \
Examples_master.tex \
Examples_critical.tex \
Examples_worksharing_critical.tex \
Examples_barrier_regions.tex \
Examples_atomic.tex \
Examples_atomic_restrict.tex \
Examples_flush_nolist.tex \
Examples_standalone.tex \
Examples_ordered.tex \
Examples_cancellation.tex \
Examples_threadprivate.tex \
Examples_pra_iterator.tex \
Examples_fort_sp_common.tex \
Examples_default_none.tex \
Examples_fort_race.tex \
Examples_private.tex \
Examples_fort_sa_private.tex \
Examples_carrays_fpriv.tex \
Examples_lastprivate.tex \
Examples_reduction.tex \
Examples_copyin.tex \
Examples_copyprivate.tex \
Examples_nested_loop.tex \
Examples_nesting_restrict.tex \
Examples_set_dynamic_nthrs.tex \
Examples_get_nthrs.tex \
Examples_init_lock.tex \
Examples_lock_owner.tex \
Examples_simple_lock.tex \
Examples_nestable_lock.tex \
Examples_SIMD.tex \
Examples_target.tex \
Examples_target_data.tex \
Examples_target_update.tex \
Examples_declare_target.tex \
Examples_teams.tex \
Examples_async_target.tex \
Examples_array_sections.tex \
Examples_device.tex \
Examples_associate.tex \
History.tex
Foreword_Chapt.tex \
Chap_*.tex \
Deprecated_Features.tex \
History.tex \
*/*.tex
SOURCES=*/sources/*.c \
*/sources/*.cpp \
*/sources/*.f90 \
*/sources/*.f
INTERMEDIATE_FILES=openmp-examples.pdf \
openmp-examples.toc \
openmp-examples.lof \
openmp-examples.lot \
openmp-examples.idx \
openmp-examples.aux \
openmp-examples.ilg \
@ -79,13 +43,90 @@ INTERMEDIATE_FILES=openmp-examples.pdf \
openmp-examples.out \
openmp-examples.log
openmp-examples.pdf: $(CHAPTERS) openmp.sty openmp-examples.tex openmp-logo.png
LATEXCMD=pdflatex -interaction=batchmode -file-line-error
LATEXDCMD=$(LATEXCMD) -draftmode
# check for branches names with "name_XXX"
DIFF_TICKET_ID=$(shell git rev-parse --abbrev-ref HEAD)
GITREV=$(shell git rev-parse --short HEAD || echo "??")
VERSIONSTR="GIT rev $(GITREV)"
LANG_OPT="\\ccpptrue\\fortrantrue"
openmp-examples.pdf: $(CHAPTERS) $(SOURCES) openmp.sty openmp-examples.tex openmp-logo.png generated-include.tex
rm -f $(INTERMEDIATE_FILES)
pdflatex -interaction=batchmode -file-line-error openmp-examples.tex
pdflatex -interaction=batchmode -file-line-error openmp-examples.tex
pdflatex -interaction=batchmode -file-line-error openmp-examples.tex
touch generated-include.tex
$(LATEXDCMD) openmp-examples.tex
makeindex -s openmp-index.ist openmp-examples.idx
$(LATEXDCMD) openmp-examples.tex
$(LATEXCMD) openmp-examples.tex
cp openmp-examples.pdf openmp-examples-${version}.pdf
check:
sources/check_tags
clean:
rm -f $(INTERMEDIATE_FILES)
rm -f generated-include.tex
rm -f openmp-diff-full.pdf openmp-diff-abridged.pdf
rm -rf *.tmpdir
cd util; make clean
rm -f chk_tags.log sources/*.log
realclean: clean
rm -f openmp-examples-${version}.pdf openmp-examples-${version}-book.pdf
ifdef DIFF_TO
VC_DIFF_TO := -r ${DIFF_TO}
else
VC_DIFF_TO :=
endif
ifdef DIFF_FROM
VC_DIFF_FROM := -r ${DIFF_FROM}
else
VC_DIFF_FROM := -r work_6.0
endif
DIFF_TO:=HEAD
DIFF_FROM:=work_6.0
DIFF_TYPE:=UNDERLINE
COMMON_DIFF_OPTS:=--math-markup=whole \
--append-safecmd=plc,code,kcode,scode,ucode,vcode,splc,bcode,pvar,pout,example \
--append-textcmd=subsubsubsection
VC_DIFF_OPTS:=${COMMON_DIFF_OPTS} --force -c latexdiff.cfg --flatten --type="${DIFF_TYPE}" --git --pdf ${VC_DIFF_FROM} ${VC_DIFF_TO} --subtype=ZLABEL --graphics-markup=none
VC_DIFF_MINIMAL_OPTS:= --only-changes --force
generated-include.tex:
echo "$(BOOK_BUILD)" > $@
echo "\\def\\VER{${version}}" >> $@
echo "\\def\\SVER{${version_spec}}" >> $@
echo "\\def\\VERDATE{${VERSIONSTR}}" >> $@
@echo "\\newif\\ifccpp\\newif\\iffortran" >> $@
echo "$(LANG_OPT)" >> $@
util/list_tags -vtag */sources/* >> $@
%.tmpdir: $(wildcard *.sty) $(wildcard *.png) $(wildcard *.aux) openmp-examples.pdf
mkdir -p $@/sources
for i in affinity devices loop_transformations parallel_execution SIMD tasking \
data_environment memory_model program_control synchronization \
directives ompt_interface; do \
mkdir -p $@/$$i; ln -sf "$$PWD"/$$i/sources $@/$$i/sources; done
mkdir -p $@/figs
cp -f $^ "$@/"
cp -f sources/* "$@/sources"
cp -f figs/* "$@/figs"
openmp-diff-abridged.pdf: diff-fast-minimal.tmpdir openmp-examples.pdf
env PATH="$(shell pwd)/util/latexdiff:$(PATH)" latexdiff-vc ${VC_DIFF_MINIMAL_OPTS} --fast -d $< ${VC_DIFF_OPTS} openmp-examples.tex
cp $</openmp-examples.pdf $@
if [ "x$(DIFF_TICKET_ID)" != "x" ]; then cp $@ ${@:.pdf=-$(DIFF_TICKET_ID).pdf}; fi
# Slow but portable diffs
openmp-diff-minimal.pdf: diffs-slow-minimal.tmpdir
env PATH="$(shell pwd)/util/latexdiff:$(PATH)" latexdiff-vc ${VC_DIFF_MINIMAL_OPTS} -d $< ${VC_DIFF_OPTS} openmp-examples.tex
cp $</openmp-examples.pdf $@
if [ "x$(DIFF_TICKET_ID)" != "x" ]; then cp $@ ${@:.pdf=-$(DIFF_TICKET_ID).pdf}; fi
.PHONY: diff default book clean realclean

62
README
View File

@ -1,62 +0,0 @@
This is the OpenMP Examples document in LaTeX format.
Please see the master file, openmp-examples.tex, for more information.
For a brief revision history, please see Changes.log.
1) Process for adding an example
- Prepare source code and text description
- Give a high level description in a trac ticket
- Determine a name (ename) for the example
- Propose a new name if creating a new chapter
- Use the existing name if adding to an existing chapter
- Number the example within the chapter (seq-no)
- Create files for the source code with proper tags in
sources/Example_<ename>.<seq-no>c.c
sources/Example_<ename>.<seq-no>f.f
- Create or update the description text in the chapter file
Examples_<ename>.tex
- If needed, add the new chapter file name in
Makefile
openmp-examples.tex
- Commit the changes in git and push to the GitHub repo
- Discuss and vote in committee
2) Tags (meta data) for example sources
@@name: <ename>.<seq-no>[c|f]
@@type: C|C++|F-fixed|F-free
@@compilable: yes|no|maybe
@@linkable: yes|no|maybe
@@expect: success|failure|nothing|rt-error
"name" is the name of an example
"type" is the source code type, which can be translated into or from
proper file extension (c,cpp,f,f90)
"compilable" indicates whether the source code is compilable
"linkable" indicates whether the source code is linkable
"expect" indicates some expected result for testing purpose
"success|failure|nothing" applies to the result of code compilation
"rt-error" is for a case where compilation may be successful,
but the code contains potential runtime issues (such as race condition).
Alternative would be to just use "conforming" or "non-conforming".
3) LaTeX macros for examples
- Source code with language h-rules
\cexample{<ename>}{<seq-no>c}
\fexample{<ename>}{<seq-no>f}
- Source code without language h-rules
\cnexample{<ename>}{<seq-no>c}
\fnexample{<ename>}{<seq-no>f}
- Language h-rules
\cspecificstart, \cspecificend
\cppspecificstart, \cppspecificend
\ccppspecificstart, \ccppspecificend
\fortranspecificstart, \fortranspecificend
- See openmp.sty for more information

View File

@ -1,2 +1,10 @@
# Examples
LaTeX Examples Document Source
# OpenMP Examples Document
This is the OpenMP Examples document in LaTeX format.
Please see [Contributions.md](Contributions.md) for how to contribute new examples.
For a brief revision history, please see [Changes.log](Changes.log).
For copyright information, please see [omp_copyright.txt](omp_copyright.txt).

150
SIMD/SIMD.tex Normal file
View File

@ -0,0 +1,150 @@
%\pagebreak
\section{\kcode{simd} and \kcode{declare simd} Directives}
\label{sec:SIMD}
\index{constructs!simd@\kcode{simd}}
\index{simd construct@\kcode{simd} construct}
The following example illustrates the basic use of the \kcode{simd} construct
to assure the compiler that the loop can be vectorized.
\cexample[4.0]{SIMD}{1}
\ffreeexample[4.0]{SIMD}{1}
\index{directives!declare simd@\kcode{declare simd}}
\index{declare simd directive@\kcode{declare simd} directive}
\index{clauses!uniform@\kcode{uniform}}
\index{uniform clause@\kcode{uniform} clause}
\index{clauses!linear@\kcode{linear}}
\index{linear clause@\kcode{linear} clause}
When a function can be inlined within a loop, the compiler has an opportunity to
vectorize the loop. By guaranteeing SIMD behavior of a function's operations,
characterizing the arguments of the function and privatizing temporary
variables of the loop, the compiler can often create faster, vector code for
the loop. In the examples below the \kcode{declare simd} directive is
used on the \ucode{add1} and \ucode{add2} functions to enable creation of their
corresponding SIMD function versions for execution within the associated SIMD
loop. The functions characterize two different approaches of accessing data
within the function: by a single variable and as an element in a data array,
respectively. The \ucode{add3} C function uses dereferencing.
The \kcode{declare simd} directives also illustrate the use of
\kcode{uniform} and \kcode{linear} clauses. The \kcode{uniform(\ucode{fact})} clause
indicates that the variable \ucode{fact} is invariant across the SIMD lanes. In
the \ucode{add2} function \ucode{a} and \ucode{b} are included in the \kcode{uniform}
list because the C pointer and the Fortran array references are constant. The
\ucode{i} index used in the \ucode{add2} function is included in a \kcode{linear}
clause with a constant-linear-step of 1, to guarantee a unity increment of the
associated loop. In the \kcode{declare simd} directive for the \ucode{add3}
C function the \kcode{linear(\ucode{a,b:1})} clause instructs the compiler to generate
unit-stride loads across the SIMD lanes; otherwise, costly \emph{gather}
instructions would be generated for the unknown sequence of access of the
pointer dereferences.
In the \kcode{simd} constructs for the loops the \kcode{private(\ucode{tmp})} clause is
necessary to assure that each vector operation has its own \ucode{tmp}
variable.
\cexample[4.0]{SIMD}{2}
\ffreeexample[4.0]{SIMD}{2}
%\pagebreak
\index{clauses!private@\kcode{private}}
\index{private clause@\kcode{private} clause}
\index{clauses!reduction@\kcode{reduction}}
\index{reduction clause@\kcode{reduction} clause}
\index{reductions!reduction clause@\kcode{reduction} clause}
A thread that encounters a SIMD construct executes a vectorized code of the
iterations. Similar to the concerns of a worksharing loop, a loop vectorized
with a SIMD construct must assure that temporary and reduction variables are
privatized and declared as reductions with clauses. The example below
illustrates the use of \kcode{private} and \kcode{reduction} clauses in a SIMD
construct.
\cexample[4.0]{SIMD}{3}
\ffreeexample[4.0]{SIMD}{3}
%\pagebreak
\index{clauses!safelen@\kcode{safelen}}
\index{safelen clause@\kcode{safelen} clause}
A \kcode{safelen(\ucode{N})} clause in a \kcode{simd} construct assures the compiler that
there are no loop-carried dependences for vectors of size \ucode{N} or below. If
the \kcode{safelen} clause is not specified, then the default safelen value is
the number of loop iterations.
The \kcode{safelen(\ucode{16})} clause in the example below guarantees that the vector
code is safe for vectors up to and including size 16. In the loop, \ucode{m} can
be 16 or greater, for correct code execution. If the value of \ucode{m} is less
than 16, the behavior is undefined.
\cexample[4.0]{SIMD}{4}
\ffreeexample[4.0]{SIMD}{4}
%\pagebreak
\index{clauses!collapse@\kcode{collapse}}
\index{collapse clause@\kcode{collapse} clause}
The following SIMD construct instructs the compiler to collapse the \ucode{i} and
\ucode{j} loops into a single SIMD loop in which SIMD chunks are executed by
threads of the team. Within the workshared loop chunks of a thread, the SIMD
chunks are executed in the lanes of the vector units.
\cexample[4.0]{SIMD}{5}
\ffreeexample[4.0]{SIMD}{5}
%%% section
\section{\kcode{inbranch} and \kcode{notinbranch} Clauses}
\label{sec:SIMD_branch}
\index{clauses!inbranch@\kcode{inbranch}}
\index{inbranch clause@\kcode{inbranch} clause}
\index{clauses!notinbranch@\kcode{notinbranch}}
\index{notinbranch clause@\kcode{notinbranch} clause}
The following examples illustrate the use of the \kcode{declare simd}
directive with the \kcode{inbranch} and \kcode{notinbranch} clauses. The
\kcode{notinbranch} clause informs the compiler that the function \ucode{foo} is
never called conditionally in the SIMD loop of the function \ucode{myaddint}. On
the other hand, the \kcode{inbranch} clause for the function \ucode{goo} indicates that
the function is always called conditionally in the SIMD loop inside
the function \ucode{myaddfloat}.
\cexample[4.0]{SIMD}{6}
\ffreeexample[4.0]{SIMD}{6}
In the code below, the function \ucode{fib()} is called in the main program and
also recursively called in the function \ucode{fib()} within an \bcode{if}
condition. The compiler creates a masked vector version and a non-masked vector
version for the function \ucode{fib()} while retaining the original scalar
version of the \ucode{fib()} function.
\cexample[4.0]{SIMD}{7}
\ffreeexample[4.0]{SIMD}{7}
%%% section
%\pagebreak
\section{Loop-Carried Lexical Forward Dependence}
\label{sec:SIMD_forward_dep}
\index{dependences!loop-carried lexical forward}
The following example tests the restriction on a SIMD loop with the loop-carried lexical forward-dependence. This dependence must be preserved for the correct execution of SIMD loops.
A loop can be vectorized even though the iterations are not completely independent when it has loop-carried dependences that are forward lexical dependences, indicated in the code below by the read of \ucode{A[j+1]} and the write to \ucode{A[j]} in C/C++ code (or \ucode{A(j+1)} and \ucode{A(j)} in Fortran). That is, the read of \ucode{A[j+1]} (or \ucode{A(j+1)} in Fortran) before the write to \ucode{A[j]} (or \ucode{A(j)} in Fortran) ordering must be preserved for each iteration in \ucode{j} for valid SIMD code generation.
This test assures that the compiler preserves the loop-carried lexical forward-dependence for generating a correct SIMD code.
\cexample[4.0]{SIMD}{8}
\ffreeexample[4.0]{SIMD}{8}

83
SIMD/linear_modifier.tex Normal file
View File

@ -0,0 +1,83 @@
%%% section
\section{\kcode{ref}, \kcode{val}, \kcode{uval} Modifiers for \kcode{linear} Clause}
\label{sec:linear_modifier}
\index{modifiers, linear@modifiers, \kcode{linear}!ref@\kcode{ref}}
\index{modifiers, linear@modifiers, \kcode{linear}!val@\kcode{val}}
\index{modifiers, linear@modifiers, \kcode{linear}!uval@\kcode{uval}}
\index{clauses!linear@\kcode{linear}}
\index{linear clause@\kcode{linear} clause}
When generating vector functions from \kcode{declare simd} directives,
it is important for a compiler to know the proper types of function arguments in
order to generate efficient code.
This is especially true for C++ reference types and Fortran arguments.
In the following example, the function \ucode{add_one2} has a C++ reference
parameter (or Fortran argument) \ucode{p}. Variable \ucode{p} gets incremented by 1 in the function.
The caller loop \ucode{i} in the main program passes
a variable \ucode{k} as a reference to the function \ucode{add_one2} call.
The \kcode{ref} modifier for the \kcode{linear} clause on the
\kcode{declare simd} directive specifies that the
reference-type parameter \ucode{p} is to match the property of the variable
\ucode{k} in the loop.
This use of reference type is equivalent to the second call to
\ucode{add_one2} with a direct passing of the array element \ucode{a[i]}.
In the example, the preferred vector
length 8 is specified for both the caller loop and the callee function.
When \kcode{linear(\ucode{p}: ref)} is applied to an argument passed by reference,
it tells the compiler that the addresses in its vector argument are consecutive,
and so the compiler can generate a single vector load or store instead of
a gather or scatter. This allows more efficient SIMD code to be generated with
fewer source changes.
\cppexample[5.2]{linear_modifier}{1}
\ffreeexample[5.2]{linear_modifier}{1}
%\clearpage
The following example is a variant of the above example. The function \ucode{add_one2}
in the C++ code includes an additional C++ reference parameter \ucode{i}.
The loop index \ucode{i} of the caller loop \ucode{i} in the main program
is passed as a reference to the function \ucode{add_one2} call.
The loop index \ucode{i} has a uniform address with
linear value of step 1 across SIMD lanes.
Thus, the \kcode{uval} modifier is used for the \kcode{linear} clause
to specify that the C++ reference-type parameter \ucode{i} is to match
the property of loop index \ucode{i}.
In the corresponding Fortran code the arguments \ucode{p} and
\ucode{i} in the routine \ucode{add_one2} are passed by reference.
Similar modifiers are used for these variables in the \kcode{linear} clauses
to match with the property at the caller loop in the main program.
When \kcode{linear(\ucode{i}: uval)} is applied to an argument passed by reference, it
tells the compiler that its addresses in the vector argument are uniform
so that the compiler can generate a scalar load or scalar store and create
linear values. This allows more efficient SIMD code to be generated with
fewer source changes.
\cppexample[5.2]{linear_modifier}{2}
\ffreeexample[5.2]{linear_modifier}{2}
In the following example, the function \ucode{func} takes arrays \ucode{x} and \ucode{y}
as arguments, and accesses the array elements referenced by the index \ucode{i}.
The caller loop \ucode{i} in the main program passes a linear copy of
the variable \ucode{k} to the function \ucode{func}.
The \kcode{val} modifier is used for the \kcode{linear} clause
in the \kcode{declare simd} directive for the function
\ucode{func} to specify that the argument \ucode{i} is to match the property of
the actual argument \ucode{k} passed in the SIMD loop.
Arrays \ucode{x} and \ucode{y} have uniform addresses across SIMD lanes.
When \kcode{linear(\ucode{i}: val,step(\ucode{1}))} is applied to an argument,
it tells the compiler that its addresses in the vector argument may not be
consecutive; however, their values are linear (with stride 1 here). When the value of \ucode{i} is used
in a subscript of an array reference (e.g., \ucode{x[i]}), the compiler can generate
a vector load or store instead of a gather or scatter. This allows more
efficient SIMD code to be generated with fewer source changes.
\cexample[5.2]{linear_modifier}{3}
\ffreeexample[5.2]{linear_modifier}{3}

View File

@ -1,9 +1,9 @@
/*
* @@name: SIMD.1c
* @@name: SIMD.1
* @@type: C
* @@compilable: yes
* @@linkable: no
* @@operation: compile
* @@expect: success
* @@version: omp_4.0
*/
void star( double *a, double *b, double *c, int n, int *ioff )
{

View File

@ -1,8 +1,8 @@
! @@name: SIMD.1f
! @@name: SIMD.1
! @@type: F-free
! @@compilable: yes
! @@linkable: no
! @@operation: compile
! @@expect: success
! @@version: omp_4.0
subroutine star(a,b,c,n,ioff_ptr)
implicit none
double precision :: a(*),b(*),c(*)

View File

@ -1,9 +1,9 @@
/*
* @@name: SIMD.2c
* @@name: SIMD.2
* @@type: C
* @@compilable: yes
* @@linkable: yes
* @@operation: link
* @@expect: success
* @@version: omp_4.0
*/
#include <stdio.h>

View File

@ -1,8 +1,8 @@
! @@name: SIMD.2f
! @@name: SIMD.2
! @@type: F-free
! @@compilable: yes
! @@linkable: yes
! @@operation: link
! @@expect: success
! @@version: omp_4.0
program main
implicit none
integer, parameter :: N=32
@ -19,15 +19,15 @@ program main
end program
function add1(a,b,fact) result(c)
!$omp declare simd(add1) uniform(fact)
implicit none
!$omp declare simd(add1) uniform(fact)
double precision :: a,b,fact, c
c = a + b + fact
end function
function add2(a,b,i, fact) result(c)
!$omp declare simd(add2) uniform(a,b,fact) linear(i:1)
implicit none
!$omp declare simd(add2) uniform(a,b,fact) linear(i:1)
integer :: i
double precision :: a(*),b(*),fact, c
c = a(i) + b(i) + fact

View File

@ -1,9 +1,9 @@
/*
* @@name: SIMD.3c
* @@name: SIMD.3
* @@type: C
* @@compilable: yes
* @@linkable: no
* @@operation: compile
* @@expect: success
* @@version: omp_4.0
*/
double work( double *a, double *b, int n )
{

View File

@ -1,8 +1,8 @@
! @@name: SIMD.3f
! @@name: SIMD.3
! @@type: F-free
! @@compilable: yes
! @@linkable: no
! @@operation: compile
! @@expect: success
! @@version: omp_4.0
subroutine work( a, b, n, sum )
implicit none
integer :: i, n

View File

@ -1,9 +1,9 @@
/*
* @@name: SIMD.4c
* @@name: SIMD.4
* @@type: C
* @@compilable: yes
* @@linkable: no
* @@operation: compile
* @@expect: success
* @@version: omp_4.0
*/
void work( float *b, int n, int m )
{

View File

@ -1,8 +1,8 @@
! @@name: SIMD.4f
! @@name: SIMD.4
! @@type: F-free
! @@compilable: yes
! @@linkable: no
! @@operation: compile
! @@expect: success
! @@version: omp_4.0
subroutine work( b, n, m )
implicit none
real :: b(n)

View File

@ -1,9 +1,9 @@
/*
* @@name: SIMD.5c
* @@name: SIMD.5
* @@type: C
* @@compilable: yes
* @@linkable: no
* @@operation: compile
* @@expect: success
* @@version: omp_4.0
*/
void work( double **a, double **b, double **c, int n )
{

View File

@ -1,14 +1,14 @@
! @@name: SIMD.5f
! @@name: SIMD.5
! @@type: F-free
! @@compilable: yes
! @@linkable: no
! @@operation: compile
! @@expect: success
! @@version: omp_4.0
subroutine work( a, b, c, n )
implicit none
integer :: i,j,n
double precision :: a(n,n), b(n,n), c(n,n), tmp
!$omp for simd collapse(2) private(tmp)
!$omp do simd collapse(2) private(tmp)
do j = 1,n
do i = 1,n
tmp = a(i,j) + b(i,j)

View File

@ -1,9 +1,9 @@
/*
* @@name: SIMD.6c
* @@name: SIMD.6
* @@type: C
* @@compilable: yes
* @@linkable: no
* @@operation: compile
* @@expect: success
* @@version: omp_4.0
*/
#pragma omp declare simd linear(p:1) notinbranch
int foo(int *p){

View File

@ -1,17 +1,17 @@
! @@name: SIMD.6f
! @@name: SIMD.6
! @@type: F-free
! @@compilable: yes
! @@linkable: no
! @@operation: compile
! @@expect: success
! @@version: omp_4.0
function foo(p) result(r)
!$omp declare simd(foo) notinbranch
implicit none
!$omp declare simd(foo) notinbranch
integer :: p, r
p = p + 10
r = p
end function foo
function myaddint(int *a, int *b, int n) result(r)
function myaddint(a, b, n) result(r)
implicit none
integer :: a(*), b(*), n, r
integer :: i
@ -19,15 +19,15 @@ function myaddint(int *a, int *b, int n) result(r)
!$omp simd
do i=1, n
a(i) = foo(b[i]) ! foo is not called under a condition
a(i) = foo(b(i)) ! foo is not called under a condition
end do
r = a(n)
end function myaddint
function goo(p) result(r)
!$omp declare simd(goo) inbranch
implicit none
!$omp declare simd(goo) inbranch
real :: p, r
p = p + 18.5
r = p

View File

@ -1,9 +1,9 @@
/*
* @@name: SIMD.7c
* @@name: SIMD.7
* @@type: C
* @@compilable: yes
* @@linkable: yes
* @@operation: run
* @@expect: success
* @@version: omp_4.0
*/
#include <stdio.h>
#include <stdlib.h>
@ -14,7 +14,7 @@ int a[N], b[N], c[N];
#pragma omp declare simd inbranch
int fib( int n )
{
if (n <= 2)
if (n <= 1)
return n;
else {
return fib(n-1) + fib(n-2);
@ -32,6 +32,6 @@ int main(void)
for (i=0; i < N; i++) {
a[i] = fib(b[i]);
}
printf("Done a[%d] = %d\n", N-1, a[N-1]);
printf("Done a[%d] = %d\n", N-1, a[N-1]); //Done a[44] = 701408733
return 0;
}

View File

@ -1,8 +1,8 @@
! @@name: SIMD.7f
! @@name: SIMD.7
! @@type: F-free
! @@compilable: yes
! @@linkable: yes
! @@operation: run
! @@expect: success
! @@version: omp_4.0
program fibonacci
implicit none
integer,parameter :: N=45
@ -21,15 +21,15 @@ program fibonacci
end do
write(*,*) "Done a(", N-1, ") = ", a(N-1)
! 44 1134903168
! 44 701408733
end program
recursive function fib(n) result(r)
!$omp declare simd(fib) inbranch
implicit none
!$omp declare simd(fib) inbranch
integer :: n, r
if (n <= 2) then
if (n <= 1) then
r = n
else
r = fib(n-1) + fib(n-2)

View File

@ -1,9 +1,9 @@
/*
* @@name: SIMD.8c
* @@name: SIMD.8
* @@type: C
* @@compilable: yes
* @@linkable: yes
* @@operation: run
* @@expect: success
* @@version: omp_4.0
*/
#include <stdio.h>
#include <math.h>
@ -14,8 +14,9 @@ float A[1000];
float do_work(float *arr)
{
float pri;
int i;
#pragma omp simd lastprivate(pri)
for (int i = 0; i < 999; ++i) {
for (i = 0; i < 999; ++i) {
int j = P[i];
pri = 0.5f;
@ -31,8 +32,9 @@ float do_work(float *arr)
int main(void)
{
float pri, arr[1000];
int i;
for (int i = 0; i < 1000; ++i) {
for (i = 0; i < 1000; ++i) {
P[i] = i;
A[i] = i * 1.5f;
arr[i] = i * 1.8f;

View File

@ -1,8 +1,8 @@
! @@name: SIMD.8f
! @@name: SIMD.8
! @@type: F-free
! @@compilable: yes
! @@linkable: yes
! @@operation: run
! @@expect: success
! @@version: omp_4.0
module work
integer :: P(1000)

View File

@ -0,0 +1,44 @@
/*
* @@name: linear_modifier.1
* @@type: C++
* @@operation: run
* @@expect: success
* @@version: omp_5.2
*/
#include <stdio.h>
#define NN 1023
int a[NN];
// add_one2: increment the integer referenced by p by 1.
//
// The declare simd directive below requests a SIMD version of this function
// with a preferred vector length of 8 (simdlen(8)). The linear(p: ref) clause
// applies the ref modifier to the reference parameter p, so the referenced
// addresses (not the values) are treated as linear across SIMD lanes.
#pragma omp declare simd linear(p: ref) simdlen(8)
void add_one2(int& p)
{
p += 1;
}
// Driver: fills a[] with a[i] = i, bumps every element twice inside a SIMD
// loop, then verifies that every element equals i+2.
// Returns 0 and prints "passed" on success; returns 1 and prints "failed"
// at the first mismatch.
int main(void)
{
int i;
// p walks through a[] in step with the loop index i.
int* p = a;
for (i = 0; i < NN; i++) {
a[i] = i;
}
// linear(p): p advances by one element per iteration (see p++ below).
// simdlen(8) matches the simdlen on the declare simd directive of add_one2.
#pragma omp simd linear(p) simdlen(8)
for (i = 0; i < NN; i++) {
// k is a reference to the element p currently points at.
int& k = *p;
// First increment: element passed through the reference k.
add_one2(k);
// Second increment: element passed directly as a[i].
add_one2(a[i]);
p++;
}
// Each element was incremented twice, so the expected value is i+2.
for (i = 0; i < NN; i++) {
if (a[i] != i+2) {
printf("failed\n");
return 1;
}
}
printf("passed\n");
return 0;
}

View File

@ -0,0 +1,47 @@
! @@name: linear_modifier.1
! @@type: F-free
! @@operation: run
! @@expect: success
! @@version: omp_5.2
! Module m: holds the shared test array a(NN) and the SIMD-enabled
! routine add_one2 that the main program calls.
module m
integer, parameter :: NN = 1023
integer :: a(NN)
contains
! add_one2: increment the integer argument p by 1.
! The declare simd directive requests a SIMD version of this routine with
! a preferred vector length of 8; linear(p: ref) applies the ref modifier
! to the dummy argument p.
subroutine add_one2(p)
implicit none
!$omp declare simd(add_one2) linear(p: ref) simdlen(8)
integer :: p
p = p + 1
end subroutine
end module
program main
   ! Driver: fills a(i) = i, increments every element twice inside a SIMD
   ! loop through add_one2, then verifies that every element equals i+2.
   ! Prints "passed" on success. On the first mismatch it prints "failed"
   ! and terminates with ERROR STOP, so a run-based test harness can
   ! detect the failure (a plain STOP is normal termination).
   use m
   implicit none
   integer :: i, p

   do i = 1, NN
      a(i) = i
   end do

   ! p tracks the loop index i; linear(p) declares that it advances by one
   ! per iteration (see p = p + 1 below). simdlen(8) matches the simdlen
   ! given on the declare simd directive of add_one2.
   p = 1
   !$omp simd linear(p) simdlen(8)
   do i = 1, NN
      ! First increment: element passed through the associate name k.
      associate(k => a(p))
         call add_one2(k)
      end associate
      ! Second increment: element passed directly as a(i).
      call add_one2(a(i))
      p = p + 1
   end do

   ! Each element was incremented twice, so the expected value is i+2.
   do i = 1, NN
      if (a(i) /= i+2) then
         print *, "failed"
         error stop
      endif
   end do
   print *, "passed"
end program

View File

@ -0,0 +1,43 @@
/*
* @@name: linear_modifier.2
* @@type: C++
* @@operation: run
* @@expect: success
* @@version: omp_5.2
*/
#include <stdio.h>
#define NN 1023
int a[NN];
// add_one2: add the value referenced by i to the integer referenced by p.
//
// The declare simd directive requests a SIMD version of this function.
// linear(p: ref) applies the ref modifier to the reference parameter p;
// linear(i: uval) applies the uval modifier to the const reference
// parameter i.
#pragma omp declare simd linear(p: ref) linear(i: uval)
void add_one2(int& p, const int& i)
{
p += i;
}
// Driver: fills a[] with a[i] = i, adds the loop index to every element
// inside a SIMD loop, then verifies that every element equals i*2.
// Returns 0 and prints "passed" on success; returns 1 and prints "failed"
// at the first mismatch.
int main(void)
{
int i;
// p walks through a[] in step with the loop index i.
int* p = a;
for (i = 0; i < NN; i++) {
a[i] = i;
}
// linear(p): p advances by one element per iteration (see p++ below).
#pragma omp simd linear(p)
for (i = 0; i < NN; i++) {
// k is a reference to the element p currently points at.
int& k = *p;
// The loop index i is passed by (const) reference to add_one2.
add_one2(k, i);
p++;
}
// Each element started at i and had i added once, so it should be i*2.
for (i = 0; i < NN; i++) {
if (a[i] != i*2) {
printf("failed\n");
return 1;
}
}
printf("passed\n");
return 0;
}

Some files were not shown because too many files have changed in this diff Show More