Mirror of https://github.com/OpenMP/Examples.git (synced 2025-04-03 13:21:33 +01:00)

Commit eaec9ede64 (parent 156a12ca09): synced with v5.0.0 of the examples-internal repo
@ -1,3 +1,8 @@
|
||||
[02-Feb-2018] Note
|
||||
This "Changes.log" is no longer updated. Please use History.tex and
|
||||
the git log messages for changes.
|
||||
|
||||
|
||||
[20-May-2016] Version 4.5.0
|
||||
Changes from 4.0.2ltx
|
||||
|
||||
|
@ -33,7 +33,7 @@ directive. Clauses provide argument specifications (\code{linear},
|
||||
\code{uniform}, and \code{aligned}), a requested vector length
|
||||
(\code{simdlen}), and designate whether the function is always/never
|
||||
called conditionally in a loop (\code{branch}/\code{inbranch}).
|
||||
The latter is for optimizing performance.
|
||||
|
||||
Also, the \code{simd} construct has been combined with the worksharing loop
|
||||
constructs (\code{for simd} and \code{do simd}) to enable simultaneous thread
|
||||
|
@ -44,7 +44,7 @@ subsection of the OpenMP Specifications document.
|
||||
\bigskip
|
||||
DATA-MAPPING ATTRIBUTES
|
||||
|
||||
The \code{map} clause on a device construct explicitly specifies how the list items in
|
||||
the clause are mapped from the encountering task's data environment (on the host)
|
||||
to the corresponding item in the device data environment (on the device).
|
||||
The common \plc{list items} are arrays, array sections, scalars, pointers, and
|
||||
@ -55,7 +55,7 @@ within the list or block of a \code{declare target} directive. Also, a C/C++ poi
|
||||
is mapped as a zero-length array section, as is a C++ variable that is a reference to a pointer.
|
||||
% Waiting for response from Eric on this.
|
||||
|
||||
Without explicit mapping, non-scalar and non-pointer variables within the scope of the \code{target}
|
||||
construct are implicitly mapped with a \plc{map-type} of \code{tofrom}.
|
||||
Without explicit mapping, scalar variables within the scope of the \code{target}
|
||||
construct are not mapped, but have an implicit firstprivate data-sharing
|
||||
|
@ -2,44 +2,71 @@
|
||||
\chapter{Memory Model}
|
||||
\label{chap:memory_model}
|
||||
|
||||
In this chapter, examples illustrate race conditions on access to variables with
|
||||
shared data-sharing attributes. A race condition can exist when two
|
||||
or more threads are involved in accessing a variable in which not all
|
||||
of the accesses are reads; that is, a WaR, RaW or WaW condition
|
||||
exists (R=read, a=after, W=write). A RaR does not produce a race condition.
|
||||
Ensuring thread execution order at
|
||||
the processor level is not enough to avoid race conditions, because the
|
||||
local storage at the processor level (registers, caches, etc.)
|
||||
must be synchronized so that a consistent view of the variable in the
|
||||
memory hierarchy can be seen by the threads accessing the variable.
|
||||
OpenMP provides a shared-memory model that allows all threads on a given
|
||||
device shared access to \emph{memory}. For a given OpenMP region that may be
|
||||
executed by more than one thread or SIMD lane, variables in memory may be
|
||||
\emph{shared} or \emph{private} with respect to those threads or SIMD lanes. A
|
||||
variable's data-sharing attribute indicates whether it is shared (the
|
||||
\emph{shared} attribute) or private (the \emph{private}, \emph{firstprivate},
|
||||
\emph{lastprivate}, \emph{linear}, and \emph{reduction} attributes) in the data
|
||||
environment of an OpenMP region. While private variables in an OpenMP region
|
||||
are new copies of the original variable (with same name) that may then be
|
||||
concurrently accessed or modified by their respective threads or SIMD lanes, a
|
||||
shared variable in an OpenMP region is the same as the variable of the same
|
||||
name in the enclosing region. Concurrent accesses or modifications to a
|
||||
shared variable may therefore require synchronization to avoid data races.
|
||||
|
||||
OpenMP provides a shared-memory model which allows all threads access
|
||||
to \plc{memory} (shared data). Each thread also has exclusive
|
||||
access to \plc{threadprivate memory} (private data). A private
|
||||
variable referenced in an OpenMP directive's structured block is a
|
||||
new version of the original variable (with the same name) for each
|
||||
task (or SIMD lane) within the code block. A private variable is
|
||||
initially undefined (except for variables in \code{firstprivate}
|
||||
and \code{linear} clauses), and the original variable value is
|
||||
unaltered by assignments to the private variable, (except for
|
||||
\code{reduction}, \code{lastprivate} and \code{linear} clauses).
|
||||
OpenMP's memory model also includes a \emph{temporary view} of memory that is
|
||||
associated with each thread. Two different threads may see different values for
|
||||
a given variable in their respective temporary views. Threads may employ flush
|
||||
operations for the purposes of making their temporary view of a variable
|
||||
consistent with the value of the variable in memory. The effect of a given
|
||||
flush operation is characterized by its flush properties -- some combination of
|
||||
\emph{strong}, \emph{release}, and \emph{acquire} -- and, for \emph{strong}
|
||||
flushes, a \emph{flush-set}.
|
||||
|
||||
Private variables in an outer \code{parallel} region can be
|
||||
shared by implicit tasks of an inner \code{parallel} region
|
||||
(with a \code{shared} clause on the inner \code{parallel} directive).
|
||||
Likewise, a private variable may be shared in the region of an
|
||||
explicit \code{task} (through a \code{shared} clause).
|
||||
A \emph{strong} flush will force consistency between the temporary view and the
|
||||
memory for all variables in its \emph{flush-set}. Furthermore all strong flushes in a
|
||||
program that have intersecting flush-sets will execute in some total order, and
|
||||
within a thread strong flushes may not be reordered with respect to other
|
||||
memory operations on variables in its flush-set. \emph{Release} and
|
||||
\emph{acquire} flushes operate in pairs. A release flush may ``synchronize''
|
||||
with an acquire flush, and when it does so the local memory operations that
|
||||
precede the release flush will appear to have been completed before the local
|
||||
memory operations on the same variables that follow the acquire flush.
|
||||
|
||||
Flush operations arise from explicit \code{flush} directives, implicit
|
||||
\code{flush} directives, and also from the execution of \code{atomic}
|
||||
constructs. The \code{flush} directive forces a consistent view of local
|
||||
variables of the thread executing the \code{flush}. When a list is supplied on
|
||||
the directive, only the items (variables) in the list are guaranteed to be
|
||||
flushed. Implied flushes exist at prescribed locations of certain constructs.
|
||||
For the complete list of these locations and associated constructs, please
|
||||
refer to the \plc{flush Construct} section of the OpenMP Specifications
|
||||
document.
|
||||
|
||||
In this chapter, examples illustrate how race conditions may arise for accesses
|
||||
to variables with a \plc{shared} data-sharing attribute when flush operations
|
||||
are not properly employed. A race condition can exist when two or more threads
|
||||
are involved in accessing a variable in which not all of the accesses are
|
||||
reads; that is, a WaR, RaW or WaW condition exists (R=read, a=after, W=write).
|
||||
A RaR does not produce a race condition. In particular, a data race will arise
|
||||
when conflicting accesses do not have a well-defined \emph{completion order}.
|
||||
The existence of data races in OpenMP programs results in undefined behavior,
|
||||
and so they should generally be avoided for programs to be correct. The
|
||||
completion order of accesses to a shared variable is guaranteed in OpenMP
|
||||
through a set of memory consistency rules that are described in the \plc{OpenMP
|
||||
Memory Consistency} section of the OpenMP Specifications document.
|
||||
|
||||
%This chapter also includes examples that exhibit non-sequentially consistent
|
||||
%(\emph{non-SC}) behavior. Sequential consistency (\emph{SC}) is the desirable
|
||||
%property that the results of a multi-threaded program are as if all operations
|
||||
%are performed in some total order, consistent with the program order of
|
||||
%operations performed by each thread. OpenMP guarantees that a correct program
|
||||
%(i.e. a program that does not have a data race) will exhibit SC behavior
|
||||
%so long as the only \code{atomic} constructs it uses are SC atomic directives.
|
||||
|
||||
|
||||
The \code{flush} directive forces a consistent view of local variables
|
||||
of the thread executing the \code{flush}.
|
||||
When a list is supplied on the directive, only the items (variables)
|
||||
in the list are guaranteed to be flushed.
|
||||
|
||||
Implied flushes exist at prescribed locations of certain constructs.
|
||||
For the complete list of these locations and associated constructs,
|
||||
please refer to the \plc{flush Construct} section of the OpenMP
|
||||
Specifications document.
|
||||
|
||||
% The following table lists construct in which implied flushes exist, and the
|
||||
% location of their execution.
|
||||
@ -102,4 +129,4 @@ Specifications document.
|
||||
% specific storage location accessed atomically (specified as the \plc{x} variable
|
||||
% in \plc{atomic Construct} subsection of the OpenMP Specifications document).
|
||||
|
||||
% Examples 1-3 show the difficulty of synchronizing threads through \code{flush} and \code{atomic} directives.
|
||||
|
@ -24,7 +24,7 @@ That is, inclusion of one of the \plc{construct-type-clause} names \code{parall
|
||||
activates the corresponding region.
|
||||
The \code{cancel} construct is activated by the first encountering thread, and it
|
||||
continues execution at the end of the named region.
|
||||
The \code{cancel} construct is also a cancellation point for any other thread of the team
|
||||
to also continue execution at the end of the named region.
|
||||
|
||||
Also, once the specified region has been activated for cancellation, any thread that encounters
|
||||
|
@ -19,10 +19,15 @@ mechanism.
|
||||
On a finer scale the \code{atomic} construct allows only a single thread at
|
||||
a time to have atomic access to a storage location involving a single read,
|
||||
write, update or capture statement, and a limited number of combinations
|
||||
when specifying the \code{capture} \plc{atomic-clause} clause. The
|
||||
\plc{atomic-clause} clause is required for some expression statements, but is
|
||||
not required for \code{update} statements. The \plc{memory-order} clause can be
|
||||
used to specify the degree of memory ordering enforced by an \code{atomic}
|
||||
construct. From weakest to strongest, they are \code{relaxed} (the default),
|
||||
acquire and/or release clauses (specified with \code{acquire}, \code{release},
|
||||
or \code{acq\_rel}), and \code{seq\_cst}. Please see the details in the
|
||||
\plc{atomic Construct} subsection of the \plc{Directives} chapter in the OpenMP
|
||||
Specifications document.
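
As an illustrative sketch only (not one of the numbered examples of this document; the
variable names are ours), an \code{atomic} construct with \plc{memory-order} clauses
might be written as:

\begin{verbatim}
   int counter = 0;

   void sketch(void)
   {
      #pragma omp parallel
      {
         int v;

         /* relaxed (the default): atomicity only, no additional ordering */
         #pragma omp atomic update relaxed
         counter++;

         /* seq_cst: strongest memory ordering */
         #pragma omp atomic read seq_cst
         v = counter;
      }
   }
\end{verbatim}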
|
||||
|
||||
% The following three sentences were stolen from the spec.
|
||||
The \code{ordered} construct either specifies a structured block in a loop,
|
||||
@ -37,15 +42,22 @@ iteration vector argument (vec) to indicate the iteration that satisfies the
|
||||
dependence. The \code{depend} clause with a \code{source}
|
||||
\plc{dependence-type} specifies dependence satisfaction.
|
||||
|
||||
The \code{flush} directive is a stand-alone construct that forces a thread's
|
||||
temporary local storage (view) of a variable to memory where a consistent view
|
||||
of the variable storage can be accessed. When the construct is used without
|
||||
a variable list, all the locally thread-visible data as defined by the
|
||||
base language are flushed. A construct with a list applies the flush
|
||||
operation only to the items in the list. The \code{flush} construct also
|
||||
effectively ensures that no memory (load or store) operation for
|
||||
the variable set (list items, or default set) may be reordered across
|
||||
the \code{flush} directive.
|
||||
The \code{flush} directive is a stand-alone construct for enforcing consistency
|
||||
between a thread's view of memory and the view of memory for other threads (see
|
||||
the Memory Model chapter of this document for more details). When the construct
|
||||
is used with an explicit variable list, a \plc{strong flush} that forces a
|
||||
thread's temporary view of memory to be consistent with the actual memory is
|
||||
applied to all listed variables. When the construct is used without an explicit
|
||||
variable list and without a \plc{memory-order} clause, a strong flush is
|
||||
applied to all locally thread-visible data as defined by the base language, and
|
||||
additionally the construct provides both acquire and release memory ordering
|
||||
semantics. When an explicit variable list is not present and a
|
||||
\plc{memory-order} clause is present, the construct provides acquire and/or
|
||||
release memory ordering semantics according to the \plc{memory-order} clause,
|
||||
but no strong flush is performed. A resulting strong flush that applies to a
|
||||
set of variables effectively ensures that no memory (load or store)
|
||||
operation for the affected variables may be reordered across the \code{flush}
|
||||
directive.
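
A minimal sketch of these three forms (ours, not an official example; the variables are
illustrative) is:

\begin{verbatim}
   int a, b;

   void flush_forms(void)
   {
      /* strong flush applied only to the listed variables */
      #pragma omp flush(a, b)

      /* strong flush of all thread-visible data, with both
         acquire and release memory ordering semantics       */
      #pragma omp flush

      /* release (or acquire/acq_rel) ordering only;
         no strong flush is performed                        */
      #pragma omp flush release
   }
\end{verbatim}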
|
||||
|
||||
General-purpose routines provide mutual exclusion semantics through locks,
|
||||
represented by lock variables.
|
||||
|
@ -8,6 +8,8 @@ to assure the compiler that the loop can be vectorized.
|
||||
\cexample{SIMD}{1}
|
||||
|
||||
\ffreeexample{SIMD}{1}
|
||||
|
||||
\clearpage
|
||||
|
||||
|
||||
When a function can be inlined within a loop the compiler has an opportunity to
|
||||
@ -24,7 +26,7 @@ respectively. The \plc{add3} C function uses dereferencing.
|
||||
The \code{declare} \code{simd} constructs also illustrate the use of
|
||||
\code{uniform} and \code{linear} clauses. The \code{uniform(fact)} clause
|
||||
indicates that the variable \plc{fact} is invariant across the SIMD lanes. In
|
||||
the \plc{add2} function \plc{a} and \plc{b} are included in the \code{uniform}
|
||||
list because the C pointer and the Fortran array references are constant. The
|
||||
\plc{i} index used in the \plc{add2} function is included in a \code{linear}
|
||||
clause with a constant-linear-step of 1, to guarantee a unity increment of the
|
||||
@ -42,7 +44,7 @@ variable.
|
||||
|
||||
\ffreeexample{SIMD}{2}
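
As a compact sketch of the clauses discussed above (ours; the \plc{add1} and \plc{add2}
functions here only approximate those of the included example), a C version might look like:

\begin{verbatim}
   #pragma omp declare simd uniform(fact)
   double add1(double a, double b, double fact)
   { return a + b + fact; }

   #pragma omp declare simd uniform(a, b, fact) linear(i:1)
   double add2(double *a, double *b, int i, double fact)
   { return a[i] + b[i] + fact; }

   void work(double *a, double *b, double *c, int n)
   {
      #pragma omp simd
      for (int i = 0; i < n; i++)
         c[i] = add1(a[i], b[i], 1.0) + add2(a, b, i, 1.0);
   }
\end{verbatim}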
|
||||
|
||||
|
||||
\pagebreak
|
||||
A thread that encounters a SIMD construct executes a vectorized code of the
|
||||
iterations. Similar to the concerns of a worksharing loop a loop vectorized
|
||||
with a SIMD construct must assure that temporary and reduction variables are
|
||||
@ -55,6 +57,7 @@ construct.
|
||||
\ffreeexample{SIMD}{3}
|
||||
|
||||
|
||||
\pagebreak
|
||||
A \code{safelen(N)} clause in a \code{simd} construct assures the compiler that
|
||||
there are no loop-carried dependencies for vectors of size \plc{N} or below. If
|
||||
the \code{safelen} clause is not specified, then the default safelen value is
|
||||
@ -69,7 +72,7 @@ than 16, the behavior is undefined.
|
||||
|
||||
\ffreeexample{SIMD}{4}
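
The following sketch (ours; the dependence distance of 16 is illustrative) shows the kind
of loop the \code{safelen} clause is meant for:

\begin{verbatim}
   void work(float *b, int n)
   {
      /* b[i] depends on b[i-16]: vector lengths up to 16 are safe,
         longer vectors are not                                     */
      #pragma omp simd safelen(16)
      for (int i = 16; i < n; i++)
         b[i] = b[i-16] * 0.5f;
   }
\end{verbatim}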
|
||||
|
||||
|
||||
\pagebreak
|
||||
The following SIMD construct instructs the compiler to collapse the \plc{i} and
|
||||
\plc{j} loops into a single SIMD loop in which SIMD chunks are executed by
|
||||
threads of the team. Within the workshared loop chunks of a thread, the SIMD
|
||||
@ -110,6 +113,7 @@ version of the \plc{fib()} function.
|
||||
|
||||
|
||||
%%% section
|
||||
\pagebreak
|
||||
\section{Loop-Carried Lexical Forward Dependence}
|
||||
\label{sec:SIMD_forward_dep}
|
||||
|
||||
|
Examples_acquire_release.tex (new file, 141 lines)
@ -0,0 +1,141 @@
|
||||
\pagebreak
|
||||
\section{Synchronization Based on Acquire/Release Semantics}
|
||||
\label{sec:acquire_and_release_semantics}
|
||||
|
||||
%OpenMP 5.0 introduced ``release/acquire'' memory ordering semantics to the
|
||||
%specification. The memory ordering behavior of OpenMP constructs and routines
|
||||
%that permit two threads to synchronize with each other are defined in terms of
|
||||
%\textit{release flushes} and \textit{acquire flushes}, where a release flush
|
||||
%must occur at the source of the synchronization and an acquire flush must occur
|
||||
%at the sink of the synchronization. Flushes resulting from a \code{flush}
|
||||
%directive without a list may function as a release flush, an acquire flush, or
|
||||
%both a release and acquire flush. Flushes implied on entry to or exit from an
|
||||
%atomic operation (specified by an \code{atomic} construct) may also function as
|
||||
%a release flush or an acquire flush, depending on if a memory ordering clause
|
||||
%appears on a construct. Flushes implied by other OpenMP constructs or routines
|
||||
%also function as either a release flush or an acquire flush, according to the
|
||||
%synchronization semantics of the construct.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%
|
||||
|
||||
As explained in the Memory Model chapter of this document, a flush operation
|
||||
may be an \emph{acquire flush} and/or a \emph{release flush}, and OpenMP 5.0
|
||||
defines acquire/release semantics in terms of these fundamental flush
|
||||
operations. For any synchronization between two threads that is specified by
|
||||
OpenMP, a release flush logically occurs at the source of the synchronization
|
||||
and an acquire flush logically occurs at the sink of the synchronization.
|
||||
OpenMP 5.0 added memory ordering clauses -- \code{acquire}, \code{release}, and
|
||||
\code{acq\_rel} -- to the \code{flush} and \code{atomic} constructs for
|
||||
explicitly requesting acquire/release semantics. Furthermore, implicit flushes
|
||||
for all OpenMP constructs and runtime routines that synchronize OpenMP threads
|
||||
in some manner were redefined in terms of synchronizing release and acquire
|
||||
flushes to avoid the requirement of strong memory fences (see the \plc{Flush
|
||||
Synchronization and Happens Before} and \plc{Implicit Flushes} sections of the
|
||||
OpenMP Specifications document).
|
||||
|
||||
The examples that follow in this section illustrate how acquire and release
|
||||
flushes may be employed, implicitly or explicitly, for synchronizing threads. A
|
||||
\code{flush} directive without a list and without any memory ordering clause
|
||||
can also function as both an acquire and release flush for facilitating thread
|
||||
synchronization. Flushes implied on entry to, or exit from, an atomic
|
||||
operation (specified by an \code{atomic} construct) may function as an acquire
|
||||
flush or a release flush if a memory ordering clause appears on the construct.
|
||||
On entry to and exit from a \code{critical} construct there is now an implicit
|
||||
acquire flush and release flush, respectively.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%
|
||||
|
||||
The first example illustrates how the release and acquire flushes implied by a
|
||||
\code{critical} region guarantee a value written by the first thread is visible
|
||||
to a read of the value on the second thread. Thread 0 writes to \plc{x} and
|
||||
then executes a \code{critical} region in which it writes to \plc{y}; the write
|
||||
to \plc{x} happens before the execution of the \code{critical} region,
|
||||
consistent with the program order of the thread. Meanwhile, thread 1 executes a
|
||||
\code{critical} region in a loop until it reads a non-zero value from
|
||||
\plc{y} in the \code{critical} region, after which it prints the value of
|
||||
\plc{x}; again, the execution of the \code{critical} regions happens before the
|
||||
read from \plc{x} based on the program order of the thread. The \code{critical}
|
||||
regions executed by the two threads execute in a serial manner, with a
|
||||
pair-wise synchronization from the exit of one \code{critical} region to the
|
||||
entry to the next \code{critical} region. These pair-wise synchronizations
|
||||
result from the implicit release flushes that occur on exit from
|
||||
\code{critical} regions and the implicit acquire flushes that occur on entry to
|
||||
\code{critical} regions; hence, the execution of each \code{critical} region in
|
||||
the sequence happens before the execution of the next \code{critical} region.
|
||||
A ``happens before'' order is therefore established between the assignment to \plc{x}
|
||||
by thread 0 and the read from \plc{x} by thread 1, and so thread 1 must see that
|
||||
\plc{x} equals 10.
|
||||
|
||||
\pagebreak
|
||||
\cexample{acquire_release}{1}
|
||||
\ffreeexample{acquire_release}{1}
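
For readers without the example files at hand, a minimal sketch of the pattern just
described (ours, condensed from the included example) is:

\begin{verbatim}
   #include <stdio.h>
   #include <omp.h>

   int main(void)
   {
      int x = 0, y = 0;
      #pragma omp parallel num_threads(2)
      {
         if (omp_get_thread_num() == 0) {
            x = 10;
            #pragma omp critical
            { y = 1; }                 /* release flush on exit  */
         } else {
            int tmp = 0;
            while (tmp == 0) {
               #pragma omp critical
               { tmp = y; }            /* acquire flush on entry */
            }
            printf("x = %d\n", x);     /* must print x = 10      */
         }
      }
      return 0;
   }
\end{verbatim}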
|
||||
|
||||
In the second example, the \code{critical} constructs are exchanged with
|
||||
\code{atomic} constructs that have \textit{explicit} memory ordering specified. When the
|
||||
atomic read operation on thread 1 reads a non-zero value from \plc{y}, this
|
||||
results in a release/acquire synchronization that in turn implies that the
|
||||
assignment to \plc{x} on thread 0 happens before the read of \plc{x} on thread
|
||||
1. Therefore, thread 1 will print ``x = 10''.
|
||||
|
||||
\cexample{acquire_release}{2}
|
||||
\ffreeexample{acquire_release}{2}
|
||||
|
||||
\pagebreak
|
||||
In the third example, \code{atomic} constructs that specify relaxed atomic
|
||||
operations are used with explicit \code{flush} directives to enforce memory
|
||||
ordering between the two threads. The explicit \code{flush} directive on thread
|
||||
0 must specify a release flush and the explicit \code{flush} directive on
|
||||
thread 1 must specify an acquire flush to establish a release/acquire
|
||||
synchronization between the two threads. The \code{flush} and \code{atomic}
|
||||
constructs encountered by thread 0 can be replaced by the \code{atomic} construct used in
|
||||
Example 2 for thread 0, and similarly the \code{flush} and \code{atomic}
|
||||
constructs encountered by thread 1 can be replaced by the \code{atomic}
|
||||
construct used in Example 2 for thread 1.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%3
|
||||
%{\color{violet}
|
||||
%For this example, the implicit release flush of the \code{flush} directive for thread 0 creates
|
||||
%a source synchronization with release memory ordering, while the implicit release flush of the
|
||||
%\code{flush} directive for thread 1 creates a sink synchronization with acquire memory ordering.
|
||||
%The code performs the same thread synchronization of the previous example, with only a slight
|
||||
%coding change.
|
||||
%The explicit \code{release} and \code{acquire} clauses of the atomic construct has been
|
||||
%replaced with implicit release and aquire flushes of explicit \code{flush} constructs.
|
||||
%(Here, the \code{atomic} constructs have \plc{relaxed} operations.)
|
||||
%}
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%3
|
||||
|
||||
\cexample{acquire_release}{3}
|
||||
\ffreeexample{acquire_release}{3}
|
||||
|
||||
Example 4 will fail to order the write to \plc{x} on thread 0 before the read
|
||||
from \plc{x} on thread 1. Importantly, the implicit release flush on exit from
|
||||
the \code{critical} region will not synchronize with the acquire flush that
|
||||
occurs on the atomic read operation performed by thread 1. This is because
|
||||
implicit release flushes that occur on a given construct may only synchronize
|
||||
with implicit acquire flushes on a compatible construct (and vice-versa) that
|
||||
internally makes use of the same synchronization variable. For a
|
||||
\code{critical} construct, this might correspond to a \plc{lock} object that
|
||||
is used by a given implementation (for the synchronization semantics of other
|
||||
constructs due to implicit release and acquire flushes, refer to the \plc{Implicit
|
||||
Flushes} section of the OpenMP Specifications document). Either an explicit \code{flush}
|
||||
directive that provides a release flush (i.e., a flush without a list that does
|
||||
not have the \code{acquire} clause) must be specified between the
|
||||
\code{critical} construct and the atomic write, or an atomic operation that
|
||||
modifies \plc{y} and provides release semantics must be specified.
|
||||
|
||||
%{\color{violet}
|
||||
%In the following example synchronization between the acquire flush of the atomic read
|
||||
%of \plc{y} by thread 1 is not synchronized with the relaxed atomic construct that
|
||||
%assigns a value to \plc{y} by thread 0.
|
||||
%While there is a \code{critical} construct and implicit release flush
|
||||
%for the \plc{x} assignment of thread 0,
|
||||
%a release flush association with the \plc{y} assignment of
|
||||
%thread 0 is not formed. A \code{release} or \code{acq-rel} clause on the
|
||||
%\code{atomic write} construct or a \code{flush} directive after the assignment to \plc{y}
|
||||
%will form a synchronization and will guarantee memory ordering of the x and y assignments
|
||||
%by thread 0.
|
||||
%}
|
||||
|
||||
\cexample{acquire_release_broke}{4}
|
||||
\ffreeexample{acquire_release_broke}{4}
|
Examples_affinity_display.tex (new file, 104 lines)
@ -0,0 +1,104 @@
|
||||
\section{Affinity Display}
|
||||
\label{sec:affinity_display}
|
||||
|
||||
The following examples illustrate ways to display thread affinity.
|
||||
Automatic display of affinity can be invoked by setting
|
||||
the \code{OMP\_DISPLAY\_AFFINITY} environment variable to \code{TRUE}.
|
||||
The format of the output can be customized by setting the
|
||||
\code{OMP\_AFFINITY\_FORMAT} environment variable to an appropriate string.
|
||||
Also, there are API calls for the user to display thread affinity
|
||||
at selected locations within code.
|
||||
|
||||
For the first example the environment variable \code{OMP\_DISPLAY\_AFFINITY} has been
|
||||
set to \code{TRUE}, and execution occurs on an 8-core system with \code{OMP\_NUM\_THREADS} set to 8.
|
||||
|
||||
The affinity for the master thread is reported through a call to the API
|
||||
\code{omp\_display\_affinity()} routine. For default affinity settings
|
||||
the report shows that the master thread can execute on any of the cores.
|
||||
In the following parallel region the affinity for each of the team threads is reported
|
||||
automatically since the \code{OMP\_DISPLAY\_AFFINITY} environment variable has been set
|
||||
to \code{TRUE}.
|
||||
|
||||
These two reports are often useful (as in hybrid codes using both MPI and OpenMP)
|
||||
to observe the affinity (for an MPI task) before the parallel region,
|
||||
and during an OpenMP parallel region. Note: the next parallel region uses the
|
||||
same number of threads as in the previous parallel region and affinities are
|
||||
not changed, so affinity is NOT reported.
|
||||
|
||||
In the last parallel region, the thread affinities are reported
|
||||
because the thread affinity has changed.
|
||||
|
||||
\cexample{affinity_display}{1}
|
||||
|
||||
\ffreeexample{affinity_display}{1}
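
A minimal sketch of the calls involved (ours; the included example is more complete) is:

\begin{verbatim}
   #include <omp.h>

   int main(void)
   {
      /* report the master thread's affinity; NULL selects the
         current affinity format                                */
      omp_display_affinity(NULL);

      /* with OMP_DISPLAY_AFFINITY=TRUE the runtime reports each
         thread's affinity automatically for this region        */
      #pragma omp parallel
      {
         /* ... work ... */
      }
      return 0;
   }
\end{verbatim}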
|
||||
|
||||
|
||||
In the following example 2 threads are forked, and each executes on a socket. Next,
|
||||
a nested parallel region runs half of the available threads on each socket.
|
||||
|
||||
These OpenMP environment variables have been set:
|
||||
|
||||
\begin{compactitem}
|
||||
\item \code{OMP\_PROC\_BIND}="TRUE"
|
||||
\item \code{OMP\_NUM\_THREADS}="2,4"
|
||||
\item \code{OMP\_PLACES}="\{0,2,4,6\},\{1,3,5,7\}"
|
||||
\item \code{OMP\_AFFINITY\_FORMAT}="nest\_level= \%L, parent\_thrd\_num= \%a, thrd\_num= \%n, thrd\_affinity= \%A"
|
||||
\end{compactitem}
|
||||
|
||||
where the numbers correspond to core ids for the system. Note, \code{OMP\_DISPLAY\_AFFINITY} is not
|
||||
set and is \code{FALSE} by default. This example shows how to use API routines to
|
||||
perform affinity display operations.
|
||||
|
||||
For each of the two first-level threads the \code{OMP\_PLACES} variable specifies
|
||||
a place with all the core-ids of the socket (\{0,2,4,6\} for one thread and \{1,3,5,7\} for the other).
|
||||
(As is sometimes the case in 2-socket systems, one socket may consist
|
||||
of the even id numbers, while the other may have the odd id numbers.) The affinities
|
||||
are printed according to the \code{OMP\_AFFINITY\_FORMAT} format: providing
|
||||
the parallel nesting level (\%L), the ancestor thread number (\%a), the thread number (\%n)
|
||||
and the thread affinity (\%A). In the nested parallel region within the \plc{socket\_work} routine
|
||||
the affinities for the threads on each socket are printed according to this format.
|
||||
|
||||
\cexample{affinity_display}{2}
|
||||
|
||||
\ffreeexample{affinity_display}{2}
|
||||
|
||||
The next example illustrates more details about affinity formatting.
|
||||
First, the \code{omp\_get\_affinity\_format()} API routine is used to
|
||||
obtain the default format. The code checks to make sure the storage
|
||||
provides enough space to hold the format.
|
||||
Next, the \code{omp\_set\_affinity\_format()} API routine sets a user-defined
|
||||
format: \plc{host=\%20H thrd\_num=\%0.4n binds\_to=\%A}.
|
||||
|
||||
The host, thread number and affinity fields are specified by \plc{\%20H},
|
||||
\plc{\%0.4n} and \plc{\%A}: \plc{H}, \plc{n} and \plc{A} are single character "short names"
|
||||
for the host, thread\_num and thread\_affinity data to be printed,
|
||||
with format sizes of \plc{20}, \plc{4}, and "size as needed".
|
||||
The period (.) indicates that the field is displayed right-justified (default is left-justified)
|
||||
and the "0" indicates that any unused space is to be prefixed with zeros
|
||||
(e.g. instead of "1", "0001" is displayed for the field size of 4).
|
||||
|
||||
%The period (.) indicates that the field is displayed left-justified and the "0" indicates
|
||||
%that leading zeros are to be added so that the total length for the display of this “n” (thread_num) field is 4.
|
||||
|
||||
%The period (\plc{.}) indicates right justified and \plc{0} leading zeros.
|
||||
%All other text in the format is just user narrative.
|
||||
|
||||
Within the parallel region the affinity for each thread is captured by
|
||||
\code{omp\_capture\_affinity()} into a buffer array with elements indexed
|
||||
by the thread number (\plc{thrd\_num}).
|
||||
After the parallel region, the thread affinities are printed in thread-number order.
|
||||
|
||||
If the storage area in buffer is inadequate for holding the affinity
|
||||
data, the stored affinity data is truncated.
|
||||
%The \plc{max} reduction on the required storage, returned by
|
||||
%\code{omp\_capture\_affinity} in \plc{nchars}, is used to report
|
||||
%possible truncation (if \plc{max\_req\_store} > \plc{buffer\_store}).
|
||||
The maximum value for the number of characters (\plc{nchars}) returned by
|
||||
\code{omp\_capture\_affinity} is captured by the \code{reduction(max:max\_req\_store)}
|
||||
clause and the \plc{if(nchars >= max\_req\_store) max\_req\_store=nchars} statement.
|
||||
It is used to report possible truncation (if \plc{max\_req\_store} > \plc{buffer\_store}).
|
||||
|
||||
\cexample{affinity_display}{3}
|
||||
|
||||
\ffreeexample{affinity_display}{3}
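
The essential calls are sketched below (ours; buffer sizes and the format string are
illustrative):

\begin{verbatim}
   #include <stdio.h>
   #include <omp.h>

   #define FORMAT_STORE 80
   #define BUFFER_STORE 80

   int main(void)
   {
      char default_format[FORMAT_STORE];

      /* retrieve the default affinity format */
      size_t n = omp_get_affinity_format(default_format, FORMAT_STORE);
      if (n >= FORMAT_STORE) printf("format string was truncated\n");

      /* install a user-defined format */
      omp_set_affinity_format("host=%20H thrd_num=%0.4n binds_to=%A");

      #pragma omp parallel
      {
         char buffer[BUFFER_STORE];
         /* capture (rather than print) this thread's affinity string;
            NULL selects the format installed above                    */
         size_t nchars = omp_capture_affinity(buffer, BUFFER_STORE, NULL);
         if (nchars >= BUFFER_STORE) { /* affinity string truncated */ }
         #pragma omp critical
         printf("%s\n", buffer);
      }
      return 0;
   }
\end{verbatim}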
|
||||
|
@ -37,7 +37,7 @@ On some systems there are utilities, files or user guides that provide configura
|
||||
information. For instance, the socket number and proc\_id's for a socket
|
||||
can be found in the /proc/cpuinfo text file on Linux systems.
|
||||
|
||||
\cexample{affinity}{6}
|
||||
\cexample{affinity_query}{1}
|
||||
|
||||
\ffreeexample{affinity}{6}
|
||||
\ffreeexample{affinity_query}{1}
|
||||
|
||||
|
Examples_allocators.tex (new file, 63 lines)
@ -0,0 +1,63 @@
|
||||
\pagebreak
|
||||
\section{ Memory Allocators}
|
||||
\label{sec:allocators}
|
||||
|
||||
OpenMP memory allocators can be used to allocate memory with
|
||||
specific allocator traits. In the following example an OpenMP allocator is used to
|
||||
specify an alignment for arrays \plc{x} and \plc{y}. The
|
||||
general approach for attributing traits to variables allocated by
|
||||
OpenMP is to create or specify a pre-defined \plc{memory space}, create an
|
||||
array of \plc{traits}, and then form an \plc{allocator} from the
|
||||
memory space and trait.
|
||||
The allocator is then specified
|
||||
in an OpenMP allocation (using an API \plc{omp\_alloc()} function
|
||||
for C/C++ code and an \code{allocate} directive for Fortran code
|
||||
in the allocators.1 example).
|
||||
|
||||
In the example below the \plc{xy\_memspace} variable is declared
|
||||
and assigned the default memory space (\plc{omp\_default\_mem\_space}).
|
||||
Next, an array for \plc{traits} is created. Since only one
|
||||
trait will be used, the array size is \plc{1}.
|
||||
A trait is a structure in C/C++ and a derived type in Fortran,
|
||||
containing 2 components: a key and a corresponding value (key-value pair).
|
||||
The trait key used here is \plc{omp\_atk\_alignment} (an enum for C/C++
|
||||
and a parameter for Fortran)
|
||||
and the trait value of 64 is specified in the \plc{xy\_traits} declaration.
|
||||
These declarations are followed by a call to the
|
||||
\plc{omp\_init\_allocator()} function to combine the memory
|
||||
space (\plc{xy\_memspace}) and the traits (\plc{xy\_traits})
|
||||
to form an allocator (\plc{xy\_alloc}).
|
||||
|
||||
%In the C/C++ code the API \plc{omp\_allocate()} function is used
|
||||
%to allocate space, similar to \plc{malloc}, except that the allocator
|
||||
%is specified as the second argument.
|
||||
%In Fortran an API allocation function is not available.
|
||||
%An \code{allocate} construct is used (with \plc{x} and \plc{y}
|
||||
%listed as the variables to be allocated), along
|
||||
%with an \code{allocator} clause (specifying the \plc{xy\_alloc} as the allocator)
|
||||
%for the following Fortran \plc{allocate} statement.
|
||||
|
||||
In the C/C++ code the API \plc{omp\_alloc()} function is used
|
||||
to allocate space, similar to \plc{malloc}, except that the allocator
|
||||
is specified as the second argument.
|
||||
In Fortran an \code{allocate} directive is used to specify an allocator
|
||||
for a following Fortran \plc{allocate} statement.
|
||||
A variable list may be supplied if the allocator
|
||||
is to be applied to a subset of variables in the Fortran allocate
|
||||
statement. Specifying the complete list is optional.
|
||||
Here, the \plc{xy\_alloc} allocator is specified
|
||||
in the \code{allocator} clause,
|
||||
and the set of all variables used in the allocate statement is specified in the list.
|
||||
|
||||
%"for a following Fortran allocation statement" (no using "immediately" here)
|
||||
% it looks like if you have a list, the allocation statement does not need
|
||||
% to follow immediately.(?)
|
||||
% spec5.0 157:19-20 The allocate directive must appear in the same scope as
|
||||
% the declarations of each of its list items and must follow all such declarations.
|
||||
|
||||
%\pagebreak
|
||||
|
||||
\cexample{allocators}{1}
|
||||
\ffreeexample{allocators}{1}
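
A sketch of the C/C++ calls described above (ours; the array size and names are
illustrative) is:

\begin{verbatim}
   #include <omp.h>
   #define N 1000

   int main(void)
   {
      omp_memspace_handle_t  xy_memspace = omp_default_mem_space;
      omp_alloctrait_t       xy_traits[1] = { {omp_atk_alignment, 64} };
      omp_allocator_handle_t xy_alloc =
                             omp_init_allocator(xy_memspace, 1, xy_traits);

      /* the allocator is passed as the second argument */
      double *x = (double *) omp_alloc(N * sizeof(double), xy_alloc);
      double *y = (double *) omp_alloc(N * sizeof(double), xy_alloc);

      /* ... use the 64-byte-aligned arrays x and y ... */

      omp_free(x, xy_alloc);
      omp_free(y, xy_alloc);
      omp_destroy_allocator(xy_alloc);
      return 0;
   }
\end{verbatim}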
|
||||
|
||||
|
@ -5,13 +5,14 @@
|
||||
The following examples show the usage of array sections in \code{map} clauses
|
||||
on \code{target} and \code{target} \code{data} constructs.
|
||||
|
||||
This example shows the invalid usage of two separate sections of the same array
|
||||
inside of a \code{target} construct.
|
||||
|
||||
\cexample{array_sections}{1}
|
||||
|
||||
\ffreeexample{array_sections}{1}
|
||||
|
||||
\pagebreak
|
||||
This example shows the invalid usage of two separate sections of the same array
|
||||
inside of a \code{target} construct.
|
||||
|
||||
@ -19,6 +20,7 @@ inside of a \code{target} construct.
|
||||
|
||||
\ffreeexample{array_sections}{2}
|
||||
|
||||
\pagebreak
|
||||
This example shows the valid usage of two separate sections of the same array inside
|
||||
of a \code{target} construct.
|
||||
|
||||
@ -26,6 +28,7 @@ of a \code{target} construct.
|
||||
|
||||
\ffreeexample{array_sections}{3}
|
||||
|
||||
\pagebreak
|
||||
This example shows the valid usage of a wholly contained array section of an already
|
||||
mapped array section inside of a \code{target} construct.
|
||||
|
||||
|
Examples_array_shaping.tex (new file, 27 lines)
@ -0,0 +1,27 @@
|
||||
\section{Array Shaping}
|
||||
\label{sec:array-shaping}
|
||||
|
||||
\ccppspecificstart
|
||||
A pointer variable can be shaped to a multi-dimensional array to facilitate
|
||||
data access. This is achieved by a \plc{shape-operator} cast in front of
|
||||
a pointer (lvalue expression):
|
||||
\begin{description}
|
||||
\item[]\hspace*{5mm}\code{([$s_1$][$s_2$]...[$s_n$])}\plc{pointer}
|
||||
\end{description}
|
||||
where each $s_i$ is an integral-type expression of positive value.
|
||||
The shape-operator can appear in either the \plc{motion-clause}
|
||||
of the \code{target}~\code{update} directive or the \code{depend} clause.
|
||||
|
||||
The following example shows the use of the shape-operator in the
|
||||
\code{target}~\code{update} directive. The shape-operator \code{([nx][ny+2])}
|
||||
casts pointer variable $a$ to a 2-dimensional array of size
|
||||
\plc{nx}$\times$\plc{(ny+2)}. The resulting array is then accessed as
|
||||
array sections (such as \code{[0:nx][1]} and \code{[0:nx][ny]})
|
||||
in the \code{from} or \code{to} clause for transferring two columns of
|
||||
noncontiguous boundary data from or to the device.
|
||||
Note the use of additional parentheses
|
||||
around the shape-operator and $a$ to ensure the correct precedence
|
||||
over array-section operations.
|
||||
|
||||
\cnexample{array_shaping}{1}
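
The directives described above might be written as follows (a sketch only; \plc{a},
\plc{nx} and \plc{ny} are taken from the narrative, and the included example may differ
in detail):

\begin{verbatim}
   /* a points to nx*(ny+2) elements; move only the two
      noncontiguous boundary columns                      */
   #pragma omp target update from( (([nx][ny+2])a)[0:nx][1] )
   /* ... compute on the host ... */
   #pragma omp target update to( (([nx][ny+2])a)[0:nx][ny] )
\end{verbatim}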
|
||||
\ccppspecificend
|
@ -12,4 +12,3 @@ As of OpenMP 4.5 and beyond the \code{nowait} clause can be used on the
|
||||
|
||||
This section also shows the use of \code{depend} clauses to order
|
||||
executions through dependences.
|
||||
|
||||
|
@ -11,6 +11,7 @@ task or one of the previously generated explicit tasks.
|
||||
|
||||
\cexample{async_target}{1}
|
||||
|
||||
\pagebreak
|
||||
The Fortran version has an interface block that contains the \code{declare} \code{target}.
|
||||
An identical statement exists in the function declaration (not shown here).
|
||||
|
||||
|
@ -22,6 +22,8 @@ the worksharing construct after the cancellation has been activated.
|
||||
|
||||
\ffreeexample{cancellation}{1}
|
||||
|
||||
\clearpage
|
||||
|
||||
The following example shows how to cancel a parallel search on a binary tree as
|
||||
soon as the search value has been detected. The code creates a task to descend
|
||||
into the child nodes of the current tree node. If the search value has been found,
|
||||
|
Examples_depobj.tex (new file, 49 lines)
@ -0,0 +1,49 @@
|
||||
\pagebreak
|
||||
\section{The \code{depobj} Construct}
|
||||
\label{sec:depobj}
|
||||
|
||||
The stand-alone \code{depobj} construct provides a mechanism
|
||||
to create a \plc{depend object} that expresses a dependence to be
|
||||
used subsequently in the \code{depend} clause of another construct.
|
||||
The dependence is created from a dependence type and a storage location,
|
||||
within a \code{depend} clause of a \code{depobj} construct;
|
||||
%just as one would find directly on a \code{task} construct.
|
||||
and it is stored in the depend object.
|
||||
The depend object is represented by a variable of type \code{omp\_depend\_t}
|
||||
in C/C++ (by a scalar variable of integer kind \code{omp\_depend\_kind} in Fortran).
|
||||
|
||||
In the example below the stand-alone \code{depobj} construct uses the
|
||||
\code{depend}, \code{update} and \code{destroy} clauses to
|
||||
\plc{initialize}, \plc{update} and \plc{uninitialize}
|
||||
a depend object (\code{obj}).
|
||||
|
||||
The first \code{depobj} construct initializes the \code{obj}
|
||||
depend object with
|
||||
an \code{inout} dependence type with a storage
|
||||
location defined by variable \code{a}.
|
||||
This dependence is passed into the \plc{driver}
|
||||
routine via the \code{obj} depend object.
|
||||
|
||||
In the first \plc{driver} routine call, \emph{Task 1} uses
|
||||
the dependence of the object (\code{inout}),
|
||||
while \emph{Task 2} uses an \code{in} dependence specified
|
||||
directly in a \code{depend} clause.
|
||||
For these task dependences \emph{Task 1} must execute and
|
||||
complete before \emph{Task 2} begins.
|
||||
|
||||
Before the second call to \plc{driver}, \code{obj} is updated
|
||||
using the \code{depobj} construct to represent an \code{in} dependence.
|
||||
Hence, in the second call to \plc{driver}, \emph{Task 1}
|
||||
will have an \code{in} dependence; and \emph{Task 1} and
|
||||
\emph{Task 2} can execute simultaneously. Note: in an \code{update}
|
||||
clause, only the dependence type can be (is) updated.
|
||||
|
||||
The third \code{depobj} construct uses the \code{destroy} clause.
|
||||
It frees resources as it puts the depend object in an uninitialized state--
|
||||
effectively destroying the depend object.
|
||||
After an object has been uninitialized it can be initialized again
|
||||
with a new dependence type \emph{and} a new variable.
|
||||
|
||||
\cexample{depobj}{1}
|
||||
|
||||
\ffreeexample{depobj}{1}
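
A condensed sketch of the sequence described above (ours; the included example differs
in detail) is:

\begin{verbatim}
   #include <omp.h>

   void driver(omp_depend_t *obj, float *a)
   {
      #pragma omp task depend(depobj: *obj)   /* Task 1 */
      { *a *= 2.0f; }

      #pragma omp task depend(in: *a)         /* Task 2 */
      { /* read *a */ }
   }

   void example(void)
   {
      float a = 1.0f;
      omp_depend_t obj;

      #pragma omp parallel
      #pragma omp single
      {
         #pragma omp depobj(obj) depend(inout: a)  /* initialize   */
         driver(&obj, &a);

         #pragma omp depobj(obj) update(in)        /* change type  */
         driver(&obj, &a);

         #pragma omp taskwait
         #pragma omp depobj(obj) destroy           /* uninitialize */
      }
   }
\end{verbatim}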
|
@ -16,9 +16,9 @@ The following example is also conforming:
|
||||
|
||||
\fnexample{fort_sp_common}{2}
|
||||
% blue line floater at top of this page for "Fortran, cont."
|
||||
\begin{figure}[t!]
|
||||
\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
\end{figure}
|
||||
%\begin{figure}[t!]
|
||||
%\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em}
|
||||
%\end{figure}
|
||||
|
||||
The following example is conforming:
|
||||
|
||||
|
@ -11,6 +11,7 @@ call should be inside the \code{parallel} region.
|
||||
|
||||
\fexample{get_nthrs}{1}
|
||||
|
||||
\pagebreak
|
||||
The following example shows how to rewrite this program without including a query
|
||||
for the number of threads:
|
||||
|
||||
|
Examples_host_teams.tex (new file, 28 lines)
@ -0,0 +1,28 @@
|
||||
\pagebreak
|
||||
\section{\code{teams} Construct on Host}
|
||||
\label{sec:host_teams}
|
||||
|
||||
%{\color{blue} ... } {\color{violet} ... }
|
||||
Originally the \code{teams} construct was created for devices (such as GPUs)
|
||||
for independent executions of a structured block by teams within a league (on SMs).
|
||||
It was only available through offloading with the \code{target} construct,
|
||||
and the execution of a \code{teams} region could only be directed to host
|
||||
execution by various means such as \code{if} and \code{device} clauses,
|
||||
and the \code{OMP\_TARGET\_OFFLOAD} environment variable.
|
||||
|
||||
In OpenMP 5.0 the \code{teams} construct was extended to enable the host
|
||||
to execute a \code{teams} region (without an associated \code{target} construct),
|
||||
with anticipation of further affinity and threading controls in future OpenMP releases.
|
||||
%With additional affinity controls, a team could be
|
||||
%assigned to execute on a socket or use only a specified number of threads.
|
||||
|
||||
In the example below the \code{teams} construct is used to create two
|
||||
teams, one to execute single precision code, and the other
|
||||
to execute double precision code. Two teams are required, and
|
||||
the thread limit for each team is set to 1/2 of the number of
|
||||
available processors.
|
||||
|
||||
\cexample{host_teams}{1}
|
||||
|
||||
\ffreeexample{host_teams}{1}
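
A sketch of the structure (ours; the work routines are placeholders) is:

\begin{verbatim}
   #include <omp.h>

   extern void single_precision_work(void);
   extern void double_precision_work(void);

   void example(void)
   {
      int nprocs = omp_get_num_procs();

      /* two teams on the host, each limited to half the processors */
      #pragma omp teams num_teams(2) thread_limit(nprocs/2)
      {
         if (omp_get_team_num() == 0)
            single_precision_work();
         else
            double_precision_work();
      }
   }
\end{verbatim}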
|
||||
|
@ -50,6 +50,7 @@ one of the threads in the team. Since we have a total of two inner \code{paralle
|
||||
regions, the print statement will be executed twice -- once per inner \code{parallel}
|
||||
region.
|
||||
|
||||
\pagebreak
|
||||
\cexample{icv}{1}
|
||||
|
||||
\fexample{icv}{1}
|
||||
|
Examples_loop.tex (new file, 13 lines)
@ -0,0 +1,13 @@
|
||||
\pagebreak
|
||||
\section{The \code{loop} Construct}
|
||||
\label{sec:loop}
|
||||
|
||||
The following example illustrates the use of the OpenMP 5.0 \code{loop}
|
||||
construct for the execution of a loop.
|
||||
The \code{loop} construct asserts to the compiler that the iterations
|
||||
of the loop are free of data dependencies and may be executed concurrently.
|
||||
It allows the compiler to use heuristics to select the parallelization scheme
|
||||
and compiler-level optimizations for the concurrency.
|
||||
|
||||
\cexample{loop}{1}
|
||||
\ffreeexample{loop}{1}
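
A minimal sketch of the construct (ours) is:

\begin{verbatim}
   void daxpy(int n, double a, const double *x, double *y)
   {
      #pragma omp parallel
      #pragma omp loop
      for (int i = 0; i < n; i++)
         y[i] += a * x[i];     /* iterations asserted independent */
   }
\end{verbatim}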
|
@ -1,3 +1,4 @@
|
||||
|
||||
\pagebreak
|
||||
\section{The OpenMP Memory Model}
|
||||
\label{sec:mem_model}
|
||||
@ -18,6 +19,7 @@ be printed by both Print 2 and Print 3.
|
||||
|
||||
\ffreeexample{mem_model}{1}
|
||||
|
||||
\pagebreak
|
||||
The following example demonstrates why synchronization is difficult to perform
|
||||
correctly through variables. The value of flag is undefined in both prints on thread
|
||||
1 and the value of data is only well-defined in the second print.
|
||||
@ -26,6 +28,7 @@ correctly through variables. The value of flag is undefined in both prints on th
|
||||
|
||||
\fexample{mem_model}{2}
|
||||
|
||||
\pagebreak
|
||||
The next example demonstrates why synchronization is difficult to perform correctly
|
||||
through variables. Because the \plc{write}(1)-\plc{flush}(1)-\plc{flush}(2)-\plc{read}(2)
|
||||
sequence cannot be guaranteed in the example, the statements on thread 0 and thread
|
||||
|
Examples_metadirective.tex (new file, 88 lines)
@ -0,0 +1,88 @@
|
||||
\pagebreak
|
||||
\section{Metadirective Directive}
|
||||
\label{sec:metadirective}
|
||||
|
||||
A \code{metadirective} directive provides a mechanism to select a directive in
|
||||
a \code{when} clause to be used, depending upon one or more contexts:
|
||||
implementation, available devices and the present enclosing construct.
|
||||
The directive in a \code{default} clause is used when a directive of the
|
||||
\code{when} clause is not selected.
|
||||
|
||||
In the \code{when} clause the \plc{context selector} (or just \plc{selector}) defines traits that are
|
||||
evaluated for selection of the directive that follows the selector.
|
||||
This "selectable" directive is called a \plc{directive variant}.
|
||||
Traits are grouped by \plc{construct}, \plc{implementation} and
|
||||
\plc{device} \plc{sets} to be used by a selector of the same name.
|
||||
|
||||
In the first example the architecture trait \plc{arch} of the
|
||||
\plc{device} selector set specifies that if an \plc{nvptx} (NVIDIA) architecture is
|
||||
active in the OpenMP context, then the \code{teams}~\code{loop}
|
||||
\plc{directive variant} is selected as the directive; otherwise, the \code{parallel}~\code{loop}
|
||||
\plc{directive variant} of the \code{default} clause is selected as the directive.
|
||||
That is, if a \plc{device} of \plc{nvptx} architecture is supported by the implementation within
|
||||
the enclosing \code{target} construct, its \plc{directive variant} is selected.
|
||||
The architecture names, such as \plc{nvptx}, are implementation defined.
|
||||
Also, note that \plc{device} as used in a \code{target} construct specifies
|
||||
a device number, while \plc{device}, as used in the \code{metadirective}
|
||||
directive as selector set, has traits of \plc{kind}, \plc{isa} and \plc{arch}.
|
||||
|
||||
|
||||
|
||||
\cexample{metadirective}{1}
|
||||
|
||||
\ffreeexample{metadirective}{1}
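
A sketch of the first case (ours; recall that architecture names such as \plc{nvptx}
are implementation defined) is:

\begin{verbatim}
   #define N 1000

   void mult(double *v1, double *v2, double *v3)
   {
      #pragma omp target map(to: v1[0:N], v2[0:N]) map(from: v3[0:N])
      #pragma omp metadirective \
              when( device={arch("nvptx")}: teams loop ) \
              default( parallel loop )
      for (int i = 0; i < N; i++)
         v3[i] = v1[i] * v2[i];
   }
\end{verbatim}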
|
||||
|
||||
%\pagebreak
|
||||
In the second example, the \plc{implementation} selector set is specified
|
||||
in the \code{when} clause to distinguish between AMD and NVIDIA platforms.
|
||||
Additionally, specific architectures are specified with the \plc{device}
|
||||
selector set.
|
||||
|
||||
In the code, different \code{teams} constructs are employed as determined
|
||||
by the \code{metadirective} directive.
|
||||
The number of teams is restricted by a \code{num\_teams} clause
|
||||
and a thread limit is also set by a \code{thread\_limit} clause for
|
||||
\plc{vendor} AMD and NVIDIA platforms and specific architecture
|
||||
traits. Otherwise, just the \code{teams} construct is used without
|
||||
any clauses, as prescribed by the \code{default} clause.
|
||||
|
||||
|
||||
\cexample{metadirective}{2}
|
||||
|
||||
\ffreeexample{metadirective}{2}
|
||||
|
||||
\clearpage
|
||||
|
||||
%\pagebreak
|
||||
In the third example, a \plc{construct} selector set is specified in the \code{when} clause.
|
||||
Here, a \code{metadirective} directive is used within a function that is also
|
||||
compiled as a function for a target device as directed by the \code{declare}~\code{target} directive.
|
||||
The \plc{target} directive name of the \code{construct} selector ensures that the
|
||||
\code{distribute}~\code{parallel}~\code{for/do} construct is employed for the target compilation.
|
||||
Otherwise, for the host-compiled version the \code{parallel}~\code{for/do}~\code{simd} construct is used.
|
||||
|
||||
In the first call to the \plc{exp\_pi\_diff()} routine the context is a
|
||||
\code{target}~\code{teams} construct and the \code{distribute}~\code{parallel}~\code{for/do}
|
||||
construct version of the function is invoked,
|
||||
while in the second call the \code{parallel}~\code{for/do}~\code{simd} construct version is used.
|
||||
|
||||
%%%%%%%%
|
||||
This case illustrates an important point for users that may want to hoist the
|
||||
\code{target} directive out of a function that contains the usual
|
||||
\code{target}~\code{teams}~\code{distribute}~\code{parallel}~\code{for/do} construct
|
||||
(for providing alternate constructs through the \code{metadirective} directive as here).
|
||||
While this combined construct can be decomposed into a \code{target} and
|
||||
\code{teams distribute parallel for/do} constructs, the OpenMP 5.0 specification has the restriction:
|
||||
``If a \code{teams} construct is nested within a \code{target} construct, that \code{target} construct must
|
||||
contain no statements, declarations or directives outside of the \code{teams} construct''.
|
||||
So, the \code{teams} construct must immediately follow the \code{target} construct without any intervening
|
||||
code statements (which includes function calls).
|
||||
Since the \code{target} construct alone cannot be hoisted out of a function,
|
||||
the \code{target}~\code{teams} construct has been hoisted out of the function, and the
|
||||
\code{distribute}~\code{parallel}~\code{for/do} construct is used
|
||||
as the \plc{variant} directive of the \code{metadirective} directive within the function.
|
||||
%%%%%%%%
|
||||
|
||||
\cexample{metadirective}{3}
|
||||
|
||||
\ffreeexample{metadirective}{3}
|
@ -16,6 +16,7 @@ abort the program or to supply any number of threads available.
|
||||
|
||||
\fexample{nthrs_dynamic}{1}
|
||||
|
||||
\pagebreak
|
||||
The call to the \code{omp\_set\_dynamic} routine with a non-zero argument in
|
||||
C/C++, or \code{.TRUE.} in Fortran, allows the OpenMP implementation to choose
|
||||
any number of threads between 1 and 10.
|
||||
|
Examples_parallel_master_taskloop.tex (new file, 33 lines)
@ -0,0 +1,33 @@
|
||||
\pagebreak
|
||||
\section{The \code{parallel master taskloop} Construct}
|
||||
\label{sec:parallel_master_taskloop}
|
||||
|
||||
In the OpenMP 5.0 Specification several combined constructs containing
|
||||
the \code{taskloop} construct were added.
|
||||
|
||||
Just as the \code{for} and \code{do} constructs have been combined
|
||||
with the \code{parallel} construct for convenience, so too, the combined
|
||||
\code{parallel}~\code{master}~\code{taskloop} and
|
||||
\code{parallel}~\code{master}~\code{taskloop}~\code{simd}
|
||||
constructs have been created for convenience.
|
||||
|
||||
In the following example the first \code{taskloop} construct is enclosed
|
||||
by the usual \code{parallel} and \code{master} constructs to form
|
||||
a team of threads, and a single task generator (master thread) for
|
||||
the \code{taskloop} construct.
|
||||
|
||||
The same OpenMP operations for the first taskloop are accomplished by the second
|
||||
taskloop with the \code{parallel}~\code{master}~\code{taskloop}
|
||||
combined construct.
|
||||
The third taskloop uses the combined \code{parallel}~\code{master}~\code{taskloop}~\code{simd}
|
||||
construct to accomplish the same behavior as closely nested \code{parallel master},
|
||||
and \code{taskloop simd} constructs.
|
||||
|
||||
As with any combined construct the clauses of the components may be used
|
||||
with appropriate restrictions. The combination of the \code{parallel}~\code{master} construct
|
||||
with the \code{taskloop} or \code{taskloop}~\code{simd} construct produces no additional
|
||||
restrictions.
|
||||
|
||||
\cexample{parallel_master_taskloop}{1}
|
||||
|
||||
\ffreeexample{parallel_master_taskloop}{1}
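
As a sketch (ours), the explicit and combined forms are:

\begin{verbatim}
   void update(double *a, int n)
   {
      /* explicit form: parallel + master + taskloop */
      #pragma omp parallel
      #pragma omp master
      #pragma omp taskloop
      for (int i = 0; i < n; i++)
         a[i] += 1.0;

      /* equivalent combined construct */
      #pragma omp parallel master taskloop
      for (int i = 0; i < n; i++)
         a[i] += 1.0;
   }
\end{verbatim}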
|
@ -1,13 +1,21 @@
|
||||
\pagebreak
|
||||
\section{Reduction}
|
||||
\label{sec:reduction}
|
||||
|
||||
This section covers ways to perform reductions in parallel, task, taskloop, and SIMD regions.
|
||||
|
||||
\subsection{The \code{reduction} Clause}
|
||||
\label{subsec:reduction}
|
||||
|
||||
The following example demonstrates the \code{reduction} clause; note that some
|
||||
reductions can be expressed in the loop in several ways, as shown for the \code{max}
|
||||
and \code{min} reductions below:
|
||||
|
||||
\cexample{reduction}{1}
|
||||
|
||||
\pagebreak
|
||||
|
||||
\ffreeexample{reduction}{1}
|
||||
|
||||
A common implementation of the preceding example is to treat it as if it had been
|
||||
@ -41,6 +49,7 @@ to \code{MIN}.
|
||||
\ffreenexample{reduction}{5}
|
||||
\fortranspecificend
|
||||
|
||||
\pagebreak
|
||||
The following example is non-conforming because the initialization (\code{a =
|
||||
0}) of the original list item \code{a} is not synchronized with the update of
|
||||
\code{a} as a result of the reduction computation in the \code{for} loop. Therefore,
|
||||
@ -63,3 +72,166 @@ The following example demonstrates the reduction of array \plc{a}. In C/C++ thi
|
||||
\cexample{reduction}{7}
|
||||
|
||||
\ffreeexample{reduction}{7}
|
||||
|
||||
|
||||
\subsection{Task Reduction}
|
||||
\label{subsec:task_reduction}
|
||||
|
||||
The following C/C++ and Fortran examples show how to implement
|
||||
a task reduction over a linked list.
|
||||
|
||||
Task reductions are supported by the \code{task\_reduction} clause which can only be
|
||||
applied to the \code{taskgroup} directive, and a \code{in\_reduction} clause
|
||||
which can be applied to the \code{task} construct among others.
|
||||
|
||||
The \code{task\_reduction} clause on the \code{taskgroup} construct is used to
|
||||
define the scope of a new reduction, and after the \code{taskgroup}
|
||||
region the original variable will contain the final value of the reduction.
|
||||
In the task-generating while loop the \code{in\_reduction} clause of the \code{task}
|
||||
construct is used to specify that the task participates "in" the reduction.
|
||||
|
||||
Note: The \plc{res} variable is private in the \plc{linked\_list\_sum} routine
|
||||
and is not required to be shared (as in the case of a \code{parallel} construct
|
||||
reduction).
|
||||
|
||||
|
||||
\cexample{task_reduction}{1}
|
||||
|
||||
\ffreeexample{task_reduction}{1}
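
As a minimal sketch of this pattern (not the \plc{task\_reduction.1} code above;
the \plc{node\_t} type and its contents are illustrative assumptions), the
\code{taskgroup} construct defines the reduction scope and each generated task
joins it through an \code{in\_reduction} clause:

\begin{verbatim}
typedef struct node { int value; struct node *next; } node_t;

int linked_list_sum(node_t *head)
{
   int res = 0;                      /* original variable of the reduction     */

   #pragma omp parallel
   #pragma omp single
   {
      #pragma omp taskgroup task_reduction(+: res)
      {
         for (node_t *p = head; p != NULL; p = p->next) {
            #pragma omp task in_reduction(+: res) firstprivate(p)
            res += p->value;         /* each task updates its own copy of res  */
         }
      }                              /* res holds the final sum past this point */
   }
   return res;
}
\end{verbatim}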
|
||||
|
||||
|
||||
\subsection{Taskloop Reduction}
|
||||
\label{subsec:taskloop_reduction}
|
||||
|
||||
In the OpenMP 5.0 Specification, the \code{taskloop} construct
|
||||
was extended to include reductions.
|
||||
|
||||
The following two examples show how to implement a reduction over an array
|
||||
using taskloop reduction in two different ways.
|
||||
In the first
|
||||
example the \code{reduction} clause is applied to the \code{taskloop} construct. As
|
||||
explained above in the task reduction examples, a reduction over tasks is
|
||||
divided into two components: the scope of the reduction, which is defined by a
|
||||
\code{taskgroup} region, and the tasks that participate in the reduction. In this
|
||||
example, the \code{reduction} clause defines both semantics. First, it specifies that
|
||||
the implicit \code{taskgroup} region associated with the \code{taskloop} construct is the scope of the
|
||||
reduction, and second, it defines all tasks created by the \code{taskloop} construct as
|
||||
participants of the reduction. Regarding the first property, it is important to note
|
||||
that adding the \code{nogroup} clause to the \code{taskloop} construct would make the code
|
||||
nonconforming, because there would then be a set of tasks that participate in a
|
||||
reduction that has not been defined.
|
||||
|
||||
\cexample{taskloop_reduction}{1}
|
||||
\ffreeexample{taskloop_reduction}{1}
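
A condensed sketch of this first form (not the \plc{taskloop\_reduction.1} code
itself; the function and array names are assumptions) shows the single
\code{reduction} clause providing both the scope and the participants:

\begin{verbatim}
double array_sum(int n, double *v)
{
   double res = 0.0;

   #pragma omp parallel
   #pragma omp single
   #pragma omp taskloop reduction(+: res)   /* implicit taskgroup is the scope */
   for (int i = 0; i < n; i++)
      res += v[i];
                                            /* adding nogroup here would make  */
   return res;                              /* the code nonconforming          */
}
\end{verbatim}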
|
||||
|
||||
%In the second example, we are computing exactly the same
|
||||
%value but we do it in a very different way. The first thing that we do in the
|
||||
%\plc{array\_sum} function is to create a \code{taskgroup} region that defines the scope of a
|
||||
%new reduction using the \code{task\_reduction} clause.
|
||||
%After that, we specify that a task and also the tasks generated
|
||||
%by a taskloop will participate in that reduction using the \code{in\_reduction} clause
|
||||
%on the \code{task} and \code{taskloop} constructs, respectively. Note that
|
||||
%we also added the \code{nogroup} clause to the \code{taskloop} construct. This is allowed
|
||||
%because what we are expressing with the \code{in\_reduction} clause is different
|
||||
%from what we were expressing with the \code{reduction} clause. In one case we specify
|
||||
%that the generated tasks will participate in a previously declared reduction
|
||||
%(\code{in\_reduction} clause) whereas in the other case we specify that we want to
|
||||
%create a new reduction and also that all tasks generated by the taskloop will
|
||||
%participate on it.
|
||||
|
||||
The second example computes exactly the same value as in the preceding \plc{taskloop\_reduction.1} code section,
|
||||
but in a very different way.
|
||||
First, in the \plc{array\_sum} function a \code{taskgroup} region is created
|
||||
that defines the scope of a new reduction using the \code{task\_reduction} clause.
|
||||
After that, a task and also the tasks generated by a taskloop participate in
|
||||
that reduction by using the \code{in\_reduction} clause on the \code{task}
|
||||
and \code{taskloop} constructs, respectively.
|
||||
Note that the \code{nogroup} clause was added to the \code{taskloop} construct.
|
||||
This is allowed because what is expressed with the \code{in\_reduction} clause
|
||||
is different from what is expressed with the \code{reduction} clause.
|
||||
In one case the generated tasks are specified to participate in a previously
|
||||
declared reduction (\code{in\_reduction} clause) whereas in the other case
|
||||
creation of a new reduction is specified and also that all tasks generated
|
||||
by the taskloop will participate in it.
|
||||
|
||||
\cexample{taskloop_reduction}{2}
|
||||
\ffreeexample{taskloop_reduction}{2}
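
A corresponding sketch of this second form (again an illustration, not the
\plc{taskloop\_reduction.2} code) separates the two components explicitly:

\begin{verbatim}
double array_sum(int n, double *v)
{
   double res = 0.0;

   #pragma omp parallel
   #pragma omp single
   #pragma omp taskgroup task_reduction(+: res)         /* reduction scope     */
   {
      #pragma omp task in_reduction(+: res)             /* an additional task  */
      res += v[0];

      #pragma omp taskloop in_reduction(+: res) nogroup /* taskloop tasks join */
      for (int i = 1; i < n; i++)                       /* the same reduction  */
         res += v[i];
   }                                                    /* res is final here   */
   return res;
}
\end{verbatim}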
|
||||
|
||||
In the OpenMP 5.0 Specification, \code{reduction} clauses for the
|
||||
\code{taskloop}~\code{simd} construct were also added.
|
||||
|
||||
The examples below compare reductions for the \code{taskloop} and the \code{taskloop}~\code{simd} constructs.
|
||||
These examples illustrate the use of \code{reduction} clauses within
|
||||
"stand-alone" \code{taskloop} constructs, and the use of \code{in\_reduction} clauses for tasks of taskloops to participate
|
||||
with other reductions within the scope of a parallel region.
|
||||
|
||||
\textbf{taskloop reductions:}
|
||||
|
||||
In the \plc{taskloop reductions} section of the example below,
|
||||
\plc{taskloop 1} uses the \code{reduction} clause
|
||||
in a \code{taskloop} construct for a sum reduction, accumulated in \plc{asum}.
|
||||
The behavior is as though a \code{taskgroup} construct encloses the
|
||||
taskloop region with a \code{task\_reduction} clause, and each taskloop
|
||||
task has an \code{in\_reduction} clause with the specifications
|
||||
of the \code{reduction} clause.
|
||||
At the end of the taskloop region \plc{asum} contains the result of the reduction.
|
||||
|
||||
The next taskloop, \plc{taskloop 2}, illustrates the use of the
|
||||
\code{in\_reduction} clause to participate in a previously defined
|
||||
reduction scope of a \code{parallel} construct.
|
||||
|
||||
The task reductions of \plc{task 2} and \plc{taskloop 2} are combined
|
||||
across the \code{taskloop} construct and the single \code{task} construct, as specified
|
||||
in the \code{reduction(task,}~\code{+:asum)} clause of the \code{parallel} construct.
|
||||
At the end of the parallel region \plc{asum} contains the combined result of all reductions.
|
||||
|
||||
\textbf{taskloop simd reductions:}
|
||||
|
||||
Reductions for the \code{taskloop}~\code{simd} construct are shown in the second half of the code.
|
||||
Since each component construct, \code{taskloop} and \code{simd},
|
||||
can accept a reduction-type clause, the \code{taskloop}~\code{simd} construct
|
||||
is a composite construct, and the specific application of the reduction clause is defined
|
||||
within the \code{taskloop}~\code{simd} construct section of the OpenMP 5.0 Specification.
|
||||
The code below illustrates use cases for these reductions.
|
||||
|
||||
In the \plc{taskloop simd reduction} section of the example below,
|
||||
\plc{taskloop simd 3} uses the \code{reduction} clause
|
||||
in a \code{taskloop}~\code{simd} construct for a sum reduction within a loop.
|
||||
For this case a \code{reduction} clause is used, as one would use
|
||||
for a \code{simd} construct.
|
||||
The SIMD reductions of each task are combined, and the results of these tasks are further
|
||||
combined just as in the \code{taskloop} construct with the \code{reduction} clause for \plc{taskloop 1}.
|
||||
At the end of the taskloop region \plc{asum} contains the combined result of all reductions.
|
||||
|
||||
If a \code{taskloop}~\code{simd} construct is to participate in a previously defined
|
||||
reduction scope, the reduction participation should be specified with
|
||||
an \code{in\_reduction} clause, as shown in the \code{parallel} region enclosing
|
||||
\plc{task 4} and \plc{taskloop simd 4} code sections.
|
||||
|
||||
Here the \code{taskloop}~\code{simd} construct's
|
||||
\code{in\_reduction} clause specifies participation of the construct's tasks as
|
||||
a task reduction within the scope of the parallel region.
|
||||
That is, the results of each task of the \code{taskloop} construct component
|
||||
contribute to the reduction at a broader level, just as in the \plc{parallel reduction a} code section above.
|
||||
Also, each \code{simd}-component construct
|
||||
occurs as if it has a \code{reduction} clause, and the
|
||||
SIMD results of each task are combined as though to form a single result for
|
||||
each task (that participates in the \code{in\_reduction} clause).
|
||||
At the end of the parallel region \plc{asum} contains the combined result of all reductions.
|
||||
|
||||
%Just as in \plc{parallel reduction a} the
|
||||
%\code{taskloop simd} construct reduction results are combined
|
||||
%with the \code{task} construct reduction results
|
||||
%as specified by the \code{in\_reduction} clause of the \code{task} construct
|
||||
%and the \plc{task} reduction-modifier of the \code{reduction} clause of
|
||||
%the \code{parallel} construct.
|
||||
%At the end of the parallel region \plc{asum} contains the combined result of all reductions.
|
||||
|
||||
|
||||
\cexample{taskloop_simd_reduction}{1}
|
||||
|
||||
\ffreeexample{taskloop_simd_reduction}{1}
|
||||
|
||||
|
||||
|
||||
% All other reductions
|
||||
|
31
Examples_requires.tex
Normal file
@ -0,0 +1,31 @@
|
||||
\pagebreak
|
||||
\section{The \code{requires} Directive}
|
||||
\label{sec:requires}
|
||||
|
||||
The declarative \code{requires} directive can be used to
|
||||
specify features that an implementation must provide for the code to compile and
|
||||
execute correctly.
|
||||
|
||||
In the following example the \code{unified\_shared\_memory} clause
|
||||
of the \code{requires} directive ensures that the host and all
|
||||
devices accessible through OpenMP provide a \plc{unified address} space
|
||||
for memory that is shared by all devices.
|
||||
|
||||
The example illustrates the use of the \code{requires} directive specifying
|
||||
\plc{unified shared memory} in file scope, before any device
|
||||
directives or device routines. No \code{map} clause is needed for
|
||||
the \plc{p} structure on the device (and its address \plc{\&p}, for the C++ code,
|
||||
is the same address on the host and device).
|
||||
However, scalar variables referenced within the \code{target}
|
||||
construct still have a default data-sharing attribute of firstprivate.
|
||||
The \plc{q} scalar is incremented on the device, and its change is
|
||||
not reflected on the host.
|
||||
% will defaultmap(toform:scalar) make q use shared address space?
|
||||
%Or will it be ignored at this point.
|
||||
% Does before device routines also mean before prototype?
|
||||
|
||||
%\pagebreak
|
||||
|
||||
\cppexample{requires}{1}
|
||||
|
||||
\ffreeexample{requires}{1}
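
A minimal C sketch of the behavior described above (not the \plc{requires.1}
example; the structure type and sizes are illustrative assumptions) is:

\begin{verbatim}
#include <stdlib.h>
#include <stdio.h>

#pragma omp requires unified_shared_memory  /* file scope, before device code  */

typedef struct { int n; double *data; } buf_t;   /* hypothetical type          */

int main(void)
{
   buf_t p;
   int   q = 0;

   p.n    = 100;
   p.data = (double *)malloc(p.n * sizeof(double));

   #pragma omp target      /* no map clause needed for p: one address space    */
   {
      p.data[0] = 3.14;    /* writes the host-visible storage directly         */
      q++;                 /* q is firstprivate: the host copy remains 0       */
   }

   printf("%f %d\n", p.data[0], q);
   free(p.data);
   return 0;
}
\end{verbatim}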
|
@ -9,6 +9,7 @@ and cannot be the immediate substatement of an \code{if} statement.
|
||||
|
||||
\cexample{standalone}{1}
|
||||
|
||||
\pagebreak
|
||||
The following example is non-conforming, because the \code{flush}, \code{barrier},
|
||||
\code{taskwait}, and \code{taskyield} directives are stand-alone directives
|
||||
and cannot be the action statement of an \code{if} statement or a labeled branch
|
||||
@ -22,6 +23,7 @@ in a compound statement.
|
||||
|
||||
\cexample{standalone}{2}
|
||||
|
||||
\pagebreak
|
||||
The following example is conforming because the \code{flush}, \code{barrier},
|
||||
\code{taskwait}, and \code{taskyield} directives are enclosed in an \code{if}
|
||||
construct or follow the labeled branch target.
|
||||
|
@ -110,3 +110,35 @@ to the \code{parallel} component of the combined directive.
|
||||
\cexample{target}{6}
|
||||
|
||||
\ffreeexample{target}{6}
|
||||
|
||||
\subsection{target Reverse Offload}
|
||||
\label{subsec:target_reverse_offload}
|
||||
|
||||
Beginning with OpenMP 5.0, implementations are allowed to
|
||||
offload back to the host (reverse offload).
|
||||
|
||||
In the example below the \plc{error\_handler} function
|
||||
is executed back on the host, if an erroneous value is
|
||||
detected in the \plc{A} array on the device.
|
||||
|
||||
This is accomplished by specifying the \code{ancestor}
|
||||
\plc{device-modifier}, along with a device number of \code{1},
|
||||
to indicate that the execution is to be performed on the
|
||||
immediate parent (\plc{1st ancestor}), that is, the host.
|
||||
|
||||
The \code{requires} directive (another 5.0 feature)
|
||||
uses the \code{reverse\_offload} clause to guarantee
|
||||
that the reverse offload is implemented.
|
||||
|
||||
Note that the \code{declare target} directive uses the
|
||||
\code{device\_type} clause (another 5.0 feature) to specify that
|
||||
the \plc{error\_handler} function is compiled to
|
||||
execute on the \plc{host} only. This ensures that no
|
||||
attempt will be made to create a device version of the
|
||||
function. This feature may be necessary if the function
|
||||
exists in another compilation unit.
|
||||
|
||||
|
||||
\cexample{target_reverse_offload}{7}
|
||||
|
||||
\ffreeexample{target_reverse_offload}{7}
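
The key pieces can be sketched as follows (an illustration of the mechanism,
not the \plc{target\_reverse\_offload.7} code; the array contents and sizes are
assumptions):

\begin{verbatim}
#pragma omp requires reverse_offload

void error_handler(int bad);     /* assumed to live in another compilation unit */
#pragma omp declare target to(error_handler) device_type(host)

void check(int *A, int N)
{
   #pragma omp target map(to: A[:N])
   for (int i = 0; i < N; i++) {
      if (A[i] < 0) {
         /* reverse offload: execute on the 1st ancestor device, the host */
         #pragma omp target device(ancestor: 1) map(always, to: A[i:1])
         error_handler(A[i]);
      }
   }
}
\end{verbatim}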
|
||||
|
@ -16,6 +16,7 @@ environment.
|
||||
|
||||
\cexample{target_data}{1}
|
||||
|
||||
\pagebreak
|
||||
The Fortran code passes a reference and specifies the extent of the arrays in the
|
||||
declaration. No length information is necessary in the map clause, as is required
|
||||
with C/C++ pointers.
|
||||
@ -49,20 +50,16 @@ once by the \code{target} \code{data} construct.
|
||||
|
||||
\ffreeexample{target_data}{2}
|
||||
|
||||
In the following example, the variable tmp defaults to \code{tofrom} map-type
|
||||
and is mapped at each \code{target} construct. The array \plc{Q} is mapped once at
|
||||
the enclosing \code{target} \code{data} region instead of at each \code{target}
|
||||
construct.
|
||||
In the following example, the array \plc{Q} is mapped once at the enclosing
|
||||
\code{target}~\code{data} region instead of at each \code{target} construct.
|
||||
In OpenMP 4.0, a scalar variable is implicitly mapped with the \code{tofrom} map-type.
|
||||
But since OpenMP 4.5, a scalar variable, such as the \plc{tmp} variable, has to be explicitly mapped with
|
||||
the \code{tofrom} map-type at the first \code{target} construct in order to return
|
||||
its reduced value from the parallel loop construct to the host.
|
||||
The variable defaults to firstprivate at the second \code{target} construct.
|
||||
|
||||
\cexample{target_data}{3}
|
||||
|
||||
In the following example the arrays \plc{v1} and \plc{v2} are mapped at each \code{target}
|
||||
construct. Instead of mapping the array \plc{Q} twice at each \code{target} construct,
|
||||
\plc{Q} is mapped once by the \code{target} \code{data} construct. Note, the \plc{tmp}
|
||||
variable is implicitly remapped for each \code{target} region, mapping the value
|
||||
from the device to the host at the end of the first \code{target} region, and
|
||||
from the host to the device for the second \code{target} region.
|
||||
|
||||
\ffreeexample{target_data}{3}
|
||||
|
||||
\subsection{\code{target} \code{data} Construct with Orphaned Call}
|
||||
@ -145,6 +142,7 @@ of the \code{target} constructs.
|
||||
|
||||
\cexample{target_data}{6}
|
||||
|
||||
\pagebreak
|
||||
The \code{if} clauses work the same way for the following Fortran code. The \code{target}
|
||||
constructs enclosed in the \code{target} \code{data} region should also use
|
||||
an \code{if} clause with the same condition, so that the \code{target} \code{data}
|
||||
@ -153,6 +151,7 @@ are both ignored.
|
||||
|
||||
\ffreeexample{target_data}{6}
|
||||
|
||||
\pagebreak
|
||||
In the following example, when the \code{if} clause conditional expression on
|
||||
the \code{target} construct evaluates to \plc{false}, the target region will
|
||||
execute on the host device. However, the \code{target} \code{data} construct
|
||||
@ -164,6 +163,7 @@ the \code{target} \code{data} construct, resulting in undefined values in \plc{p
|
||||
|
||||
\cexample{target_data}{7}
|
||||
|
||||
\pagebreak
|
||||
The \code{if} clauses work the same way for the following Fortran code. When
|
||||
the \code{if} clause conditional expression on the \code{target} construct
|
||||
evaluates to \plc{false}, the \code{target} region will execute on the host
|
||||
|
86
Examples_target_mapper.tex
Normal file
@ -0,0 +1,86 @@
|
||||
\pagebreak
|
||||
\section{\code{declare mapper} Construct}
|
||||
\label{sec:declare_mapper}
|
||||
|
||||
The following examples show how to use the \code{declare mapper}
|
||||
directive to prescribe a map for later use.
|
||||
It is also quite useful for pre-defining partitioned and nested
|
||||
structure elements.
|
||||
|
||||
In the first example the \code{declare mapper} directive specifies
|
||||
that any structure of type \plc{myvec\_t} for which implicit data-mapping
|
||||
rules apply will be mapped according to its \code{map} clause.
|
||||
The variable \plc{v} is used for referencing the structure and its
|
||||
elements within the \code{map} clause.
|
||||
Within the \code{map} clause the \plc{v} variable specifies that all
|
||||
elements of the structure are to be mapped. Additionally, the
|
||||
array section \plc{v.data[0:v.len]} specifies that the dynamic
|
||||
storage for data is to be mapped.
|
||||
|
||||
Within the main program the \plc{s} variable is typed as \plc{myvec\_t}.
|
||||
Since the variable is found within the target region and the type has a mapping prescribed by
|
||||
a \code{declare mapper} directive, it will be automatically mapped according to its prescription:
|
||||
full structure, plus the dynamic storage of the \plc{data} element.
|
||||
|
||||
%Note: By default the mapping is \code{tofrom}.
|
||||
%The associated Fortran allocatable \plc{data} array is automatically mapped with the derived
|
||||
%type, it does not require an array section as in the C/C++ example.
|
||||
|
||||
\cexample{target_mapper}{1}
|
||||
|
||||
\ffreeexample{target_mapper}{1}
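
A condensed C sketch of this first mapper (mirroring the description above,
not the \plc{target\_mapper.1} file) is:

\begin{verbatim}
typedef struct { int len; double *data; } myvec_t;

/* default mapper for myvec_t: map the structure and its dynamic storage */
#pragma omp declare mapper(myvec_t v) map(v, v.data[0:v.len])

void scale(myvec_t s)
{
   #pragma omp target       /* s is implicitly mapped using the mapper above */
   for (int i = 0; i < s.len; i++)
      s.data[i] *= 2.0;
}
\end{verbatim}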
|
||||
|
||||
\pagebreak
|
||||
The next example illustrates the use of the \plc{mapper-identifier} and deep copy within a structure.
|
||||
The structure, \plc{dzmat\_t}, represents a complex matrix,
|
||||
with separate real (\plc{r\_m}) and imaginary (\plc{i\_m}) elements.
|
||||
Two map identifiers are created for partitioning the \plc{dzmat\_t} structure.
|
||||
|
||||
For the C/C++ code the first identifier is named \plc{top\_id} and maps the top half of
|
||||
two matrices of type \plc{dzmat\_t}, while the second identifier, \plc{bottom\_id},
|
||||
maps the lower half of two matrices.
|
||||
Each identifier is applied to a different \code{target} construct,
|
||||
as \code{map(mapper(top\_id), tofrom: a,b)}
|
||||
and \code{map(mapper(bottom\_id), tofrom: a,b)}.
|
||||
Each target offload is allowed to execute concurrently on two different devices
|
||||
(\plc{0} and \plc{1}) through the \code{nowait} clause.
|
||||
The OpenMP 5.0 \code{parallel master} construct creates a region of two threads
|
||||
for these \code{target} constructs, with a single thread (the \plc{master}) as the task generator.
|
||||
|
||||
The Fortran code uses the \plc{left\_id} and \plc{right\_id} map identifiers in the
|
||||
\code{map(mapper(left\_id),tofrom: a,b)} and \code{map(mapper(right\_id),tofrom: a,b)} map clauses.
|
||||
The array sections for these left and right contiguous portions of the matrices
|
||||
were defined previously in the \code{declare mapper} directive.
|
||||
|
||||
Note, the \plc{is} and \plc{ie} scalars are firstprivate
|
||||
by default for a target region, but are declared firstprivate anyway
|
||||
to remind the user of important firstprivate data-sharing properties required here.
|
||||
|
||||
\cexample{target_mapper}{2}
|
||||
|
||||
\ffreeexample{target_mapper}{2}
|
||||
|
||||
\pagebreak
|
||||
In the third example \plc{myvec} structures are
|
||||
nested within a \plc{mypoints} structure. The \plc{myvec\_t} type is mapped
|
||||
as in the first example. Following the \plc{mypoints} structure declaration,
|
||||
the \plc{mypoints\_t} type is mapped by a \code{declare mapper} directive.
|
||||
For this structure the \plc{hostonly\_data} element will not be mapped;
|
||||
also the array section of \plc{x} (\plc{v.x[:1]}) and \plc{x} will be mapped; and
|
||||
\plc{scratch} will be allocated and used as scratch storage on the device.
|
||||
The default map-type mapping, \code{tofrom}, applies to the \plc{x} array section,
|
||||
but not to \plc{scratch} which is explicitly mapped with the \code{alloc} map-type.
|
||||
Note: the variable \plc{v} is not included in the map list (otherwise
|
||||
the \plc{hostonly\_data} would be mapped)-- just the elements
|
||||
to be mapped are listed.
|
||||
|
||||
The two mappers are combined when a \plc{mypoints\_t} structure type is mapped,
|
||||
because the mapper \plc{myvec\_t} structure type is used within a \plc{mypoints\_t}
|
||||
type structure.
|
||||
%Note, in the main program \plc{P} is an array of \plc{mypoints\_t} type structures,
|
||||
%and hence every element of the array is mapped with the mapper prescription.
|
||||
|
||||
\cexample{target_mapper}{3}
|
||||
|
||||
\ffreeexample{target_mapper}{3}
|
||||
|
46
Examples_target_offload.tex
Normal file
@ -0,0 +1,46 @@
|
||||
\pagebreak
|
||||
\section{Target Offload}
|
||||
\label{sec:target_offload}
|
||||
|
||||
In OpenMP 5.0 the \code{OMP\_TARGET\_OFFLOAD}
|
||||
environment variable was defined to change \plc{default} offload behavior.
|
||||
By \plc{default} the target code (region) is executed on the host if the target device
|
||||
does not exist or the implementation does not support the target device.
|
||||
%Last sentence uses words of the 5.0 spec pg. 21 lines 7-8
|
||||
|
||||
In an OpenMP 5.0 compliant implementation, setting the
|
||||
\code{OMP\_TARGET\_OFFLOAD} variable to \code{MANDATORY} will
|
||||
force the program to terminate execution when a \code{target}
|
||||
construct is encountered and the target device is not supported or is not available.
|
||||
With a value \code{DEFAULT} the target region will execute on a device if the
|
||||
device exists and is supported by the implementation,
|
||||
otherwise it will execute on the host.
|
||||
Support for the \code{DISABLED}
|
||||
value is optional; when it is supported the behavior is as if only the
|
||||
host device exists (other devices are considered non-existent to the runtime),
|
||||
and target regions are executed on the host.
|
||||
|
||||
The following example reports execution behavior for different
|
||||
values of the \code{OMP\_TARGET\_OFFLOAD} variable. A handy routine
|
||||
for extracting the \code{OMP\_TARGET\_OFFLOAD} environment variable
|
||||
value is used here, because the OpenMP API does not have a routine
|
||||
for obtaining the value. %(\texit{yet}).
|
||||
|
||||
Note:
|
||||
The example issues a warning when a pre-5.0 implementation is used,
|
||||
indicating that the \code{OMP\_TARGET\_OFFLOAD} variable is ignored.
|
||||
The value of the \code{OMP\_TARGET\_OFFLOAD} variable is reported
|
||||
when the \code{OMP\_DISPLAY\_ENV}
|
||||
environment variable is set to \code{TRUE} or \code{VERBOSE}.
|
||||
|
||||
%\pagebreak
|
||||
\cexample{target_offload_control}{1}
|
||||
|
||||
%\pagebreak
|
||||
\ffreeexample{target_offload_control}{1}
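
A helper along the following lines can read the variable directly; this is a
hypothetical sketch (the function name and enumeration are assumptions, and it
is not the helper used in the referenced example):

\begin{verbatim}
#include <stdlib.h>
#include <string.h>

typedef enum { OFFLOAD_DEFAULT, OFFLOAD_MANDATORY, OFFLOAD_DISABLED } offload_t;

offload_t get_offload_policy(void)
{
   const char *env = getenv("OMP_TARGET_OFFLOAD");  /* not an OpenMP API call  */

   if (env == NULL)                   return OFFLOAD_DEFAULT;
   if (strcmp(env, "MANDATORY") == 0) return OFFLOAD_MANDATORY;  /* upper-case */
   if (strcmp(env, "DISABLED")  == 0) return OFFLOAD_DISABLED;   /* values     */
   return OFFLOAD_DEFAULT;            /* DEFAULT or unrecognized value         */
}
\end{verbatim}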
|
||||
|
||||
|
||||
% OMP 4.5 target offload 15:9-11
|
||||
%If the target device does not exist or the
|
||||
%implementation does not support the target device, all target regions associated with that device
|
||||
%execute on the host device.
|
53
Examples_target_pointer_mapping.tex
Normal file
@ -0,0 +1,53 @@
|
||||
\pagebreak
|
||||
\section{Pointer mapping}
|
||||
\label{sec:pointer_mapping}
|
||||
|
||||
The following example shows the basics of mapping pointers with and without
|
||||
associated storage on the host.
|
||||
|
||||
Storage for pointers \plc{ptr1} and \plc{ptr2} is created on the host.
|
||||
To map storage that is associated with a pointer on the host, the data can be
|
||||
explicitly mapped as an array section so that the compiler knows
|
||||
the amount of data to be assigned on the device (to the ``corresponding'' data storage area).
|
||||
On the \code{target} construct array sections are mapped; however, the pointer \plc{ptr1}
|
||||
is mapped, while \plc{ptr2} is not. Since \plc{ptr2} is not explicitly mapped, it is
|
||||
firstprivate. This creates a subtle difference in the way these pointers can be used.
|
||||
|
||||
As a firstprivate pointer, \plc{ptr2} can be manipulated on the device;
|
||||
however, as an explicitly mapped pointer,
|
||||
\plc{ptr1} becomes an \emph{attached} pointer and cannot be manipulated.
|
||||
In both cases the host pointer is not updated with the device pointer
|
||||
address---as one would expect for distributed memory.
|
||||
The storage data on the host is updated from the corresponding device
|
||||
data at the end of the \code{target} region.
|
||||
|
||||
As a comparison, note that the \plc{aray} array is automatically mapped,
|
||||
since the compiler knows the extent of the array.
|
||||
|
||||
The pointer \plc{ptr3} is used in the \code{target} region and has
|
||||
a data-sharing attribute of firstprivate.
|
||||
The pointer is implicitly mapped to a zero-length array section.
|
||||
Neither the pointer address nor any
|
||||
of its locally assigned data on the device is returned
|
||||
to the host.
|
||||
|
||||
\cexample{target_ptr_map}{1}
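
The mapping choices described above can be sketched as follows (a simplified
fragment, not the \plc{target\_ptr\_map.1} code; the array size is an
assumption):

\begin{verbatim}
#include <stdlib.h>
#define N 8

int main(void)
{
   int  aray[N];
   int *ptr1 = (int *)malloc(N * sizeof(int));
   int *ptr2 = (int *)malloc(N * sizeof(int));

   #pragma omp target map(ptr1, ptr1[:N], ptr2[:N])
   {
      ptr1[1] = 2;  /* ptr1 is mapped: it is attached and must not be modified  */
      ptr2[1] = 2;  /* ptr2 is firstprivate; only its pointed-to data is mapped */
      ptr2 = ptr1;  /* allowed: a device-local change to the firstprivate ptr2  */
      aray[1] = 2;  /* aray is mapped automatically since its extent is known   */
   }
   /* the storage behind ptr1, ptr2 and aray is updated on the host;
      the host values of ptr1 and ptr2 themselves are unchanged       */

   free(ptr1); free(ptr2);
   return 0;
}
\end{verbatim}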
|
||||
|
||||
In the following example the global pointer \plc{p} appears in a
|
||||
\code{declare}~\code{target} directive. Hence, the pointer \plc{p} will
|
||||
persist on the device throughout executions in all target regions.
|
||||
|
||||
The pointer is also used in an array section of a \code{map} clause on
|
||||
a \code{target} construct. When storage associated with
|
||||
a \code{declare}~\code{target} pointer
|
||||
is mapped, as for the array section \plc{p[:N]} in the
|
||||
\code{target} construct, the array section on the device is \emph{attached}
|
||||
to the device pointer \plc{p} on entry to the construct, and
|
||||
the value of the device pointer \plc{p} becomes undefined on exit.
|
||||
(Of course, storage allocation for
|
||||
the array section on the device will occur before the
|
||||
pointer on the device is \emph{attached}.)
|
||||
% For globals with declare target is there such a things a
|
||||
% original and corresponding?
|
||||
|
||||
\cexample{target_ptr_map}{2}
|
54
Examples_target_structure_mapping.tex
Normal file
@ -0,0 +1,54 @@
|
||||
\pagebreak
|
||||
\section{Structure mapping}
|
||||
\label{sec:structure_mapping}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
In the example below, only structure elements \plc{S.a}, \plc{S.b} and \plc{S.p}
|
||||
of the \plc{S} structure appear in \code{map} clauses of a \code{target} construct.
|
||||
Only these components have corresponding variables and storage on the device.
|
||||
Hence, the large arrays, \plc{S.buffera} and \plc{S.bufferb}, and the \plc{S.x} component have no storage
|
||||
on the device and cannot be accessed.
|
||||
|
||||
Also, since the pointer member \plc{S.p} is used in an array section of a
|
||||
\code{map} clause, the array storage of the array section on the device,
|
||||
\plc{S.p[:N]}, is \emph{attached} to the pointer member \plc{S.p} on the device.
|
||||
Explicitly mapping the pointer member \plc{S.p} is optional in this case.
|
||||
|
||||
Note: The buffer arrays and the \plc{x} variable have been grouped together, so that
|
||||
the components that will reside on the device are all together (without gaps).
|
||||
This allows the runtime to optimize the transfer and the storage footprint on the device.
|
||||
|
||||
\cexample{target_struct_map}{1}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
The following example is a slight modification of the above example for
|
||||
a C++ class. In the member function \plc{SAXPY::driver}
|
||||
the array section \plc{p[:N]} is \emph{attached} to the pointer member \plc{p}
|
||||
on the device.
|
||||
|
||||
\cppexample{target_struct_map}{2}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
%In this example a pointer, \plc{p}, is mapped in a
|
||||
%\code{target}~\code{data} construct (\code{map(p)}) and remains
|
||||
%persistent throughout the \code{target}~\code{data} region. The address stored
|
||||
%on the host is not assigned to the device pointer variable, and
|
||||
%the device value is not copied back to the host at the end of the
|
||||
%region (for a pointer, it is as though \code{map(alloc:p}) is effectively
|
||||
%used). The array section, \plc{p[:N]}, is mapped on both \code{target}
|
||||
%constructs, and the pointer \plc{p} on the device is attached at the
|
||||
%beginning and detached at the end of the regions to the newly created
|
||||
%array section on the device.
|
||||
%
|
||||
%Also, in the following example the global variable, \plc{a}, becomes
|
||||
%allocated when it is first used on the device in a \code{target} region,
|
||||
%and persists on the device for all target regions. The value on the
|
||||
%device and host may be different, as shown by the print statements.
|
||||
%The values may be made consistent with the \code{update} construct,
|
||||
%as shown in the \plc{declare\_target.3.c} and \plc{declare\_target.3.f90}
|
||||
%examples.
|
||||
%
|
||||
%\cexample{target_struct_map}{2}
|
@ -24,6 +24,7 @@ construct occurs before the host data is deleted.
|
||||
|
||||
\cppexample{target_unstructured_data}{1}
|
||||
|
||||
\pagebreak
|
||||
The following C code allocates and frees the data member of a Matrix structure.
|
||||
The \code{init\_matrix} function allocates the memory used in the structure and
|
||||
uses the \code{target}~\code{enter}~\code{data} directive to map it to the target device. The
|
||||
@ -34,6 +35,7 @@ and then frees the memory on the host. Note, the stand-alone
|
||||
|
||||
\cexample{target_unstructured_data}{1}
|
||||
|
||||
\pagebreak
|
||||
The following Fortran code allocates and deallocates a module array. The
|
||||
\code{initialize} subroutine allocates the module array and uses the
|
||||
\code{target}~\code{enter}~\code{data} directive to map it to the target device. The
|
||||
|
32
Examples_task_affinity.tex
Normal file
@ -0,0 +1,32 @@
|
||||
\section{Task Affinity}
|
||||
\label{sec: task_affinity}
|
||||
|
||||
The next example illustrates the use of the \code{affinity}
|
||||
clause with a \code{task} construct.
|
||||
The variables in the \code{affinity} clause provide a
|
||||
hint to the runtime that the task should execute
|
||||
"close" to the physical storage location of the variables. For example,
|
||||
on a two-socket platform with a local memory component
|
||||
close to each processor socket, the runtime will attempt to
|
||||
schedule the task execution on the socket where the storage is located.
|
||||
|
||||
Because the C/C++ code employs a pointer, an array section is used in
|
||||
the \code{affinity} clause.
|
||||
Fortran code can use an array reference to specify the storage, as
|
||||
shown here.
|
||||
|
||||
Note, in the second task of the C/C++ code the \plc{B} pointer is declared
|
||||
shared. Otherwise, by default, it would be firstprivate since it is a local
|
||||
variable, and would probably be saved for the second task before being assigned
|
||||
a storage address by the first task. Also, one might think it reasonable to use
|
||||
the \code{affinity} clause \plc{affinity(B[:N])} on the second \code{task} construct.
|
||||
However, the storage behind \plc{B} is created in the first task, and the
|
||||
array section reference may not be valid when the second task is generated.
|
||||
The use of the \plc{A} array is sufficient for this case, because one
|
||||
would expect the storage for \plc{A} and \plc{B} to be physically ``close''
|
||||
(as provided by the hint in the first task).
|
||||
|
||||
\cexample{affinity}{6}
|
||||
|
||||
\ffreeexample{affinity}{6}
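
A compact sketch of the idea follows (not the \plc{affinity.6} code; the
\code{depend} clauses are an assumption used here to order the two tasks):

\begin{verbatim}
#include <stdlib.h>
#define N 1000

void process(double *A)
{
   double *B;

   #pragma omp parallel
   #pragma omp single
   {
      #pragma omp task shared(B) depend(out: B) affinity(A[0:N])
      {                                  /* hint: run near the storage of A    */
         B = (double *)malloc(N * sizeof(double));
         for (int i = 0; i < N; i++) B[i] = A[i];
      }

      #pragma omp task shared(B) depend(in: B) affinity(A[0:N])
      {                                  /* B was allocated by the first task; */
         for (int i = 0; i < N; i++)     /* A is expected to be close to B     */
            B[i] *= 2.0;
      }
   }
}
\end{verbatim}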
|
||||
|
@ -5,7 +5,7 @@
|
||||
\subsection{Flow Dependence}
|
||||
\label{subsec:task_flow_depend}
|
||||
|
||||
In this example we show a simple flow dependence expressed using the \code{depend}
|
||||
This example shows a simple flow dependence using a \code{depend}
|
||||
clause on the \code{task} construct.
|
||||
|
||||
\cexample{task_dep}{1}
|
||||
@ -20,7 +20,7 @@ would have a race condition.
|
||||
\subsection{Anti-dependence}
|
||||
\label{subsec:task_anti_depend}
|
||||
|
||||
In this example we show an anti-dependence expressed using the \code{depend}
|
||||
This example shows an anti-dependence using the \code{depend}
|
||||
clause on the \code{task} construct.
|
||||
|
||||
\cexample{task_dep}{2}
|
||||
@ -35,7 +35,7 @@ race condition.
|
||||
\subsection{Output Dependence}
|
||||
\label{subsec:task_out_depend}
|
||||
|
||||
In this example we show an output dependence expressed using the \code{depend}
|
||||
This example shows an output dependence using the \code{depend}
|
||||
clause on the \code{task} construct.
|
||||
|
||||
\cexample{task_dep}{3}
|
||||
@ -47,6 +47,7 @@ clauses enforce the ordering of the tasks. If the \code{depend} clauses had been
|
||||
omitted, then the tasks could execute in any order and the program would have a
|
||||
race condition.
|
||||
|
||||
\pagebreak
|
||||
\subsection{Concurrent Execution with Dependences}
|
||||
\label{subsec:task_concurrent_depend}
|
||||
|
||||
@ -75,3 +76,141 @@ NxN elements, and the multiplication is implemented using blocks of BSxBS elemen
|
||||
|
||||
\ffreeexample{task_dep}{5}
|
||||
|
||||
\subsection{\code{taskwait} with Dependences}
|
||||
\label{subsec:taskwait_depend}
|
||||
|
||||
In this subsection three examples illustrate how the
|
||||
\code{depend} clause can be applied to a \code{taskwait} construct to make the
|
||||
generating task wait for specific child tasks to complete. This is an OpenMP 5.0 feature.
|
||||
In the same manner that
|
||||
dependences can order executions among child tasks with \code{depend} clauses on
|
||||
\code{task} constructs, the generating task can be scheduled to wait on child tasks
|
||||
at a \code{taskwait} before it can proceed.
|
||||
|
||||
Note: Since the \code{depend} clause on a \code{taskwait} construct relaxes the
|
||||
default synchronization behavior (waiting for all children to finish), it is important to
|
||||
realize that child tasks that are not predecessor tasks, as determined by the \code{depend}
|
||||
clause of the \code{taskwait} construct, may be running concurrently while the
|
||||
generating task is executing after the taskwait.
|
||||
|
||||
In the first example the generating task waits at the \code{taskwait} construct
|
||||
for the completion of the first child task because a dependence on the first task
|
||||
is produced by \plc{x} with an \code{in} dependence type within the \code{depend}
|
||||
clause of the \code{taskwait} construct.
|
||||
Immediately after the first \code{taskwait} construct it is safe to access the
|
||||
\plc{x} variable by the generating task, as shown in the print statement.
|
||||
There is no completion constraint on the second child task.
|
||||
Hence, immediately after the first \code{taskwait} it is unsafe to access the
|
||||
\plc{y} variable since the second child task may still be executing.
|
||||
The second \code{taskwait} ensures that the second child task has completed; hence
|
||||
it is safe to access the \plc{y} variable in the following print statement.
|
||||
|
||||
\cexample{task_dep}{6}
|
||||
|
||||
\ffreeexample{task_dep}{6}
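
The structure of this first case can be sketched as follows (a simplified
fragment, not the \plc{task\_dep.6} code):

\begin{verbatim}
#include <stdio.h>

int main(void)
{
   int x = 0, y = 0;

   #pragma omp parallel
   #pragma omp single
   {
      #pragma omp task depend(out: x)     /* child task 1 */
      x = 1;

      #pragma omp task depend(out: y)     /* child task 2 */
      y = 2;

      #pragma omp taskwait depend(in: x)  /* waits only on the task writing x   */
      printf("x = %d\n", x);              /* safe: child task 1 has completed   */

      #pragma omp taskwait                /* waits on all remaining child tasks */
      printf("y = %d\n", y);              /* safe: child task 2 has completed   */
   }
   return 0;
}
\end{verbatim}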
|
||||
|
||||
In this example the first two tasks are serialized, because a dependence on
|
||||
the first child is produced by \plc{x} with the \code{in} dependence type
|
||||
in the \code{depend} clause of the second task.
|
||||
However, the generating task at the first \code{taskwait} waits only on the
|
||||
first child task to complete, because a dependence on only the first child task
|
||||
is produced by \plc{x} with an \code{in} dependence type within the
|
||||
\code{depend} clause of the \code{taskwait} construct.
|
||||
The second \code{taskwait} (without a \code{depend} clause) is included
|
||||
to guarantee completion of the second task before \plc{y} is accessed.
|
||||
(While unnecessary, the \code{depend(inout:} \code{y)} clause on the second child task is
|
||||
included to illustrate how the child task dependences can be completely annotated
|
||||
in a data-flow model.)
|
||||
|
||||
|
||||
\cexample{task_dep}{7}
|
||||
|
||||
\ffreeexample{task_dep}{7}
|
||||
|
||||
|
||||
This example is similar to the previous one, except the generating task is
|
||||
directed to also wait for completion of the second task.
|
||||
|
||||
The \code{depend} clause of the \code{taskwait} construct now includes an
|
||||
\code{in} dependence type for \plc{y}. Hence the generating task must now
|
||||
wait on completion of any child task having \plc{y} with an \code{out}
|
||||
(here \code{inout}) dependence type in its \code{depend} clause.
|
||||
So, the \code{depend} clause of the \code{taskwait} construct now constrains
|
||||
the second task to complete at the \code{taskwait}, too.
|
||||
%--both tasks must now complete execution at the \code{taskwait}.
|
||||
(This change makes the second \code{taskwait} of the previous example unnecessary;
|
||||
it has been removed in this example.)
|
||||
|
||||
Note: While a \code{taskwait} construct ensures that all child tasks have completed, a \code{depend} clause on a \code{taskwait}
|
||||
construct only waits for specific child tasks (prescribed by the dependence type and list
|
||||
items in the \code{taskwait}'s \code{depend} clause).
|
||||
This and the previous example illustrate the need to carefully determine
|
||||
the dependence type of variables in the \code{taskwait} \code{depend} clause
|
||||
when selecting child tasks that the generating task must wait on, so that its execution after the
|
||||
taskwait does not produce race conditions on variables accessed by non-completed child tasks.
|
||||
|
||||
\cexample{task_dep}{8}
|
||||
|
||||
\ffreeexample{task_dep}{8}
|
||||
|
||||
\pagebreak
|
||||
\subsection{Mutually Exclusive Execution with Dependences}
|
||||
\label{subsec:task_dep_mutexinoutset}
|
||||
|
||||
This example shows a series of tasks, including mutually exclusive
|
||||
tasks, expressing dependences using the \code{depend} clause on the
|
||||
\code{task} construct.
|
||||
|
||||
The program will always print~6. Tasks T1, T2 and T3 will be scheduled first,
|
||||
in any order. Task T4 will be scheduled after tasks T1 and T2 are
|
||||
completed. T5 will be scheduled after tasks T1 and T3 are completed. Due
|
||||
to the \code{mutexinoutset} dependence type on \code{c}, T4 and T5 may be
|
||||
scheduled in any order with respect to each other, but not at the same
|
||||
time. Task T6 will be scheduled after both T4 and T5 are completed.
|
||||
|
||||
\cexample{task_dep}{9}
|
||||
|
||||
\ffreeexample{task_dep}{9}
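
The task graph described above can be sketched as follows (a simplified
version, not the \plc{task\_dep.9} file):

\begin{verbatim}
#include <stdio.h>

int main(void)
{
   int a, b, c, d;

   #pragma omp parallel
   #pragma omp single
   {
      #pragma omp task depend(out: c)                          /* T1 */
      c = 1;
      #pragma omp task depend(out: a)                          /* T2 */
      a = 2;
      #pragma omp task depend(out: b)                          /* T3 */
      b = 3;
      #pragma omp task depend(in: a) depend(mutexinoutset: c)  /* T4 */
      c += a;
      #pragma omp task depend(in: b) depend(mutexinoutset: c)  /* T5 */
      c += b;
      #pragma omp task depend(in: c)                           /* T6 */
      d = c;
   }
   printf("%d\n", d);   /* always prints 6 */
   return 0;
}
\end{verbatim}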
|
||||
|
||||
The following example demonstrates a situation where the \code{mutexinoutset}
|
||||
dependence type is advantageous. If \code{shortTaskB} completes
|
||||
before \code{longTaskA}, the runtime can take advantage of this by
|
||||
scheduling \code{longTaskBC} before \code{shortTaskAC}.
|
||||
|
||||
\cexample{task_dep}{10}
|
||||
|
||||
\ffreeexample{task_dep}{10}
|
||||
|
||||
\subsection{Multidependences Using Iterators}
|
||||
\label{subsec:depend_iterator}
|
||||
|
||||
The following example uses an iterator to define a dynamic number of
|
||||
dependences.
|
||||
|
||||
In the \code{single} construct of a parallel region a loop generates \plc{n} tasks
|
||||
and each task has an \code{out} dependence specified through an element of
|
||||
the \plc{v} array. This is followed by a single task that defines an \code{in}
|
||||
dependence on each element of the array. This is accomplished by
|
||||
using the \code{iterator} modifier in the \code{depend} clause, supporting a dynamic number
|
||||
of dependences (\plc{n} here).
|
||||
|
||||
The task for the \plc{print\_all\_elements} function is not executed until all dependences
|
||||
prescribed (or registered) by the iterator are fulfilled; that is,
|
||||
after all the tasks generated by the loop have completed.
|
||||
|
||||
Note, one cannot simply use an array section in the \code{depend} clause
|
||||
of the second task construct because this would violate the \code{depend} clause restriction:
|
||||
|
||||
"List items used in \code{depend} clauses of the same task or sibling tasks
|
||||
must indicate identical storage locations or disjoint storage locations''.
|
||||
|
||||
In this case each of the loop tasks uses a single disjoint (different storage)
|
||||
element in their \code{depend} clause; however,
|
||||
the array-section storage area prescribed in the commented directive is neither
|
||||
identical nor disjoint to the storage prescribed by the elements of the
|
||||
loop tasks. The iterator overcomes this restriction by effectively
|
||||
creating n disjoint storage areas.
|
||||
|
||||
\cexample{task_dep}{11}
|
||||
|
||||
\ffreeexample{task_dep}{11}
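
The iterator form can be sketched as follows (a simplified fragment, not the
\plc{task\_dep.11} code; the loop count and task bodies are assumptions):

\begin{verbatim}
#include <stdio.h>
#define N 16

int main(void)
{
   int v[N];

   #pragma omp parallel
   #pragma omp single
   {
      for (int i = 0; i < N; i++) {
         #pragma omp task depend(out: v[i])   /* one 'out' dependence per task  */
         v[i] = i * i;
      }

      /* one 'in' dependence per element, generated dynamically by the iterator */
      #pragma omp task depend(iterator(it = 0:N), in: v[it])
      {
         for (int i = 0; i < N; i++) printf("%d ", v[i]);
         printf("\n");
      }
   }
   return 0;
}
\end{verbatim}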
|
||||
|
@ -11,7 +11,7 @@ is started (the task executing the root of the recursive \code{compute\_tree()}
|
||||
calls). While synchronizing tasks at the end of each tree traversal, using the
|
||||
\code{taskgroup} construct ensures that the formerly started background task
|
||||
does not participate in the synchronization, and is left free to execute in parallel.
|
||||
This is opposed to the behaviour of the \code{taskwait} construct, which would
|
||||
This is opposed to the behavior of the \code{taskwait} construct, which would
|
||||
include the background tasks in the synchronization.
|
||||
|
||||
\cexample{taskgroup}{1}
|
||||
|
@ -53,7 +53,7 @@ and start executing unassigned tasks. Once the number of unassigned tasks is su
|
||||
low, the thread may resume execution of the task generating loop.
|
||||
|
||||
\cexample{tasking}{5}
|
||||
\pagebreak
|
||||
|
||||
\fexample{tasking}{5}
|
||||
|
||||
The following example is the same as the previous one, except that the tasks are
|
||||
|
@ -12,3 +12,28 @@ The \code{nogroup} clause removes the implicit taskgroup of the \code{taskloop}
|
||||
\cexample{taskloop}{1}
|
||||
|
||||
\ffreeexample{taskloop}{1}
|
||||
|
||||
%\clearpage
|
||||
|
||||
Because a \code{taskloop} construct encloses a loop, it is often incorrectly
|
||||
perceived as a worksharing construct (when it is directly nested in
|
||||
a \code{parallel} region).
|
||||
|
||||
While a worksharing construct distributes the loop iterations across all threads in a team,
|
||||
the entire loop of a \code{taskloop} construct is executed by every thread of the team.
|
||||
|
||||
In the example below the first taskloop occurs closely nested within
|
||||
a \code{parallel} region and the entire loop is executed by each of the \plc{T} threads;
|
||||
hence the reduction sum is executed \plc{T}*\plc{N} times.
|
||||
|
||||
The loop of the second taskloop is within a \code{single} region and is executed
|
||||
by a single thread so that only \plc{N} reduction sums occur. (The other
|
||||
\plc{T}-1 threads of the \code{parallel} region will participate in executing the
|
||||
tasks. This is the common use case for the \code{taskloop} construct.)
|
||||
|
||||
In the example, the code thus prints \code{x1 = 16384} (\plc{T}*\plc{N}) and
|
||||
\code{x2 = 1024} (\plc{N}).
|
||||
|
||||
\cexample{taskloop}{2}
|
||||
|
||||
\ffreeexample{taskloop}{2}
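
The difference can be sketched as follows (a simplified fragment, not the
\plc{taskloop.2} code; \code{reduction} clauses are used here so that the
counts described above are well defined):

\begin{verbatim}
#include <stdio.h>
#define N 1024

int main(void)
{
   int x1 = 0, x2 = 0;

   #pragma omp parallel reduction(+: x1)   /* a team of T threads               */
   #pragma omp taskloop reduction(+: x1)   /* every thread runs the entire loop */
   for (int i = 0; i < N; i++) x1++;       /* -> T*N increments in total        */

   #pragma omp parallel reduction(+: x2)
   #pragma omp single                      /* only one thread generates the loop */
   #pragma omp taskloop reduction(+: x2)
   for (int i = 0; i < N; i++) x2++;       /* -> N increments in total           */

   printf("x1 = %d  x2 = %d\n", x1, x2);   /* e.g. 16384 and 1024 when T = 16    */
   return 0;
}
\end{verbatim}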
|
||||
|
89
Examples_udr.tex
Normal file
@ -0,0 +1,89 @@
|
||||
\subsection{User-Defined Reduction}
|
||||
\label{subsec:UDR}
|
||||
|
||||
The \code{declare}~\code{reduction} directive can be used to specify
|
||||
user-defined reductions (UDR) for user data types.
|
||||
|
||||
%The following examples show how user-defined reductions can be used to support user data types in the \code{reduction} clause.
|
||||
|
||||
%The following example computes the enclosing rectangle of a set of points. The point data structure (\code{struct}~\code{point}) is not supported by the \code{reduction} clause. Using two \code{declare}~\code{reduction} directives we define how a reduction for the point data structure is done for the \plc{min} and \plc{max} operations. Each \code{declare}~\code{reduction} directive calls the appropriate function that passes the two special variables that can be used in the user-defined reduction expression: \code{omp\_in}, which holds one of the two values to reduce, and \code{omp\_out}, which holds the other value and should hold also the result of the reduction once the expression has been executed. Note, also, that when defining the user-defined reduction for \plc{min} we specify how the private variables of each thread are to be initialized (that is, the neutral value). This is not the case for \plc{max} as the default values (that is, zero filling) are already adequate.
|
||||
|
||||
|
||||
In the following example, \code{declare}~\code{reduction} directives are used to define
|
||||
\plc{min} and \plc{max} operations for the \plc{point} data structure for computing
|
||||
the rectangle that encloses a set of 2-D points.
|
||||
|
||||
Each \code{declare}~\code{reduction} directive defines new reduction identifiers,
|
||||
\plc{min} and \plc{max}, to be used in a \code{reduction} clause. The next item in the
|
||||
declaration list is the data type (\plc{struct} \plc{point}) used in the reduction,
|
||||
followed by the combiner; here the functions \plc{minproc} and \plc{maxproc} perform
|
||||
the min and max operations, respectively, on the user data (of type \plc{struct} \plc{point}).
|
||||
In the function argument list are two special OpenMP variable identifiers, \code{omp\_in} and \code{omp\_out},
|
||||
that denote the two values to be combined in the ``real'' function;
|
||||
the \code{omp\_out} identifier indicates which one is to hold the result.
|
||||
|
||||
The initializer of the \code{declare}~\code{reduction} directive specifies
|
||||
the initial value for the private variable of each implicit task.
|
||||
The \code{omp\_priv} identifier is used to denote the private variable.
|
||||
|
||||
\cexample{udr}{1}
|
||||
|
||||
The following example shows the corresponding code in Fortran.
|
||||
The \code{declare}~\code{reduction} directives are specified as part of
|
||||
the declaration in subroutine \plc{find\_enclosing\_rectangle} and
|
||||
the procedures that perform the min and max operations are specified as subprograms.
|
||||
|
||||
\ffreeexample{udr}{1}
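
For reference, the overall shape of such a declaration in C can be sketched as
follows (a condensed fragment in the spirit of the example above, not the
\plc{udr.1} code itself):

\begin{verbatim}
#include <limits.h>

struct point { int x, y; };

void minproc(struct point *out, struct point *in)
{
   if (in->x < out->x) out->x = in->x;     /* combine two partial results */
   if (in->y < out->y) out->y = in->y;
}

#pragma omp declare reduction(min : struct point : minproc(&omp_out, &omp_in)) \
        initializer(omp_priv = { INT_MAX, INT_MAX })

void enclosing_min(struct point *pts, int n, struct point *minp)
{
   struct point m = { INT_MAX, INT_MAX };

   #pragma omp parallel for reduction(min: m)
   for (int i = 0; i < n; i++)
      minproc(&m, &pts[i]);                /* reduce each point into m */

   *minp = m;
}
\end{verbatim}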
|
||||
|
||||
|
||||
The following example shows the same computation as \plc{udr.1}, but it illustrates that complex expressions can be crafted in the user-defined reduction declaration. In this case, instead of calling the \plc{minproc} and \plc{maxproc} functions, the code is inlined in a single expression.
|
||||
|
||||
\cexample{udr}{2}
|
||||
|
||||
The corresponding code of the same example in Fortran is very similar
|
||||
except that the assignment expression in the \code{declare}~\code{reduction}
|
||||
directive can only be used for a single variable, in this case through
|
||||
a type structure constructor \plc{point($\ldots$)}.
|
||||
|
||||
\ffreeexample{udr}{2}
|
||||
|
||||
|
||||
The following example shows the use of special variables in arguments for combiner (\code{omp\_in} and \code{omp\_out}) and initializer (\code{omp\_priv} and \code{omp\_orig}) routines. This example returns the maximum value of an array and the corresponding index value. The \code{declare}~\code{reduction} directive specifies a user-defined reduction operation \plc{maxloc} for data type \plc{struct} \plc{mx\_s}. The function \plc{mx\_combine} is the combiner and the function \plc{mx\_init} is the initializer.
|
||||
|
||||
\cexample{udr}{3}
|
||||
|
||||
Below is the corresponding Fortran version of the above example. The \code{declare}~\code{reduction} directive specifies the user-defined operation \plc{maxloc} for user-derived type \plc{mx\_s}. The combiner \plc{mx\_combine} and the initializer \plc{mx\_init} are specified as subprograms.
|
||||
|
||||
\ffreeexample{udr}{3}
|
||||
|
||||
|
||||
The following example explains a few details of the user-defined reduction
|
||||
in Fortran through modules. The \code{declare}~\code{reduction} directive is declared in a module (\plc{data\_red}).
|
||||
The reduction-identifier \plc{.add.} is a user-defined operator, defined in the module
|
||||
so that it is accessible in the scope that performs the reduction
|
||||
operation.
|
||||
The user-defined operator \plc{.add.} and the subroutine \plc{dt\_init} specified in the \code{initializer} clause are defined in the same subprogram.
|
||||
|
||||
The reduction operation (that is, the \code{reduction} clause) is in the main program.
|
||||
The reduction identifier \plc{.add.} is accessible by use association.
|
||||
Since \plc{.add.} is a user-defined operator, the explicit interface
|
||||
should also be accessible by use association in the current
|
||||
program unit.
|
||||
Since the \code{declare}~\code{reduction} directive associated with this \code{reduction} clause
|
||||
has the \code{initializer} clause, the subroutine specified on the clause
|
||||
must be accessible in the current scoping unit. In this case,
|
||||
the subroutine \plc{dt\_init} is accessible by use association.
|
||||
|
||||
\ffreeexample{udr}{4}
|
||||
|
||||
|
||||
The following example uses user-defined reductions to declare a plus (+) reduction for a C++ class. As the \code{declare}~\code{reduction} directive is inside the context of the \plc{V} class, the expressions in the \code{declare}~\code{reduction} directive are resolved in the context of the class. Also, note that the \code{initializer} clause uses a copy constructor to initialize the private variables of the reduction, and it passes the original variable as its parameter through the special variable \code{omp\_orig}.
|
||||
|
||||
\cppexample{udr}{5}
|
||||
|
||||
The following example shows how user-defined reductions can be defined for some STL containers. The first \code{declare}~\code{reduction} defines the plus (+) operation for \plc{std::vector<int>} by making use of the \plc{std::transform} algorithm. The second and third define the merge (or concatenation) operation for \plc{std::vector<int>} and \plc{std::list<int>}.
|
||||
%It shows how the same user-defined reduction operation can be defined to be done differently depending on the specified data type.
|
||||
It shows how a user-defined reduction operation can be applied to specific STL data types.
|
||||
|
||||
\cppexample{udr}{6}
|
||||
|
77
Examples_variant.tex
Normal file
@ -0,0 +1,77 @@
|
||||
\pagebreak
|
||||
\section{\code{declare}~\code{variant} Directive}
|
||||
\label{sec:declare_variant}
|
||||
|
||||
%A \code{declare variant} directive specifies that the following function is an alternate function,
|
||||
%a \plc{function variant}, to be used in place of the specified \plc{base function}
|
||||
%when the trait within the \code{match} clause has a valid context.
|
||||
|
||||
A \code{declare}~\code{variant} directive specifies an alternate function,
|
||||
a \plc{function variant}, to be used in place of the \plc{base function}
|
||||
%when the trait within the \code{match} clause has a valid context.
|
||||
when the trait within the \code{match} clause matches the OpenMP context at a given call site.
|
||||
The base function follows the directive in the C and C++ languages.
|
||||
In Fortran, either a subroutine or function may be used as the \plc{base function},
|
||||
and the \code{declare}~\code{variant} directive must be in the specification
|
||||
part of a subroutine or function (unless a \plc{base-proc-name}
|
||||
modifier is used, as in the case of a procedure declaration statement). See
|
||||
the OpenMP 5.0 Specification for details on the modifier.
|
||||
|
||||
When multiple \code{declare}~\code{variant} directives are used,
|
||||
a function variant becomes a candidate for replacing the base function if the
|
||||
%base function call context matches the traits of all selectors in the \code{match} clause.
|
||||
context at the base function call matches the traits of all selectors in the \code{match} clause.
|
||||
If there are multiple candidates, a score is assigned with rules for each
|
||||
of the selector traits. The scoring algorithm can be found in the OpenMP 5.0 Specification.
|
||||
|
||||
In the first example the \plc{vxv()} function is called within a \code{parallel} region,
|
||||
a \code{target} region, and in a sequential part of the program. Two function variants, \plc{p\_vxv()} and \plc{t\_vxv()},
|
||||
are defined for the first two regions by using \plc{parallel} and \plc{target} selectors (within
|
||||
the \plc{construct} trait set) in a \code{match} clause. The \plc{p\_vxv()} function variant includes
|
||||
a \code{for} construct (\code{do} construct for Fortran) for the \code{parallel} region,
|
||||
while \plc{t\_vxv()} includes a \code{distribute}~\code{simd} construct for the \code{target} region.
|
||||
The \plc{t\_vxv()} function is explicitly compiled for the device using a \code{declare}~\code{target} directive.
|
||||
|
||||
Since the two \code{declare}~\code{variant} directives have no selectors that match traits for the context
|
||||
of the base function call in the sequential part of the program, the base \plc{vxv()} function is used there,
|
||||
as expected.
|
||||
(The vectors in the \plc{p\_vxv} and \plc{t\_vxv} functions have been multiplied
|
||||
by 3 and 2, respectively, for checking the validity of the replacement. Normally
|
||||
the purpose of a function variant is to produce the same results by a different method.)
|
||||
|
||||
%Note: a \code{target teams} construct is used to direct execution onto a device, with a
|
||||
%\code{distribute simd} construct in the function variant. As of the OpenMP 5.0 implementation
|
||||
%no intervening code is allowed between a \code{target} and \code{teams} construct. So
|
||||
%using a \code{target} construct to direct execution onto a device, and including
|
||||
%\code{teams distribute simd} in the variant function would produce non conforming code.
|
||||
|
||||
%\pagebreak
|
||||
\cexample{declare_variant}{1}
|
||||
|
||||
\ffreeexample{declare_variant}{1}
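
The overall shape of this first example can be sketched as follows (a reduced
fragment, not the \plc{declare\_variant.1} file; the variant bodies are
omitted):

\begin{verbatim}
void p_vxv(double *v1, double *v2, double *v3, int n);   /* parallel variant */
void t_vxv(double *v1, double *v2, double *v3, int n);   /* target variant   */
#pragma omp declare target to(t_vxv)

#pragma omp declare variant( p_vxv ) match( construct={parallel} )
#pragma omp declare variant( t_vxv ) match( construct={target}   )
void vxv(double *v1, double *v2, double *v3, int n)      /* base function    */
{
   for (int i = 0; i < n; i++) v3[i] = v1[i] * v2[i];
}

void caller(double *a, double *b, double *c, int n)
{
   vxv(a, b, c, n);        /* sequential part: the base vxv() is called      */

   #pragma omp parallel
   { vxv(a, b, c, n); }    /* construct={parallel} matches: p_vxv() is used  */

   #pragma omp target map(to: a[:n], b[:n]) map(from: c[:n])
   { vxv(a, b, c, n); }    /* construct={target} matches: t_vxv() is used    */
}
\end{verbatim}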
|
||||
|
||||
|
||||
%\pagebreak
|
||||
|
||||
In this example, traits from the \plc{device} set are used to select a function variant.
|
||||
In the \code{declare}~\code{variant} directive, an \plc{isa} selector
|
||||
specifies that if the implementation of the ``\plc{core-avx512}''
|
||||
instruction set is detected at compile time, the \plc{avx512\_saxpy()}
|
||||
variant function is used for the call to \plc{base\_saxpy()}.
|
||||
|
||||
A compilation of \plc{avx512\_saxpy()} is aware of
|
||||
the AVX-512 instruction set that supports 512-bit vector extensions (for Xeon or Xeon Phi architectures).
|
||||
Within \plc{avx512\_saxpy()}, the \code{parallel}~\code{for}~\code{simd} construct performs parallel execution, and
|
||||
takes advantage of 64-byte data alignment.
|
||||
When the \plc{avx512\_saxpy()} function variant is not selected, the base function \plc{base\_saxpy()},
|
||||
containing only a basic \code{parallel}~\code{for} construct, is used.
|
||||
|
||||
%Note:
|
||||
%An allocator is used to set the alignment to 64 bytes when an OpenMP compilation is performed.
|
||||
%Details about allocator variable declarations and functions
|
||||
%can be found in the allocator example of the Memory Management Chapter.
|
||||
|
||||
%\pagebreak
|
||||
\cexample{declare_variant}{2}
|
||||
|
||||
\ffreeexample{declare_variant}{2}
|
23
Foreword_Chapt.tex
Normal file
@ -0,0 +1,23 @@
|
||||
\pagebreak
|
||||
\chapter*{Foreword}
|
||||
\label{chap:foreword}
|
||||
\addcontentsline{toc}{chapter}{\protect\numberline{}Foreword}
|
||||
|
||||
The OpenMP Examples document has been updated with new features
|
||||
found in the OpenMP 5.0 Specification. The additional examples and updates
|
||||
are referenced in the Document Revision History of the Appendix, \specref{sec:history_45_to_50}.
|
||||
|
||||
Text describing an example with a 5.0 feature specifically states
|
||||
that the feature support begins in the OpenMP 5.0 Specification. Also,
|
||||
an \plc{omp\_5.0} keyword has been added to metadata in the source code.
|
||||
These distinctions are presented to remind readers that a 5.0 compliant
|
||||
OpenMP implementation is necessary to use these features in codes.
|
||||
|
||||
Examples for most of the 5.0 features are included in this document,
|
||||
and incremental releases will become available as more feature examples
|
||||
and updates are submitted and approved by the OpenMP Examples Subcommittee.
|
||||
|
||||
\bigskip
|
||||
Examples Subcommittee Co-chairs: \smallskip\linebreak
|
||||
Henry Jin (\textsc{NASA} Ames Research Center) \linebreak
|
||||
Kent Milfeld (\textsc{TACC}, Texas Advanced Computing Center)
|
111
History.tex
@ -1,39 +1,84 @@
|
||||
\chapter{Document Revision History}
|
||||
\label{chap:history}
|
||||
|
||||
\section{Changes from 4.5.0 to 5.0.0}
|
||||
\label{sec:history_45_to_50}
|
||||
|
||||
\begin{itemize}
|
||||
\item Added the following examples for the 5.0 features:
|
||||
|
||||
\begin{itemize}
|
||||
\item Extended \code{teams} construct for host execution (\specref{sec:host_teams})
|
||||
\item \code{loop} and \code{teams}~\code{loop} constructs specify loop iterations that can execute concurrently
|
||||
(\specref{sec:loop})
|
||||
\item Task data affinity is indicated by \code{affinity} clause of \code{task} construct
|
||||
(\specref{sec:task_affinity})
|
||||
\item Display thread affinity with \code{OMP\_DISPLAY\_AFFINITY} environment variable or \code{omp\_display\_affinity()} API routine
|
||||
(\specref{sec:affinity_display})
|
||||
\item \code{taskwait} with dependences (\specref{subsec:taskwait_depend})
|
||||
\item \code{mutexinoutset} task dependences (\specref{subsec:task_dep_mutexinoutset})
|
||||
\item Multidependence Iterators (in \code{depend} clauses) (\specref{subsec:depend_iterator})
|
||||
\item Combined constructs: \code{parallel}~\code{master}~\code{taskloop} and \code{parallel}~\code{master}~\code{taskloop}~\code{simd}
|
||||
(\specref{sec:parallel_master_taskloop})
|
||||
\item Reverse Offload through \plc{ancestor} modifier of \code{device} clause. (\specref{subsec:target_reverse_offload})
|
||||
\item Array Shaping with the \plc{shape-operator} (\specref{sec:array-shaping})
|
||||
\item The \code{declare}~\code{mapper} construct (\specref{sec:declare_mapper})
|
||||
\item Acquire and Release Semantics Synchronization: Memory ordering
|
||||
clauses \code{acquire}, \code{release}, and \code{acq\_rel} were added
|
||||
to flush and atomic constructs
|
||||
(\specref{sec:acquire_and_release_semantics})
|
||||
\item \code{depobj} construct provides dependence objects for subsequent use in \code{depend} clauses
|
||||
(\specref{sec:depobj})
|
||||
\item \code{reduction} clause for \code{task} construct (\specref{subsec:task_reduction})
|
||||
\item \code{reduction} clause for \code{taskloop} construct (\specref{subsec:taskloop_reduction})
|
||||
\item \code{reduction} clause for \code{taskloop}~\code{simd} construct (\specref{subsec:taskloop_reduction})
|
||||
\item Memory Allocators for making OpenMP memory requests with traits (\specref{sec:allocators})
|
||||
\item \code{requires} directive specifies required features of implementation (\specref{sec:requires})
|
||||
\item \code{declare}~\code{variant} directive - for function variants (\specref{sec:declare_variant})
|
||||
\item \code{metadirective} directive - for directive variants (\specref{sec:metadirective})
|
||||
\end{itemize}
|
||||
|
||||
\item Included the following additional examples for the 4.x features:
|
||||
\begin{itemize}
|
||||
\item more taskloop examples (\specref{sec:taskloop})
|
||||
\item user-defined reduction (UDR) (\specref{subsec:UDR})
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
|
||||
\section{Changes from 4.0.2 to 4.5.0}
|
||||
\begin{itemize}
|
||||
\item Reorganized into chapters of major topics
|
||||
\item Included file extensions in example labels to indicate source type
|
||||
\item Applied the explicit \code{map(tofrom)} for scalar variables
|
||||
in a number of examples to comply with
|
||||
the change of the default behavior for scalar variables from
|
||||
\code{map(tofrom)} to \code{firstprivate} in the 4.5 specification
|
||||
\item Added the following new examples:
|
||||
|
||||
\begin{itemize}
|
||||
\item \code{linear} clause in loop constructs (\specref{sec:linear_in_loop})
|
||||
\item task priority (\specref{sec:task_priority})
|
||||
\item \code{taskloop} construct (\specref{sec:taskloop})
|
||||
\item \code{linear} clause in loop constructs (\specref{sec:linear_in_loop})
|
||||
\item \code{priority} clause for \code{task} construct (\specref{sec:task_priority})
|
||||
\item \code{taskloop} construct (\specref{sec:taskloop})
|
||||
\item \plc{directive-name} modifier in multiple \code{if} clauses on
|
||||
a combined construct (\specref{subsec:target_if})
|
||||
\item unstructured data mapping (\specref{sec:target_enter_exit_data})
|
||||
\item \code{link} clause for \code{declare}~\code{target} directive
|
||||
(\specref{subsec:declare_target_link})
|
||||
\item asynchronous target execution with \code{nowait} clause (\specref{sec:async_target_exec_depend})
|
||||
\item device memory routines and device pointers
|
||||
(\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item doacross loop nest (\specref{sec:doacross})
|
||||
\item locks with hints (\specref{sec:locks})
|
||||
\item C/C++ array reduction (\specref{sec:reduction})
|
||||
\item device memory routines and device pointers (\specref{subsec:target_mem_and_device_ptrs})
|
||||
\item doacross loop nest (\specref{sec:doacross})
|
||||
\item locks with hints (\specref{sec:locks})
|
||||
\item C/C++ array reduction (\specref{subsec:reduction})
|
||||
\item C++ reference types in data sharing clauses (\specref{sec:cpp_reference})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\section{Changes from 4.0.1 to 4.0.2}
|
||||
|
||||
\begin{itemize}
|
||||
\item Names of examples were changed from numbers to mnemonics
|
||||
\item Added SIMD examples (\specref{sec:SIMD})
|
||||
\item Applied miscellaneous fixes in several source codes
|
||||
\item Added the revision history
|
||||
\end{itemize}
|
||||
@ -42,27 +87,29 @@ a combined construct (\specref{subsec:target_if})
|
||||
|
||||
Added the following new examples:
|
||||
\begin{itemize}
|
||||
\item the \code{proc\_bind} clause (\specref{sec:affinity})
|
||||
\item the \code{taskgroup} construct (\specref{sec:taskgroup})
|
||||
\end{itemize}
|
||||
|
||||
\section{Changes from 3.1 to 4.0}
|
||||
|
||||
Beginning with OpenMP 4.0, examples were placed in a separate document
|
||||
from the specification document.
|
||||
|
||||
Version 4.0 added the following new examples:
|
||||
\begin{itemize}
|
||||
\item task dependences (\specref{sec:task_depend})
|
||||
\item \code{target} construct (\specref{sec:target})
|
||||
\item \code{target} \code{data} construct (\specref{sec:target_data})
|
||||
\item \code{target} \code{update} construct (\specref{sec:target_update})
|
||||
\item \code{declare} \code{target} construct (\specref{sec:declare_target})
|
||||
\item \code{teams} constructs (\specref{sec:teams})
|
||||
\item asynchronous execution of a \code{target} region using tasks
|
||||
(\specref{subsec:async_target_with_tasks})
|
||||
\item array sections in device constructs (\specref{sec:array_sections})
|
||||
\item device runtime routines (\specref{sec:device})
|
||||
\item Fortran ASSOCIATE construct (\specref{sec:associate})
|
||||
\item cancellation constructs (\specref{sec:cancellation})
|
||||
\item Beginning with OpenMP 4.0, examples were placed in a separate document
|
||||
from the specification document.
|
||||
\item Version 4.0 added the following new examples:
|
||||
|
||||
\begin{itemize}
|
||||
\item task dependences (\specref{sec:task_depend})
|
||||
\item \code{target} construct (\specref{sec:target})
|
||||
\item \code{target}~\code{data} construct (\specref{sec:target_data})
|
||||
\item \code{target}~\code{update} construct (\specref{sec:target_update})
|
||||
\item \code{declare}~\code{target} construct (\specref{sec:declare_target})
|
||||
\item \code{teams} constructs (\specref{sec:teams})
|
||||
\item asynchronous execution of a \code{target} region using tasks (\specref{subsec:async_target_with_tasks})
|
||||
\item array sections in device constructs (\specref{sec:array_sections})
|
||||
\item device runtime routines (\specref{sec:device})
|
||||
\item Fortran ASSOCIATE construct (\specref{sec:associate})
|
||||
\item cancellation constructs (\specref{sec:cancellation})
|
||||
\end{itemize}
|
||||
|
||||
\end{itemize}
|
||||
|
@ -40,12 +40,6 @@ Memory Parallelization specifications, and is not part of the formal specificati
|
||||
assumes familiarity with the OpenMP specifications, and shares the typographical
|
||||
conventions used in that document.
|
||||
|
||||
\notestart
|
||||
\noteheader – This first release of the OpenMP Examples reflects the OpenMP Version 4.5
|
||||
specifications. Additional examples are being developed and will be published in future
|
||||
releases of this document.
|
||||
\noteend
|
||||
|
||||
The OpenMP API specification provides a model for parallel programming that is
|
||||
portable across shared memory architectures from different vendors. Compilers from
|
||||
numerous vendors support the OpenMP API.
|
||||
|
3
Makefile
@ -1,11 +1,12 @@
|
||||
# Makefile for the OpenMP Examples document in LaTex format.
|
||||
# For more information, see the master document, openmp-examples.tex.
|
||||
|
||||
version=4.5.0
|
||||
version=5.0.0
|
||||
default: openmp-examples.pdf
|
||||
|
||||
|
||||
CHAPTERS=Title_Page.tex \
|
||||
Foreword_Chapt.tex \
|
||||
Introduction_Chapt.tex \
|
||||
Examples_*.tex \
|
||||
History.tex
|
||||
|
14
README
@ -27,7 +27,7 @@ For copyright information, please see omp_copyright.txt.
|
||||
|
||||
2) Tags (meta data) for example sources
|
||||
|
||||
@@name: <ename>.<seq-no>[c|f]
|
||||
@@name: <ename>.<seq-no>[c|cpp|f|f90]
|
||||
@@type: C|C++|F-fixed|F-free
|
||||
@@compilable: yes|no|maybe
|
||||
@@linkable: yes|no|maybe
|
||||
@ -47,12 +47,16 @@ For copyright information, please see omp_copyright.txt.
|
||||
3) LaTeX macros for examples
|
||||
|
||||
- Source code with language h-rules
|
||||
\cexample{<ename>}{<seq-no>c}
|
||||
\fexample{<ename>}{<seq-no>f}
|
||||
\cexample{<ename>}{<seq-no>} % for C/C++ examples
|
||||
\cppexample{<ename>}{<seq-no>} % for C++ examples
|
||||
\fexample{<ename>}{<seq-no>} % for fixed-form Fortran examples
|
||||
\ffreeexample{<ename>}{<seq-no>} % for free-form Fortran examples
|
||||
|
||||
- Source code without language h-rules
|
||||
\cnexample{<ename>}{<seq-no>c}
|
||||
\fnexample{<ename>}{<seq-no>f}
|
||||
\cnexample{<ename>}{<seq-no>}
|
||||
\cppnexample{<ename>}{<seq-no>}
|
||||
\fnexample{<ename>}{<seq-no>}
|
||||
\ffreenexample{<ename>}{<seq-no>}
|
||||
|
||||
- Language h-rules
|
||||
\cspecificstart, \cspecificend
|
||||
|
@ -17,17 +17,17 @@
|
||||
|
||||
\vspace{1.0in}
|
||||
|
||||
\textbf{Version \VER{} -- \VERDATE}
|
||||
\textbf{Version \PVER{} -- \VERDATE}
|
||||
\end{center}
|
||||
\end{adjustwidth}
|
||||
|
||||
\vspace{2.3in} %was 3.0
|
||||
|
||||
Source codes for OpenMP \VER{} Examples can be downloaded from
|
||||
Source codes for OpenMP \PVER{} Examples can be downloaded from
|
||||
\href{https://github.com/OpenMP/Examples/tree/v\VER}{github}.\\
|
||||
|
||||
\begin{adjustwidth}{0pt}{1em}\setlength{\parskip}{0.25\baselineskip}%
|
||||
Copyright © 1997-2016 OpenMP Architecture Review Board.\\
|
||||
Copyright © 1997-2019 OpenMP Architecture Review Board.\\
|
||||
Permission to copy without fee all or part of this material is granted,
|
||||
provided the OpenMP Architecture Review Board copyright notice and
|
||||
the title of this document appear. Notice is given that copying is by
|
||||
@ -42,7 +42,9 @@ permission of OpenMP Architecture Review Board.\end{adjustwidth}
|
||||
\phantom{a}
|
||||
\emph{This page intentionally left blank}
|
||||
|
||||
%This working version enacted the following tickets: 180, 295, 299, 342, 381,
|
||||
%For final version, uncomment the line above, comment out the lines below
|
||||
%This working version enacted the following tickets: 287, 519, 550, 593,
|
||||
%674, 688, 689,
|
||||
%and a few other editorial changes.
|
||||
\vfill
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
Copyright (c) 1997-2016 OpenMP Architecture Review Board.
|
||||
Copyright (c) 1997-2019 OpenMP Architecture Review Board.
|
||||
All rights reserved.
|
||||
|
||||
Permission to redistribute and use without fee all or part of the source
|
||||
|
83
openmp-example.tex
Normal file
@ -0,0 +1,83 @@
|
||||
% Welcome to openmp-examples.tex.
|
||||
% This is the master LaTex file for the OpenMP Examples document.
|
||||
%
|
||||
% The files in this set include:
|
||||
%
|
||||
% openmp-examples.tex - this file, the master file
|
||||
% Makefile - makes the document
|
||||
% openmp.sty - the main style file
|
||||
% Title_Page.tex - the title page
|
||||
% openmplogo.png - the logo
|
||||
% Introduction_Chapt.tex - unnumbered introductory chapter
|
||||
% Examples_Chapt.tex - unnumbered chapter
|
||||
% Examples_Sects.tex - examples
|
||||
% sources/*.c, *.f - C/C++/Fortran example source files
|
||||
%
|
||||
% When editing this file:
|
||||
%
|
||||
% 1. To change formatting, appearance, or style, please edit openmp.sty.
|
||||
%
|
||||
% 2. Custom commands and macros are defined in openmp.sty.
|
||||
%
|
||||
% 3. Be kind to other editors -- keep a consistent style by copying-and-pasting to
|
||||
% create new content.
|
||||
%
|
||||
% 4. We use semantic markup, e.g. (see openmp.sty for a full list):
|
||||
% \code{} % for bold monospace keywords, code, operators, etc.
|
||||
% \plc{} % for italic placeholder names, grammar, etc.
|
||||
%
|
||||
% 5. Other recommendations:
|
||||
% Use the convenience macros defined in openmp.sty for the minor headers
|
||||
% such as Comments, Syntax, etc.
|
||||
%
|
||||
% To keep items together on the same page, prefer the use of
|
||||
% \begin{samepage}.... Avoid \parbox for text blocks as it interrupts line numbering.
|
||||
% When possible, avoid \filbreak, \pagebreak, \newpage, \clearpage unless that's
|
||||
% what you mean. Use \needspace{} cautiously for troublesome paragraphs.
|
||||
%
|
||||
% Avoid absolute lengths and measures in this file; use relative units when possible.
|
||||
% Vertical space can be relative to \baselineskip or ex units. Horizontal space
|
||||
% can be relative to \linewidth or em units.
|
||||
%
|
||||
% Prefer \emph{} to italicize terminology, e.g.:
|
||||
% This is a \emph{definition}, not a placeholder.
|
||||
% This is a \plc{var-name}.
|
||||
%
|
||||
|
||||
% The following says letter size, but the style sheet may change the size
|
||||
\documentclass[10pt,letterpaper,twoside,makeidx,hidelinks]{scrreprt}
|
||||
|
||||
% Text to appear in the footer on even-numbered pages:
|
||||
\newcommand{\VER}{5.0.0}
|
||||
\newcommand{\PVER}{\VER{}p1}
|
||||
\newcommand{\VERDATE}{February 2018}
|
||||
\newcommand{\footerText}{OpenMP Examples Version \PVER{} - \VERDATE}
|
||||
|
||||
% Unified style sheet for OpenMP documents:
|
||||
\input{openmp.sty}
|
||||
|
||||
|
||||
\begin{document}
|
||||
\pagenumbering{roman}
|
||||
|
||||
\setcounter{page}{0}
|
||||
\setcounter{tocdepth}{2}
|
||||
|
||||
|
||||
% Uncomment the next line to enable line numbering on the main body text:
|
||||
\linenumbers\pagewiselinenumbers
|
||||
|
||||
\newpage\pagenumbering{arabic}
|
||||
|
||||
\setcounter{chapter}{0} % start chapter numbering here
|
||||
|
||||
% \input{Chap_Single}
|
||||
\input{Example}
|
||||
|
||||
%\setcounter{chapter}{0} % restart chapter numbering with "letter A"
|
||||
%\renewcommand{\thechapter}{\Alph{chapter}}%
|
||||
%\appendix
|
||||
%\input{History}
|
||||
|
||||
\end{document}
|
||||
|
@ -8,6 +8,7 @@
|
||||
% openmp.sty - the main style file
|
||||
% Title_Page.tex - the title page
|
||||
% openmplogo.png - the logo
|
||||
% Forward_Chapt.tex - unnumbered introductory chapter
|
||||
% Introduction_Chapt.tex - unnumbered introductory chapter
|
||||
% Examples_Chapt.tex - unnumbered chapter
|
||||
% Examples_Sects.tex - examples
|
||||
@ -48,9 +49,10 @@
|
||||
\documentclass[10pt,letterpaper,twoside,makeidx,hidelinks]{scrreprt}
|
||||
|
||||
% Text to appear in the footer on even-numbered pages:
|
||||
\newcommand{\VER}{4.5.0}
|
||||
\newcommand{\VERDATE}{November 2016}
|
||||
\newcommand{\footerText}{OpenMP Examples Version \VER{} - \VERDATE}
|
||||
\newcommand{\VER}{5.0.0}
|
||||
\newcommand{\PVER}{\VER{}}
|
||||
\newcommand{\VERDATE}{November 2019}
|
||||
\newcommand{\footerText}{OpenMP Examples Version \PVER{} - \VERDATE}
|
||||
|
||||
% Unified style sheet for OpenMP documents:
|
||||
\input{openmp.sty}
|
||||
@ -70,6 +72,8 @@
|
||||
% Uncomment the next line to enable line numbering on the main body text:
|
||||
\linenumbers\pagewiselinenumbers
|
||||
|
||||
\input{Foreword_Chapt}
|
||||
|
||||
\newpage\pagenumbering{arabic}
|
||||
|
||||
\input{Introduction_Chapt}
|
||||
@ -80,6 +84,7 @@
|
||||
\input{Chap_parallel_execution}
|
||||
\input{Examples_ploop}
|
||||
\input{Examples_parallel}
|
||||
\input{Examples_host_teams}
|
||||
\input{Examples_nthrs_nesting}
|
||||
\input{Examples_nthrs_dynamic}
|
||||
\input{Examples_fort_do}
|
||||
@ -92,12 +97,15 @@
|
||||
\input{Examples_single}
|
||||
\input{Examples_workshare}
|
||||
\input{Examples_master}
|
||||
\input{Examples_loop}
|
||||
\input{Examples_pra_iterator}
|
||||
\input{Examples_set_dynamic_nthrs}
|
||||
\input{Examples_get_nthrs}
|
||||
|
||||
\input{Chap_affinity}
|
||||
\input{Examples_affinity}
|
||||
\input{Examples_task_affinity}
|
||||
\input{Examples_affinity_display}
|
||||
\input{Examples_affinity_query}
|
||||
|
||||
\input{Chap_tasking}
|
||||
@ -107,9 +115,15 @@
|
||||
\input{Examples_taskgroup}
|
||||
\input{Examples_taskyield}
|
||||
\input{Examples_taskloop}
|
||||
\input{Examples_parallel_master_taskloop}
|
||||
|
||||
\input{Chap_devices}
|
||||
\input{Examples_target}
|
||||
\input{Examples_target_pointer_mapping}
|
||||
\input{Examples_target_structure_mapping}
|
||||
\input{Examples_array_sections}
|
||||
\input{Examples_array_shaping}
|
||||
\input{Examples_target_mapper}
|
||||
\input{Examples_target_data}
|
||||
\input{Examples_target_unstructured_data}
|
||||
\input{Examples_target_update}
|
||||
@ -122,8 +136,8 @@
|
||||
%New subsection
|
||||
\input{Examples_async_target_nowait}
|
||||
\input{Examples_async_target_nowait_depend}
|
||||
\input{Examples_array_sections}
|
||||
% Structure Element in map 487
|
||||
% \input{Examples_array_sections} moved after struct_ptr_map
|
||||
% Structure Element in map 487 no 579
|
||||
\input{Examples_device}
|
||||
% MemoryRoutine and Device ptr 473
|
||||
|
||||
@ -140,7 +154,9 @@
|
||||
\input{Examples_atomic}
|
||||
\input{Examples_atomic_restrict}
|
||||
\input{Examples_flush_nolist}
|
||||
\input{Examples_acquire_release}
|
||||
\input{Examples_ordered}
|
||||
\input{Examples_depobj}
|
||||
% Doacross loop 405
|
||||
\input{Examples_doacross}
|
||||
\input{Examples_locks}
|
||||
@ -165,7 +181,7 @@
|
||||
\input{Examples_lastprivate}
|
||||
\input{Examples_reduction}
|
||||
% User UDR 287
|
||||
% C array reduction 377
|
||||
\input{Examples_udr}
|
||||
\input{Examples_copyin}
|
||||
\input{Examples_copyprivate}
|
||||
\input{Examples_cpp_reference}
|
||||
@ -174,6 +190,7 @@
|
||||
|
||||
\input{Chap_memory_model}
|
||||
\input{Examples_mem_model}
|
||||
\input{Examples_allocators}
|
||||
\input{Examples_fort_race}
|
||||
|
||||
\input{Chap_program_control}
|
||||
@ -182,9 +199,13 @@
|
||||
% If multi-ifs 471
|
||||
\input{Examples_standalone}
|
||||
\input{Examples_cancellation}
|
||||
\input{Examples_requires}
|
||||
\input{Examples_variant}
|
||||
\input{Examples_metadirective}
|
||||
% New Section Nested Regions
|
||||
\input{Examples_nested_loop}
|
||||
\input{Examples_nesting_restrict}
|
||||
\input{Examples_nested_loop}
|
||||
\input{Examples_nesting_restrict}
|
||||
\input{Examples_target_offload}
|
||||
|
||||
|
||||
\setcounter{chapter}{0} % restart chapter numbering with "letter A"
|
||||
|
32
sources/Example_acquire_release.1.c
Normal file
@ -0,0 +1,32 @@
|
||||
/*
|
||||
* @@name: acquire_release.1.c
|
||||
* @@type: C
|
||||
* @@compilable: yes, omp_5.0
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int x = 0, y = 0;
|
||||
#pragma omp parallel num_threads(2)
|
||||
{
|
||||
int thrd = omp_get_thread_num();
|
||||
if (thrd == 0) {
|
||||
x = 10;
|
||||
#pragma omp critical
|
||||
{ y = 1; }
|
||||
} else {
|
||||
int tmp = 0;
|
||||
while (tmp == 0) {
|
||||
#pragma omp critical
|
||||
{ tmp = y; }
|
||||
}
|
||||
printf("x = %d\n", x); // always "x = 10"
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
29
sources/Example_acquire_release.1.f90
Normal file
@ -0,0 +1,29 @@
|
||||
! @@name: acquire_release.1.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes, omp_5.0
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
|
||||
program rel_acq_ex1
|
||||
use omp_lib
|
||||
integer :: x, y, thrd, tmp
|
||||
x = 0
|
||||
y = 0
|
||||
!$omp parallel num_threads(2) private(thrd, tmp)
|
||||
thrd = omp_get_thread_num()
|
||||
if (thrd == 0) then
|
||||
x = 10
|
||||
!$omp critical
|
||||
y = 1
|
||||
!$omp end critical
|
||||
else
|
||||
tmp = 0
|
||||
do while (tmp == 0)
|
||||
!$omp critical
|
||||
tmp = y
|
||||
!$omp end critical
|
||||
end do
|
||||
print *, "x = ", x !! always "x = 10"
|
||||
end if
|
||||
!$omp end parallel
|
||||
end program
|
32
sources/Example_acquire_release.2.c
Normal file
@ -0,0 +1,32 @@
|
||||
/*
|
||||
* @@name: acquire_release.2.c
|
||||
* @@type: C
|
||||
* @@compilable: yes, omp_5.0
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int x = 0, y = 0;
|
||||
#pragma omp parallel num_threads(2)
|
||||
{
|
||||
int thrd = omp_get_thread_num();
|
||||
if (thrd == 0) {
|
||||
x = 10;
|
||||
#pragma omp atomic write release // or seq_cst
|
||||
y = 1;
|
||||
} else {
|
||||
int tmp = 0;
|
||||
while (tmp == 0) {
|
||||
#pragma omp atomic read acquire // or seq_cst
|
||||
tmp = y;
|
||||
}
|
||||
printf("x = %d\n", x); // always "x = 10"
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
29
sources/Example_acquire_release.2.f90
Normal file
@ -0,0 +1,29 @@
|
||||
! @@name: acquire_release.2.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes, omp_5.0
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
|
||||
program rel_acq_ex2
|
||||
use omp_lib
|
||||
integer :: x, y, thrd, tmp
|
||||
x = 0
|
||||
y = 0
|
||||
!$omp parallel num_threads(2) private(thrd, tmp)
|
||||
thrd = omp_get_thread_num()
|
||||
if (thrd == 0) then
|
||||
x = 10
|
||||
!$omp atomic write release ! or seq_cst
|
||||
y = 1
|
||||
!$omp end atomic
|
||||
else
|
||||
tmp = 0
|
||||
do while (tmp == 0)
|
||||
!$omp atomic read acquire ! or seq_cst
|
||||
tmp = y
|
||||
!$omp end atomic
|
||||
end do
|
||||
print *, "x = ", x !! always "x = 10"
|
||||
end if
|
||||
!$omp end parallel
|
||||
end program
|
34
sources/Example_acquire_release.3.c
Normal file
@ -0,0 +1,34 @@
|
||||
/*
|
||||
* @@name: acquire_release.3.c
|
||||
* @@type: C
|
||||
* @@compilable: yes, omp_5.0
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int x = 0, y = 0;
|
||||
#pragma omp parallel num_threads(2)
|
||||
{
|
||||
int thrd = omp_get_thread_num();
|
||||
if (thrd == 0) {
|
||||
x = 10;
|
||||
#pragma omp flush // or with acq_rel or release clause
|
||||
#pragma omp atomic write // or with relaxed clause
|
||||
y = 1;
|
||||
} else {
|
||||
int tmp = 0;
|
||||
while (tmp == 0) {
|
||||
#pragma omp atomic read // or with relaxed clause
|
||||
tmp = y;
|
||||
}
|
||||
#pragma omp flush // or with acq_rel or acquire clause
|
||||
printf("x = %d\n", x); // always "x = 10"
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
31
sources/Example_acquire_release.3.f90
Normal file
@ -0,0 +1,31 @@
|
||||
! @@name: acquire_release.3.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes, omp_5.0
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
|
||||
program rel_acq_ex3
|
||||
use omp_lib
|
||||
integer :: x, y, thrd, tmp
|
||||
x = 0
|
||||
y = 0
|
||||
!$omp parallel num_threads(2) private(thrd, tmp)
|
||||
thrd = omp_get_thread_num()
|
||||
if (thrd == 0) then
|
||||
x = 10
|
||||
!$omp flush ! or with acq_rel or release clause
|
||||
!$omp atomic write
|
||||
y = 1
|
||||
!$omp end atomic
|
||||
else
|
||||
tmp = 0
|
||||
do while (tmp == 0)
|
||||
!$omp atomic read
|
||||
tmp = y
|
||||
!$omp end atomic
|
||||
end do
|
||||
!$omp flush ! or with acq_rel or acquire clause
|
||||
print *, "x = ", x !! always "x = 10"
|
||||
end if
|
||||
!$omp end parallel
|
||||
end program
|
41
sources/Example_acquire_release_broke.4.c
Normal file
@ -0,0 +1,41 @@
|
||||
/*
|
||||
* @@name: acquire_release.4.c
|
||||
* @@type: C
|
||||
* @@compilable: yes, omp_5.0
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
// !!! THIS CODE WILL FAIL TO PRODUCE CONSISTENT RESULTS !!!!!!!
|
||||
// !!! DO NOT PROGRAM SYNCHRONIZATION THIS WAY !!!!!!!
|
||||
|
||||
int x = 0, y;
|
||||
#pragma omp parallel num_threads(2)
|
||||
{
|
||||
int thrd = omp_get_thread_num();
|
||||
if (thrd == 0) {
|
||||
#pragma omp critical
|
||||
{ x = 10; }
|
||||
// an explicit flush directive that provides
|
||||
// release semantics is needed here
|
||||
// to complete the synchronization.
|
||||
#pragma omp atomic write
|
||||
y = 1;
|
||||
} else {
|
||||
int tmp = 0;
|
||||
while (tmp == 0) {
|
||||
#pragma omp atomic read acquire // or seq_cst
|
||||
tmp = y;
|
||||
}
|
||||
#pragma omp critical
|
||||
{ printf("x = %d\n", x); } // !! NOT ALWAYS 10
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
40
sources/Example_acquire_release_broke.4.f90
Normal file
@ -0,0 +1,40 @@
|
||||
! @@name: acquire_release.4.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes, omp_5.0
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
|
||||
program rel_acq_ex4
|
||||
use omp_lib
|
||||
integer :: x, y, thrd
|
||||
integer :: tmp
|
||||
x = 0
|
||||
|
||||
!! !!! THIS CODE WILL FAIL TO PRODUCE CONSISTENT RESULTS !!!!!!!
|
||||
!! !!! DO NOT PROGRAM SYNCHRONIZATION THIS WAY !!!!!!!
|
||||
|
||||
!$omp parallel num_threads(2) private(thrd) private(tmp)
|
||||
thrd = omp_get_thread_num()
|
||||
if (thrd == 0) then
|
||||
!$omp critical
|
||||
x = 10
|
||||
!$omp end critical
|
||||
! an explicit flush directive that provides
|
||||
! release semantics is needed here to
|
||||
! complete the synchronization.
|
||||
!$omp atomic write
|
||||
y = 1
|
||||
!$omp end atomic
|
||||
else
|
||||
tmp = 0
|
||||
do while(tmp == 0)
|
||||
!$omp atomic read acquire ! or seq_cst
|
||||
tmp = y
|
||||
!$omp end atomic
|
||||
end do
|
||||
!$omp critical
|
||||
print *, "x = ", x !! !! NOT ALWAYS 10
|
||||
!$omp end critical
|
||||
end if
|
||||
!$omp end parallel
|
||||
end program
|
@ -5,12 +5,17 @@
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
|
||||
void work();
|
||||
int main()
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
#pragma omp parallel proc_bind(spread) num_threads(4)
|
||||
{
|
||||
work();
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
@ -1,38 +1,27 @@
|
||||
/*
|
||||
* @@name: affinity.6c
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@expect: success
|
||||
* @@name: affinity.1.c
|
||||
* @@type: C
|
||||
* @@compilable: yes, omp_5.0
|
||||
* @@linkable: no
|
||||
* @@expect: success
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
double * alloc_init_B(double *A, int N);
|
||||
void compute_on_B(double *B, int N);
|
||||
|
||||
void socket_init(int socket_num)
|
||||
void task_affinity(double *A, int N)
|
||||
{
|
||||
int n_procs;
|
||||
|
||||
n_procs = omp_get_place_num_procs(socket_num);
|
||||
#pragma omp parallel num_threads(n_procs) proc_bind(close)
|
||||
double * B;
|
||||
#pragma omp task depend(out:B) shared(B) affinity(A[0:N])
|
||||
{
|
||||
printf("Reporting in from socket num, thread num: %d %d\n",
|
||||
socket_num,omp_get_thread_num() );
|
||||
B = alloc_init_B(A,N);
|
||||
}
|
||||
|
||||
#pragma omp task depend( in:B) shared(B) affinity(A[0:N])
|
||||
{
|
||||
compute_on_B(B,N);
|
||||
}
|
||||
|
||||
#pragma omp taskwait
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int n_sockets, socket_num;
|
||||
|
||||
omp_set_nested(1); // or export OMP_NESTED=true
|
||||
omp_set_max_active_levels(2); // or export OMP_MAX_ACTIVE_LEVELS=2
|
||||
|
||||
n_sockets = omp_get_num_places();
|
||||
#pragma omp parallel num_threads(n_sockets) private(socket_num) \
|
||||
proc_bind(spread)
|
||||
{
|
||||
socket_num = omp_get_place_num();
|
||||
socket_init(socket_num);
|
||||
}
|
||||
}
|
||||
|
@ -1,34 +1,24 @@
|
||||
! @@name: affinity.6f
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@expect: success
|
||||
! @@name: affinity.6f
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@expect: success
|
||||
|
||||
subroutine socket_init(socket_num)
|
||||
use omp_lib
|
||||
integer :: socket_num, n_procs
|
||||
subroutine task_affinity(A, N)
|
||||
|
||||
n_procs = omp_get_place_num_procs(socket_num)
|
||||
!$omp parallel num_threads(n_procs) proc_bind(close)
|
||||
external alloc_init_B
|
||||
external compute_on_B
|
||||
double precision, allocatable :: B(:)
|
||||
|
||||
!$omp task depend(out:B) shared(B) affinity(A)
|
||||
call alloc_init_B(B,A)
|
||||
!$omp end task
|
||||
|
||||
!$omp task depend(in:B) shared(B) affinity(A)
|
||||
call compute_on_B(B)
|
||||
!$omp end task
|
||||
|
||||
!$omp taskwait
|
||||
|
||||
print*,"Reporting in from socket num, thread num: ", &
|
||||
socket_num,omp_get_thread_num()
|
||||
!$omp end parallel
|
||||
end subroutine
|
||||
|
||||
program numa_teams
|
||||
use omp_lib
|
||||
integer :: n_sockets, socket_num
|
||||
|
||||
call omp_set_nested(.true.) ! or export OMP_NESTED=true
|
||||
call omp_set_max_active_levels(2) ! or export OMP_MAX_ACTIVE_LEVELS=2
|
||||
|
||||
n_sockets = omp_get_num_places()
|
||||
!$omp parallel num_threads(n_sockets) private(socket_num) &
|
||||
!$omp& proc_bind(spread)
|
||||
|
||||
socket_num = omp_get_place_num()
|
||||
call socket_init(socket_num)
|
||||
|
||||
!$omp end parallel
|
||||
end program
|
||||
|
62
sources/Example_affinity_display.1.c
Normal file
@ -0,0 +1,62 @@
|
||||
/*
|
||||
* @@name: affinity_display.1.c
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
int main(void){ //MAX threads = 8, single socket system
|
||||
|
||||
omp_display_affinity(NULL); //API call-- Displays Affinity of Master Thread
|
||||
|
||||
// API CALL OUTPUT (default format):
|
||||
//team_num= 0, nesting_level= 0, thread_num= 0, thread_affinity= 0,1,2,3,4,5,6,7
|
||||
|
||||
|
||||
// OMP_DISPLAY_AFFINITY=TRUE, OMP_NUM_THREADS=8
|
||||
#pragma omp parallel num_threads(omp_get_num_procs())
|
||||
{
|
||||
if(omp_get_thread_num()==0)
|
||||
printf("1st Parallel Region -- Affinity Reported \n");
|
||||
|
||||
// DISPLAY OUTPUT (default format) has been sorted:
|
||||
// team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0
|
||||
// team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 1
|
||||
// ...
|
||||
// team_num= 0, nesting_level= 1, thread_num= 7, thread_affinity= 7
|
||||
|
||||
// doing work here
|
||||
}
|
||||
|
||||
#pragma omp parallel num_threads( omp_get_num_procs() )
|
||||
{
|
||||
if(omp_get_thread_num()==0)
|
||||
printf("%s%s\n","Same Affinity as in Previous Parallel Region",
|
||||
" -- no Affinity Reported\n");
|
||||
|
||||
// NO AFFINITY OUTPUT:
|
||||
//(output in 1st parallel region only for OMP_DISPLAY_AFFINITY=TRUE)
|
||||
|
||||
// doing more work here
|
||||
}
|
||||
|
||||
// Report Affinity for 1/2 number of threads
|
||||
#pragma omp parallel num_threads( omp_get_num_procs()/2 )
|
||||
{
|
||||
if(omp_get_thread_num()==0)
|
||||
printf("Report Affinity for using 1/2 of max threads.\n");
|
||||
|
||||
// DISPLAY OUTPUT (default format) has been sorted:
|
||||
// team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0,1
|
||||
// team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 2,3
|
||||
// team_num= 0, nesting_level= 1, thread_num= 2, thread_affinity= 4,5
|
||||
// team_num= 0, nesting_level= 1, thread_num= 3, thread_affinity= 6,7
|
||||
|
||||
// do work
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
66
sources/Example_affinity_display.1.f90
Normal file
@ -0,0 +1,66 @@
|
||||
! @@name: affinity_display.1.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes, omp_5.0
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
program affinity_display ! MAX threads = 8, single socket system
|
||||
|
||||
use omp_lib
|
||||
implicit none
|
||||
character(len=0) :: null
|
||||
|
||||
call omp_display_affinity(null) !API call- Displays Affinity of Master Thread
|
||||
|
||||
! API CALL OUTPUT (default format):
|
||||
! team_num= 0, nesting_level= 0, thread_num= 0, thread_affinity= 0,1,2,3,4,5,6,7
|
||||
|
||||
|
||||
! OMP_DISPLAY_AFFINITY=TRUE, OMP_NUM_THREADS=8
|
||||
|
||||
!$omp parallel num_threads(omp_get_num_procs())
|
||||
|
||||
if(omp_get_thread_num()==0) then
|
||||
print*, "1st Parallel Region -- Affinity Reported"
|
||||
endif
|
||||
|
||||
! DISPLAY OUTPUT (default format) has been sorted:
|
||||
! team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0
|
||||
! team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 1
|
||||
! ...
|
||||
! team_num= 0, nesting_level= 1, thread_num= 7, thread_affinity= 7
|
||||
|
||||
! doing work here
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
!$omp parallel num_threads( omp_get_num_procs() )
|
||||
|
||||
if(omp_get_thread_num()==0) then
|
||||
print*, "Same Affinity in Parallel Region -- no Affinity Reported"
|
||||
endif
|
||||
|
||||
! NO AFFINITY OUTPUT:
|
||||
!(output in 1st parallel region only for OMP_DISPLAY_AFFINITY=TRUE)
|
||||
|
||||
! doing more work here
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
! Report Affinity for 1/2 number of threads
|
||||
!$omp parallel num_threads( omp_get_num_procs()/2 )
|
||||
|
||||
if(omp_get_thread_num()==0) then
|
||||
print*, "Different Affinity in Parallel Region -- Affinity Reported"
|
||||
endif
|
||||
|
||||
! DISPLAY OUTPUT (default format) has been sorted:
|
||||
! team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0,1
|
||||
! team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 2,3
|
||||
! team_num= 0, nesting_level= 1, thread_num= 2, thread_affinity= 4,5
|
||||
! team_num= 0, nesting_level= 1, thread_num= 3, thread_affinity= 6,7
|
||||
|
||||
! do work
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
end program
|
74
sources/Example_affinity_display.2.c
Normal file
@ -0,0 +1,74 @@
|
||||
/*
|
||||
* @@name: affinity_display.2c
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <omp.h>
|
||||
|
||||
void socket_work(int socket_num, int n_thrds);
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int n_sockets, socket_num, n_thrds_on_socket;
|
||||
|
||||
omp_set_nested(1); // or env var= OMP_NESTED=true
|
||||
omp_set_max_active_levels(2); // or env var= OMP_MAX_ACTIVE_LEVELS=2
|
||||
|
||||
n_sockets = omp_get_num_places();
|
||||
n_thrds_on_socket = omp_get_place_num_procs(0);
|
||||
|
||||
// OMP_NUM_THREADS=2,4
|
||||
// OMP_PLACES="{0,2,4,6},{1,3,5,7}" #2 sockets; even/odd proc-ids
|
||||
// OMP_AFFINITY_FORMAT=\
|
||||
// "nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A"
|
||||
|
||||
#pragma omp parallel num_threads(n_sockets) private(socket_num)
|
||||
{
|
||||
socket_num = omp_get_place_num();
|
||||
|
||||
if(socket_num==0)
|
||||
printf(" LEVEL 1 AFFINITIES 1 thread/socket, %d sockets:\n\n", n_sockets);
|
||||
|
||||
omp_display_affinity(NULL); // not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
|
||||
// OUTPUT:
|
||||
// LEVEL 1 AFFINITIES 1 thread/socket, 2 sockets:
|
||||
// nest_level= 1, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0,2,4,6
|
||||
// nest_level= 1, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 1,3,5,7
|
||||
|
||||
socket_work(socket_num, n_thrds_on_socket);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void socket_work(int socket_num, int n_thrds)
|
||||
{
|
||||
#pragma omp parallel num_threads(n_thrds)
|
||||
{
|
||||
if(omp_get_thread_num()==0)
|
||||
printf(" LEVEL 2 AFFINITIES, %d threads on socket %d\n",n_thrds, socket_num);
|
||||
|
||||
omp_display_affinity(NULL); // not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
|
||||
// OUTPUT:
|
||||
// LEVEL 2 AFFINITIES, 4 threads on socket 0
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 2
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 2, thrd_affinity= 4
|
||||
// nest_level= 2, parent_thrd_num= 0, thrd_num= 3, thrd_affinity= 6
|
||||
|
||||
// LEVEL 2 AFFINITIES, 4 threads on socket 1
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 0, thrd_affinity= 1
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 1, thrd_affinity= 3
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 2, thrd_affinity= 5
|
||||
// nest_level= 2, parent_thrd_num= 1, thrd_num= 3, thrd_affinity= 7
|
||||
|
||||
// ... Do Some work on Socket
|
||||
|
||||
}
|
||||
}
|
76
sources/Example_affinity_display.2.f90
Normal file
@ -0,0 +1,76 @@
|
||||
! @@name: affinity_display.2.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes, omp_5.0
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
program affinity_display
|
||||
|
||||
use omp_lib
|
||||
implicit none
|
||||
character(len=0) :: null
|
||||
integer :: n_sockets, socket_num, n_thrds_on_socket;
|
||||
|
||||
call omp_set_nested(.true.) ! or env var= OMP_NESTED=true
|
||||
call omp_set_max_active_levels(2) ! or env var= OMP_MAX_ACTIVE_LEVELS=2
|
||||
|
||||
n_sockets = omp_get_num_places()
|
||||
n_thrds_on_socket = omp_get_place_num_procs(0)
|
||||
|
||||
! OMP_NUM_THREADS=2,4
|
||||
! OMP_PLACES="{0,2,4,6},{1,3,5,7}" #2 sockets; even/odd proc-ids
|
||||
! OMP_AFFINITY_FORMAT=\
|
||||
! "nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A"
|
||||
|
||||
!$omp parallel num_threads(n_sockets) private(socket_num)
|
||||
|
||||
socket_num = omp_get_place_num()
|
||||
|
||||
if(socket_num==0) then
|
||||
write(*,'("LEVEL 1 AFFINITIES 1 thread/socket ",i0," sockets")')n_sockets
|
||||
endif
|
||||
|
||||
call omp_display_affinity(null) !not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
|
||||
! OUTPUT:
|
||||
! LEVEL 1 AFFINITIES 1 thread/socket, 2 sockets:
|
||||
! nest_level= 1, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0,2,4,6
|
||||
! nest_level= 1, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 1,3,5,7
|
||||
|
||||
call socket_work(socket_num, n_thrds_on_socket)
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
end program
|
||||
|
||||
subroutine socket_work(socket_num, n_thrds)
|
||||
implicit none
|
||||
integer :: socket_num, n_thrds
|
||||
character(len=0) :: null
|
||||
|
||||
!$omp parallel num_threads(n_thrds)
|
||||
|
||||
if(omp_get_thread_num()==0) then
|
||||
write(*,'("LEVEL 2 AFFINITIES, ",i0," threads on socket ",i0)') &
|
||||
n_thrds,socket_num
|
||||
endif
|
||||
|
||||
call omp_display_affinity(null); !not needed if OMP_DISPLAY_AFFINITY=TRUE
|
||||
|
||||
! OUTPUT:
|
||||
! LEVEL 2 AFFINITIES, 4 threads on socket 0
|
||||
! nest_level= 2, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0
|
||||
! nest_level= 2, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 2
|
||||
! nest_level= 2, parent_thrd_num= 0, thrd_num= 2, thrd_affinity= 4
|
||||
! nest_level= 2, parent_thrd_num= 0, thrd_num= 3, thrd_affinity= 6
|
||||
|
||||
! LEVEL 2 AFFINITIES, 4 thrds on socket 1
|
||||
! nest_level= 2, parent_thrd_num= 1, thrd_num= 0, thrd_affinity= 1
|
||||
! nest_level= 2, parent_thrd_num= 1, thrd_num= 1, thrd_affinity= 3
|
||||
! nest_level= 2, parent_thrd_num= 1, thrd_num= 2, thrd_affinity= 5
|
||||
! nest_level= 2, parent_thrd_num= 1, thrd_num= 3, thrd_affinity= 7
|
||||
|
||||
! ... Do Some work on Socket
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
end subroutine
|
88
sources/Example_affinity_display.3.c
Normal file
@ -0,0 +1,88 @@
|
||||
/*
|
||||
* @@name: affinity_display.3.c
|
||||
* @@type: C
|
||||
* @@compilable: yes, omp_5.0
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h> // also null is in <stddef.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <omp.h>
|
||||
|
||||
#define FORMAT_STORE 80
|
||||
#define BUFFER_STORE 80
|
||||
|
||||
int main(void){
|
||||
|
||||
int i, n, thrd_num, max_req_store;
|
||||
size_t nchars;
|
||||
|
||||
char default_format[FORMAT_STORE];
|
||||
char my_format[] = "host=%20H thrd_num=%0.4n binds_to=%A";
|
||||
char **buffer;
|
||||
|
||||
|
||||
// CODE SEGMENT 1 AFFINITY FORMAT
|
||||
|
||||
// Get and Display Default Affinity Format
|
||||
|
||||
nchars = omp_get_affinity_format(default_format,(size_t)FORMAT_STORE);
|
||||
printf("Default Affinity Format is: %s\n",default_format);
|
||||
|
||||
if(nchars >= FORMAT_STORE){
|
||||
printf("Caution: Reported Format is truncated. Increase\n");
|
||||
printf(" FORMAT_STORE to %d.\n", nchars+1);
|
||||
}
|
||||
|
||||
// Set Affinity Format
|
||||
|
||||
omp_set_affinity_format(my_format);
|
||||
printf("Affinity Format set to: %s\n",my_format);
|
||||
|
||||
|
||||
// CODE SEGMENT 2 CAPTURE AFFINITY
|
||||
|
||||
// Set up buffer for affinity of n threads
|
||||
|
||||
n = omp_get_num_procs();
|
||||
buffer = (char **)malloc( sizeof(char *) * n );
|
||||
for(i=0;i<n;i++){ buffer[i]=(char *)malloc( sizeof(char) * BUFFER_STORE); }
|
||||
|
||||
// Capture Affinity using Affinity Format set above.
|
||||
// Use max reduction to check size of buffer areas
|
||||
max_req_store = 0;
|
||||
#pragma omp parallel private(thrd_num,nchars) reduction(max:max_req_store)
|
||||
{
|
||||
if(omp_get_num_threads()>n) exit(1); //safety: don't exceed # of buffers
|
||||
|
||||
thrd_num=omp_get_thread_num();
|
||||
nchars=omp_capture_affinity(buffer[thrd_num],(size_t)BUFFER_STORE,NULL);
|
||||
if(nchars > max_req_store) max_req_store=nchars;
|
||||
|
||||
// ...
|
||||
}
|
||||
|
||||
for(i=0;i<n;i++){
|
||||
printf("thrd_num= %d, affinity: %s\n", i,buffer[i]);
|
||||
}
|
||||
// For 4 threads with OMP_PLACES='{0,1},{2,3},{4,5},{6,7}'
|
||||
// Format host=%20H thrd_num=%0.4n binds_to=%A
|
||||
|
||||
// affinity: host=hpc.cn567 thrd_num=0000 binds_to=0,1
|
||||
// affinity: host=hpc.cn567 thrd_num=0001 binds_to=2,3
|
||||
// affinity: host=hpc.cn567 thrd_num=0002 binds_to=4,5
|
||||
// affinity: host=hpc.cn567 thrd_num=0003 binds_to=6,7
|
||||
|
||||
|
||||
if(max_req_store>=BUFFER_STORE){
|
||||
printf("Caution: Affinity string truncated. Increase\n");
|
||||
printf(" BUFFER_STORE to %d\n",max_req_store+1);
|
||||
}
|
||||
|
||||
for(i=0;i<n;i++) free(buffer[i]);
|
||||
free (buffer);
|
||||
|
||||
return 0;
|
||||
}
|
77
sources/Example_affinity_display.3.f90
Normal file
@ -0,0 +1,77 @@
|
||||
! @@name: affinity_display.3.f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes, omp_5.0
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
program affinity_display
|
||||
use omp_lib
|
||||
implicit none
|
||||
integer, parameter :: FORMAT_STORE=80
|
||||
integer, parameter :: BUFFER_STORE=80
|
||||
|
||||
integer :: i, n, thrd_num, nchars, max_req_store
|
||||
|
||||
character(FORMAT_STORE) :: default_format
|
||||
character(*), parameter :: my_format = &
|
||||
"host=%20H thrd_num=%0.4n binds_to=%A"
|
||||
character(:), allocatable :: buffer(:)
|
||||
character(len=0) :: null
|
||||
|
||||
|
||||
! CODE SEGMENT 1 AFFINITY FORMAT
|
||||
|
||||
! Get and Display Default Affinity Format
|
||||
|
||||
nchars = omp_get_affinity_format(default_format)
|
||||
print*,"Default Affinity Format: ", trim(default_format)
|
||||
|
||||
if( nchars > FORMAT_STORE) then
|
||||
print*,"Caution: Reported Format is truncated. Increase"
|
||||
print*," FORMAT_STORE to ", nchars
|
||||
endif
|
||||
|
||||
! Set Affinity Format
|
||||
|
||||
call omp_set_affinity_format(my_format)
|
||||
print*,"Affinity Format set to: ", my_format
|
||||
|
||||
|
||||
! CODE SEGMENT 2 CAPTURE AFFINITY
|
||||
|
||||
! Set up buffer for affinity of n threads
|
||||
|
||||
n = omp_get_num_procs()
|
||||
allocate( character(len=BUFFER_STORE)::buffer(0:n-1) )
|
||||
|
||||
! Capture Affinity using Affinity Format set above.
|
||||
! Use max reduction to check size of buffer areas
|
||||
max_req_store = 0
|
||||
!$omp parallel private(thrd_num,nchars) reduction(max:max_req_store)
|
||||
|
||||
if(omp_get_num_threads()>n) stop "ERROR: increase buffer lines"
|
||||
|
||||
thrd_num=omp_get_thread_num()
|
||||
nchars=omp_capture_affinity(buffer(thrd_num),null)
|
||||
if(nchars>max_req_store) max_req_store=nchars
|
||||
! ...
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
do i = 0, n-1
|
||||
print*, "thrd_num= ",i," affinity:", trim(buffer(i))
|
||||
end do
|
||||
! For 4 threads with OMP_PLACES='{0,1},{2,3},{4,5},{6,7}'
|
||||
! Format: host=%20H thrd_num=%0.4n binds_to=%A
|
||||
|
||||
! affinity: host=hpc.cn567 thrd_num=0000 binds_to=0,1
|
||||
! affinity: host=hpc.cn567 thrd_num=0001 binds_to=2,3
|
||||
! affinity: host=hpc.cn567 thrd_num=0002 binds_to=4,5
|
||||
! affinity: host=hpc.cn567 thrd_num=0003 binds_to=6,7
|
||||
|
||||
if(max_req_store > BUFFER_STORE) then
|
||||
print*, "Caution: Affinity string truncated. Increase"
|
||||
print*, " BUFFER_STORE to ",max_req_store
|
||||
endif
|
||||
|
||||
deallocate(buffer)
|
||||
end program
|
39
sources/Example_affinity_query.1.c
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* @@name: affinity_query.1c
|
||||
* @@type: C
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@expect: success
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
void socket_init(int socket_num)
|
||||
{
|
||||
int n_procs;
|
||||
|
||||
n_procs = omp_get_place_num_procs(socket_num);
|
||||
#pragma omp parallel num_threads(n_procs) proc_bind(close)
|
||||
{
|
||||
printf("Reporting in from socket num, thread num: %d %d\n",
|
||||
socket_num,omp_get_thread_num() );
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int n_sockets, socket_num;
|
||||
|
||||
omp_set_nested(1); // or export OMP_NESTED=true
|
||||
omp_set_max_active_levels(2); // or export OMP_MAX_ACTIVE_LEVELS=2
|
||||
|
||||
n_sockets = omp_get_num_places();
|
||||
#pragma omp parallel num_threads(n_sockets) private(socket_num) \
|
||||
proc_bind(spread)
|
||||
{
|
||||
socket_num = omp_get_place_num();
|
||||
socket_init(socket_num);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
33
sources/Example_affinity_query.1.f90
Normal file
@ -0,0 +1,33 @@
|
||||
! @@name: affinity_query.1f
|
||||
! @@type: F-free
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@expect: success
|
||||
subroutine socket_init(socket_num)
|
||||
use omp_lib
|
||||
integer :: socket_num, n_procs
|
||||
|
||||
n_procs = omp_get_place_num_procs(socket_num)
|
||||
!$omp parallel num_threads(n_procs) proc_bind(close)
|
||||
|
||||
print*,"Reporting in from socket num, thread num: ", &
|
||||
socket_num,omp_get_thread_num()
|
||||
!$omp end parallel
|
||||
end subroutine
|
||||
|
||||
program numa_teams
|
||||
use omp_lib
|
||||
integer :: n_sockets, socket_num
|
||||
|
||||
call omp_set_nested(.true.) ! or export OMP_NESTED=true
|
||||
call omp_set_max_active_levels(2) ! or export OMP_MAX_ACTIVE_LEVELS=2
|
||||
|
||||
n_sockets = omp_get_num_places()
|
||||
!$omp parallel num_threads(n_sockets) private(socket_num) &
|
||||
!$omp& proc_bind(spread)
|
||||
|
||||
socket_num = omp_get_place_num()
|
||||
call socket_init(socket_num)
|
||||
|
||||
!$omp end parallel
|
||||
end program
|
47
sources/Example_allocators.1.c
Normal file
@ -0,0 +1,47 @@
|
||||
/*
|
||||
* @@name: allocators.1c
|
||||
* @@type: C
|
||||
* @@compilable: yes, omp_5.0
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#define N 1000
|
||||
|
||||
int main()
|
||||
{
|
||||
float *x, *y;
|
||||
float s=2.0;
|
||||
|
||||
omp_memspace_handle_t xy_memspace = omp_default_mem_space;
|
||||
omp_alloctrait_t xy_traits[1]={omp_atk_alignment, 64};
|
||||
omp_allocator_handle_t xy_alloc = omp_init_allocator(xy_memspace,1,xy_traits);
|
||||
|
||||
|
||||
x=(float *)omp_alloc(N*sizeof(float), xy_alloc);
|
||||
y=(float *)omp_alloc(N*sizeof(float), xy_alloc);
|
||||
|
||||
if( ((intptr_t)(y))%64 != 0 || ((intptr_t)(x))%64 != 0 )
|
||||
{ printf("ERROR: x|y not 64-Byte aligned\n"); exit(1); }
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp for simd simdlen(16) aligned(x,y:64)
|
||||
for(int i=0; i<N; i++){ x[i]=i+1; y[i]=i+1; } // initialize
|
||||
|
||||
#pragma omp for simd simdlen(16) aligned(x,y:64)
|
||||
for(int i=0; i<N; i++) y[i] = s*x[i] + y[i];
|
||||
}
|
||||
|
||||
|
||||
printf("y[0],y[N-1]: %5.0f %5.0f\n",y[0],y[N-1]); //output: y... 3 3000
|
||||
|
||||
omp_free(x, xy_alloc);
|
||||
omp_free(y, xy_alloc);
|
||||
omp_destroy_allocator(xy_alloc);
|
||||
|
||||
return 0;
|
||||
}
|
51
sources/Example_allocators.1.f90
Normal file
@ -0,0 +1,51 @@
|
||||
! @@name: allocators.1f90
|
||||
! @@type: F-free
|
||||
! @@compilable: yes, omp_5.0
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
|
||||
program main
|
||||
use omp_lib
|
||||
|
||||
integer, parameter :: N=1000, align=64
|
||||
real, allocatable :: x(:),y(:)
|
||||
real :: s = 2.0e0
|
||||
integer :: i
|
||||
|
||||
integer(omp_memspace_handle_kind ) :: xy_memspace = omp_default_mem_space
|
||||
type( omp_alloctrait ) :: xy_traits(1) = &
|
||||
[omp_alloctrait(omp_atk_alignment,64)]
|
||||
integer(omp_allocator_handle_kind) :: xy_alloc
|
||||
|
||||
xy_alloc = omp_init_allocator( xy_memspace, 1, xy_traits)
|
||||
|
||||
!$omp allocate(x,y) allocator(xy_alloc)
|
||||
allocate(x(N),y(N))
|
||||
!! loc is non-standard, but found everywhere
|
||||
!! remove these lines if not available
|
||||
if(modulo(loc(x),align) /= 0 .or. modulo(loc(y),align) /= 0 ) then
|
||||
print*,"ERROR: x|y not 64-byte aligned"; stop
|
||||
endif
|
||||
|
||||
!$omp parallel
|
||||
|
||||
!$omp do simd simdlen(16) aligned(x,y: 64) !! 64B aligned
|
||||
do i=1,N !! initialize
|
||||
x(i)=i
|
||||
y(i)=i
|
||||
end do
|
||||
|
||||
!$omp do simd simdlen(16) aligned(x,y: 64) !! 64B aligned
|
||||
do i = 1,N
|
||||
y(i) = s*x(i) + y(i)
|
||||
end do
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
write(*,'("y(1),y(N):",2f6.0)') y(1),y(N) !!output: y... 3. 3000.
|
||||
|
||||
deallocate(x,y)
|
||||
call omp_destroy_allocator(xy_alloc)
|
||||
|
||||
end program
|
||||
|
41
sources/Example_array_shaping.1.c
Normal file
@ -0,0 +1,41 @@
|
||||
/*
|
||||
* @@name: array_shaping.1.c
|
||||
* @@type: C, omp_5.0
|
||||
* @@compilable: yes
|
||||
* @@linkable: no
|
||||
* @@expect: success
|
||||
*/
|
||||
#pragma omp declare target
|
||||
int do_work(double *a, int nx, int ny);
|
||||
int other_work(double *a, int nx, int ny);
|
||||
#pragma omp end declare target
|
||||
|
||||
void exch_data(double *a, int nx, int ny);
|
||||
|
||||
void array_shaping(double *a, int nx, int ny)
|
||||
{
|
||||
// map data to device and do work
|
||||
#pragma omp target data map(a[0:nx*(ny+2)])
|
||||
{
|
||||
// do work on the device
|
||||
#pragma omp target // map(a[0:nx*(ny+2)]) is optional here
|
||||
do_work(a, nx, ny);
|
||||
|
||||
// update boundary points (two columns of 2D array) on the host
|
||||
// pointer is shaped to 2D array using the shape-operator
|
||||
#pragma omp target update from( (([nx][ny+2])a)[0:nx][1], \
|
||||
(([nx][ny+2])a)[0:nx][ny] )
|
||||
|
||||
// exchange ghost points with neighbors
|
||||
exch_data(a, nx, ny);
|
||||
|
||||
// update ghost points (two columns of 2D array) on the device
|
||||
// pointer is shaped to 2D array using the shape-operator
|
||||
#pragma omp target update to( (([nx][ny+2])a)[0:nx][0], \
|
||||
(([nx][ny+2])a)[0:nx][ny+1] )
|
||||
|
||||
// perform other work on the device
|
||||
#pragma omp target // map(a[0:nx*(ny+2)]) is optional here
|
||||
other_work(a, nx, ny);
|
||||
}
|
||||
}
|
@ -12,17 +12,14 @@ float F(float);
|
||||
#define CHUNKSZ 1000000
|
||||
void init(float *, int);
|
||||
float Z[N];
|
||||
void pipedF()
|
||||
{
|
||||
void pipedF(){
|
||||
int C, i;
|
||||
init(Z, N);
|
||||
for (C=0; C<N; C+=CHUNKSZ)
|
||||
{
|
||||
for (C=0; C<N; C+=CHUNKSZ){
|
||||
#pragma omp task shared(Z)
|
||||
#pragma omp target map(Z[C:CHUNKSZ])
|
||||
#pragma omp parallel for
|
||||
for (i=0; i<CHUNKSZ; i++)
|
||||
Z[i] = F(Z[i]);
|
||||
for (i=0; i<CHUNKSZ; i++) Z[i] = F(Z[i]);
|
||||
}
|
||||
#pragma omp taskwait
|
||||
}
|
||||
|
@ -22,8 +22,8 @@ void vec_mult(float *p, int N, int dev)
|
||||
// check whether on device dev
|
||||
if (omp_is_initial_device())
|
||||
abort();
|
||||
v1 = malloc(N*sizeof(float));
|
||||
v2 = malloc(N*sizeof(float));
|
||||
v1 = (float *)malloc(N*sizeof(float));
|
||||
v2 = (float *)malloc(N*sizeof(float));
|
||||
init(v1, v2, N);
|
||||
}
|
||||
foo(); // execute other work asychronously
|
||||
|
@ -5,11 +5,15 @@
|
||||
* @@linkable: no
|
||||
* @@expect: success
|
||||
*/
|
||||
|
||||
void bar(float *a, int i, int j, int k);
|
||||
|
||||
int kl, ku, ks, jl, ju, js, il, iu,is;
|
||||
|
||||
void sub(float *a)
|
||||
{
|
||||
int i, j, k;
|
||||
|
||||
#pragma omp for collapse(2) private(i, k, j)
|
||||
for (k=kl; k<=ku; k+=ks)
|
||||
for (j=jl; j<=ju; j+=js)
|
||||
|
@ -3,11 +3,14 @@
|
||||
! @@compilable: yes
|
||||
! @@linkable: no
|
||||
! @@expect: success
|
||||
|
||||
subroutine sub(a)
|
||||
|
||||
real a(*)
|
||||
integer kl, ku, ks, jl, ju, js, il, iu, is
|
||||
common /csub/ kl, ku, ks, jl, ju, js, il, iu, is
|
||||
integer i, j, k
|
||||
|
||||
!$omp do collapse(2) private(i,j,k)
|
||||
do k = kl, ku, ks
|
||||
do j = jl, ju, js
|
||||
@ -17,4 +20,5 @@
|
||||
enddo
|
||||
enddo
|
||||
!$omp end do
|
||||
|
||||
end subroutine
|
||||
|
@ -5,6 +5,7 @@
|
||||
* @@linkable: no
|
||||
* @@expect: success
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
void test()
|
||||
{
|
||||
|
@ -3,6 +3,7 @@
|
||||
! @@compilable: yes
|
||||
! @@linkable: yes
|
||||
! @@expect: success
|
||||
|
||||
program test
|
||||
!$omp parallel
|
||||
!$omp do private(j,k) collapse(2) lastprivate(jlast, klast)
|
||||
|
@ -7,10 +7,10 @@
|
||||
*/
|
||||
#define N 100000000
|
||||
|
||||
#pragma omp declare target link(sp,sv1,sv2) \
|
||||
link(dp,dv1,dv2)
|
||||
float sp[N], sv1[N], sv2[N];
|
||||
double dp[N], dv1[N], dv2[N];
|
||||
#pragma omp declare target link(sp,sv1,sv2) \
|
||||
link(dp,dv1,dv2)
|
||||
|
||||
void s_init(float *, float *, int);
|
||||
void d_init(double *, double *, int);
|
||||
|
57
sources/Example_declare_variant.1.c
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* @@name: declare_variant.1c
|
||||
* @@type: C
|
||||
* @@compilable: yes, omp_5.0
|
||||
* @@linkable: yes
|
||||
* @@expect: success
|
||||
*/
|
||||
|
||||
#define N 100
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
void p_vxv(int *v1,int *v2,int *v3,int n);
|
||||
void t_vxv(int *v1,int *v2,int *v3,int n);
|
||||
|
||||
#pragma omp declare variant( p_vxv ) match( construct={parallel} )
|
||||
#pragma omp declare variant( t_vxv ) match( construct={target} )
|
||||
void vxv(int *v1,int *v2,int *v3,int n) // base function
|
||||
{
|
||||
for (int i= 0; i< n; i++) v3[i] = v1[i] * v2[i];
|
||||
}
|
||||
|
||||
void p_vxv(int *v1,int *v2,int *v3,int n) // function variant
|
||||
{
|
||||
#pragma omp for
|
||||
for (int i= 0; i< n; i++) v3[i] = v1[i] * v2[i]*3;
|
||||
}
|
||||
|
||||
#pragma omp declare target
|
||||
void t_vxv(int *v1,int *v2,int *v3,int n) // function variant
|
||||
{
|
||||
#pragma omp distribute simd
|
||||
for (int i= 0; i< n; i++) v3[i] = v1[i] * v2[i]*2;
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int v1[N], v2[N], v3[N];
|
||||
for(int i=0; i<N; i++){ v1[i]=(i+1); v2[i]=-(i+1); v3[i]=0; } //init
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
vxv(v1,v2,v3,N);
|
||||
}
|
||||
printf(" %d %d\n",v3[0],v3[N-1]); //from p_vxv -- output: -3 -30000
|
||||
|
||||
#pragma omp target teams map(to: v1[:N],v2[:N]) map(from: v3[:N])
|
||||
{
|
||||
vxv(v1,v2,v3,N);
|
||||
}
|
||||
printf(" %d %d\n",v3[0],v3[N-1]); //from t_vxv -- output: -2 -20000
|
||||
|
||||
vxv(v1,v2,v3,N);
|
||||
printf(" %d %d\n",v3[0],v3[N-1]); //from vxv -- output: -1 -10000
|
||||
|
||||
return 0;
|
||||
}
|
sources/Example_declare_variant.1.f90 (new file)
@@ -0,0 +1,69 @@
! @@name: declare_variant.1f90
! @@type: F-free
! @@compilable: yes, omp_5.0
! @@linkable: yes
! @@expect: success

module subs
   use omp_lib
contains
   subroutine vxv(v1, v2, v3)    !! base function
      integer,intent(in)  :: v1(:),v2(:)
      integer,intent(out) :: v3(:)
      integer             :: i,n
      !$omp declare variant( p_vxv ) match( construct={parallel} )
      !$omp declare variant( t_vxv ) match( construct={target} )

      n=size(v1)
      do i = 1,n; v3(i) = v1(i) * v2(i); enddo

   end subroutine

   subroutine p_vxv(v1, v2, v3)  !! function variant
      integer,intent(in)  :: v1(:),v2(:)
      integer,intent(out) :: v3(:)
      integer             :: i,n
      n=size(v1)

      !$omp do
      do i = 1,n; v3(i) = v1(i) * v2(i) * 3; enddo

   end subroutine

   subroutine t_vxv(v1, v2, v3)  !! function variant
      integer,intent(in)  :: v1(:),v2(:)
      integer,intent(out) :: v3(:)
      integer             :: i,n
      !$omp declare target
      n=size(v1)

      !$omp distribute simd
      do i = 1,n; v3(i) = v1(i) * v2(i) * 2; enddo

   end subroutine

end module subs


program main
   use omp_lib
   use subs
   integer,parameter :: N = 100
   integer :: v1(N), v2(N), v3(N)

   do i= 1,N; v1(i)= i; v2(i)= -i; v3(i)= 0; enddo   !! init

   !$omp parallel
      call vxv(v1,v2,v3)
   !$omp end parallel
   print *, v3(1),v3(N)   !! from p_vxv -- output: -3 -30000

   !$omp target teams map(to: v1,v2) map(from: v3)
      call vxv(v1,v2,v3)
   !$omp end target teams
   print *, v3(1),v3(N)   !! from t_vxv -- output: -2 -20000

   call vxv(v1,v2,v3)
   print *, v3(1),v3(N)   !! from vxv -- output: -1 -10000

end program
sources/Example_declare_variant.2.c (new file)
@@ -0,0 +1,50 @@
/*
* @@name: declare_variant.2c
* @@type: C
* @@compilable: yes, omp_5.0
* @@linkable: no
* @@expect: success
*/
#include <omp.h>

void base_saxpy(int, float, float *, float *);
void avx512_saxpy(int, float, float *, float *);

#pragma omp declare variant( avx512_saxpy ) \
                    match( device={isa("core-avx512")} )
void base_saxpy(int n, float s, float *x, float *y)   // base function
{
   #pragma omp parallel for
   for(int i=0; i<n; i++) y[i] = s*x[i] + y[i];
}

void avx512_saxpy(int n, float s, float *x, float *y) //function variant
{
   //assume 64-byte alignment for AVX-512
   #pragma omp parallel for simd simdlen(16) aligned(x,y:64)
   for(int i=0; i<n; i++) y[i] = s*x[i] + y[i];
}

// Above may be in another file scope.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define N 1000

int main()
{
   static float x[N],y[N] __attribute__ ((aligned(64)));
   float s=2.0;
   // Check for 64-byte aligned
   if( ((intptr_t)y)%64 != 0 || ((intptr_t)x)%64 != 0 )
   { printf("ERROR: x|y not 64-Byte aligned\n"); exit(1); }

   for(int i=0; i<N; i++){ x[i]=i+1; y[i]=i+1; }  // initialize

   base_saxpy(N,s,x,y);

   printf("y[0],y[N-1]: %5.0f %5.0f\n",y[0],y[N-1]); //output: y... 3 3000

   return 0;
}
sources/Example_declare_variant.2.f90 (new file)
@@ -0,0 +1,65 @@
! @@name: declare_variant.2f90
! @@type: F-free
! @@compilable: yes, omp_5.0
! @@linkable: yes
! @@expect: success

module subs
   use omp_lib
contains

   subroutine base_saxpy(s,x,y)    !! base function
      real,intent(inout) :: s,x(:),y(:)
      !$omp declare variant( avx512_saxpy ) &
      !$omp&            match( device={isa("core-avx512")} )

      y = s*x + y

   end subroutine

   subroutine avx512_saxpy(s,x,y)  !! function variant
      real,intent(inout) :: s,x(:),y(:)
      integer            :: i,n
      n=size(x)
      !!assume 64-byte alignment for AVX-512
      !$omp parallel do simd simdlen(16) aligned(x,y: 64)
      do i = 1,n
         y(i) = s*x(i) + y(i)
      end do

   end subroutine

end module subs


program main
   use omp_lib
   use subs

   integer, parameter :: N=1000, align=64
   real, allocatable  :: x(:),y(:)
   real               :: s = 2.0e0
   integer            :: i

   allocate(x(N),y(N))   !! Assumes allocation is 64-byte aligned
                         !! (using compiler options, or another
                         !! allocation method).

   !! loc is non-standard, but found everywhere
   !! remove these lines if not available
   if(modulo(loc(x),align) /= 0 .or. modulo(loc(y),align) /= 0 ) then
      print*,"ERROR: x|y not 64-byte aligned"; stop
   endif

   do i=1,N  !! initialize
      x(i)=i
      y(i)=i
   end do

   call base_saxpy(s,x,y)

   write(*,'("y(1),y(N):",2f6.0)') y(1),y(N)  !!output: y... 3. 3000.

   deallocate(x,y)

end program
sources/Example_depobj.1.c (new file)
@@ -0,0 +1,76 @@
/*
* @@name: depobj.1c
* @@type: C
* @@compilable: yes, omp_5.0
* @@linkable: yes
* @@expect: success
*/

#include <stdio.h>
#include <omp.h>

#define N 100
#define TRUE  1
#define FALSE 0

void driver(int update, float a[], float b[], int n, omp_depend_t *obj);

void update_copy(int update, float a[], float b[], int n);
void checkpoint(float a[],int n);
void init(float a[], int n);


int main(){

   float a[N],b[N];
   omp_depend_t obj;

   init(a, N);

   #pragma omp depobj(obj) depend(inout: a)

   driver(TRUE,  a,b,N, &obj);  // updating a occurs

   #pragma omp depobj(obj) update(in)

   driver(FALSE, a,b,N, &obj);  // no updating of a

   #pragma omp depobj(obj) destroy  // obj is set to uninitialized state,
                                    // resources are freed
   return 0;

}

void driver(int update, float a[], float b[], int n, omp_depend_t *obj)
{
   #pragma omp parallel num_threads(2)
   #pragma omp single
   {

      #pragma omp task depend(depobj: *obj)  // Task 1, uses depend object
         update_copy(update, a,b,n);         // update a or not, always copy a to b

      #pragma omp task depend(in: a[:n])     // Task 2, only read a
         checkpoint(a,n);

   }
}

void update_copy(int update, float a[], float b[], int n)
{
   if(update) for(int i=0;i<n;i++) a[i]+=1.0f;

   for(int i=0;i<n;i++) b[i]=a[i];
}

void checkpoint(float a[], int n)
{
   for(int i=0;i<n;i++) printf(" %f ",a[i]);
   printf("\n");
}

void init(float a[], int n)
{
   for(int i=0;i<n;i++) a[i]=i;
}
sources/Example_depobj.1.f90 (new file)
@@ -0,0 +1,83 @@
! @@name: depobj.1f90
! @@type: F-free
! @@compilable: yes, omp_5.0
! @@linkable: yes
! @@expect: success
program main
   use omp_lib
   implicit none

   integer,parameter        :: N=100
   real                     :: a(N),b(N)
   integer(omp_depend_kind) :: obj

   call init(a, N)

   !$omp depobj(obj) depend(inout: a)

   call driver(.true.,  a,b,N, obj)   !! updating occurs

   !$omp depobj(obj) update(in)

   call driver(.false., a,b,N, obj)   !! no updating

   !$omp depobj(obj) destroy          !! obj is set to uninitialized state,
                                      !! resources are freed

end program

subroutine driver(update, a, b, n, obj)
   use omp_lib
   implicit none
   logical                  :: update
   real                     :: a(n), b(n)
   integer                  :: n
   integer(omp_depend_kind) :: obj

   !$omp parallel num_threads(2)

      !$omp single

         !$omp task depend(depobj: obj)      !! Task 1, uses depend object
            call update_copy(update, a,b,n)  !! update a or not, always copy a to b
         !$omp end task

         !$omp task depend(in: a)            !! Task 2, only read a
            call checkpoint(a,n)
         !$omp end task

      !$omp end single

   !$omp end parallel

end subroutine

subroutine update_copy(update, a, b, n)
   implicit none
   logical :: update
   real    :: a(n), b(n)
   integer :: n

   if (update) a = a + 1.0

   b = a

end subroutine

subroutine checkpoint( a, n)
   implicit none
   integer :: n
   real    :: a(n)
   integer :: i

   write(*,'( *(f5.0) )') (a(i), i=1,n)
end subroutine

subroutine init(a,n)
   implicit none
   integer :: n
   real    :: a(n)
   integer :: i

   a=[ (i, i=1,n) ]
end subroutine
@@ -22,7 +22,7 @@ void get_dev_cos(double *mem, size_t s)
      exit(1);
   }

   mem_dev_cpy = omp_target_alloc( sizeof(double) * s, t);
   mem_dev_cpy = (double *)omp_target_alloc( sizeof(double) * s, t);
   if(mem_dev_cpy == NULL){
      printf(" ERROR: No space left on device.\n");
      exit(1);
@@ -8,8 +8,7 @@
#include <omp.h>
void work(int i);

void incorrect()
{
void incorrect() {
   int np, i;

   np = omp_get_num_threads(); /* misplaced */
sources/Example_host_teams.1.c (new file)
@@ -0,0 +1,61 @@
/*
* @@name: host_teams.2.c
* @@type: C
* @@compilable: yes, omp_5.0
* @@linkable: yes
* @@expect: success
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#define N 1000

int main(){
   int nteams_required=2, max_thrds, tm_id;
   float  sp_x[N], sp_y[N], sp_a=0.0001e0;
   double dp_x[N], dp_y[N], dp_a=0.0001e0;

   max_thrds = omp_get_num_procs()/nteams_required; // threads available per team

   // Create 2 teams, each team works in a different precision
   #pragma omp teams num_teams(nteams_required) \
                     thread_limit(max_thrds) private(tm_id)
   {
      tm_id = omp_get_team_num();

      if( omp_get_num_teams() != 2 )   //if only getting 1, quit
      { printf("error: Insufficient teams on host, 2 required\n");
        exit(0);
      }

      if(tm_id == 0)   // Do Single Precision Work (SAXPY) with this team
      {
         #pragma omp parallel
         {
            #pragma omp for //init
            for(int i=0; i<N; i++){sp_x[i] = i*0.0001; sp_y[i]=i; }

            #pragma omp for simd simdlen(8)
            for(int i=0; i<N; i++){sp_x[i] = sp_a*sp_x[i] + sp_y[i];}
         }
      }

      if(tm_id == 1)   // Do Double Precision Work (DAXPY) with this team
      {
         #pragma omp parallel
         {
            #pragma omp for //init
            for(int i=0; i<N; i++){dp_x[i] = i*0.0001; dp_y[i]=i; }

            #pragma omp for simd simdlen(4)
            for(int i=0; i<N; i++){dp_x[i] = dp_a*dp_x[i] + dp_y[i];}
         }
      }
   }

   printf("i=%d sp|dp %f %f \n",N-1, sp_x[N-1], dp_x[N-1]);
   printf("i=%d sp|dp %f %f \n",N/2, sp_x[N/2], dp_x[N/2]);
   //OUTPUT1:i=999 sp|dp 999.000000 999.000010
   //OUTPUT2:i=500 sp|dp 500.000000 500.000005

   return 0;
}
sources/Example_host_teams.1.f90 (new file)
@@ -0,0 +1,64 @@
! @@name: host_teams.2.f90
! @@type: F-free
! @@compilable: yes, omp_5.0
! @@linkable: yes
! @@expect: success

program main
   use omp_lib
   integer :: nteams_required=2, max_thrds, tm_id
   integer,parameter :: N=1000
   real             :: sp_x(N), sp_y(N), sp_a=0.0001e0
   double precision :: dp_x(N), dp_y(N), dp_a=0.0001d0

   max_thrds = omp_get_num_procs()/nteams_required

   !! Create 2 teams, each team works in a different precision
   !$omp teams num_teams(nteams_required) thread_limit(max_thrds) private(tm_id)

      tm_id = omp_get_team_num()

      if( omp_get_num_teams() /= 2 ) then   !! if only getting 1, quit
         stop "error: Insufficient teams on host, 2 required."
      endif

      if(tm_id == 0) then   !! Do Single Precision Work (SAXPY) with this team

         !$omp parallel
            !$omp do !! init
            do i = 1,N
               sp_x(i) = i*0.0001e0
               sp_y(i) = i
            end do

            !$omp do simd simdlen(8)
            do i = 1,N
               sp_x(i) = sp_a*sp_x(i) + sp_y(i)
            end do
         !$omp end parallel

      endif

      if(tm_id == 1) then   !! Do Double Precision Work (DAXPY) with this team

         !$omp parallel
            !$omp do !! init
            do i = 1,N
               dp_x(i) = i*0.0001d0
               dp_y(i) = i
            end do

            !$omp do simd simdlen(4)
            do i = 1,N
               dp_x(i) = dp_a*dp_x(i) + dp_y(i)
            end do
         !$omp end parallel

      endif
   !$omp end teams

   write(*,'( "i=",i4," sp|dp= ", e15.7, d25.16 )') N,   sp_x(N),   dp_x(N)
   write(*,'( "i=",i4," sp|dp= ", e15.7, d25.16 )') N/2, sp_x(N/2), dp_x(N/2)
   !! i=1000 sp|dp=   0.1000000E+04   0.1000000010000000D+04
   !! i= 500 sp|dp=   0.5000000E+03   0.5000000050000000D+03
end program
@@ -23,11 +23,9 @@ int main (void)
         omp_set_num_threads(4);
         #pragma omp single
         {
            /*
             * The following should print:
             * Inner: max_act_lev=8, num_thds=3, max_thds=4
             * Inner: max_act_lev=8, num_thds=3, max_thds=4
             */
            // The following should print:
            // Inner: max_act_lev=8, num_thds=3, max_thds=4
            // Inner: max_act_lev=8, num_thds=3, max_thds=4
            printf ("Inner: max_act_lev=%d, num_thds=%d, max_thds=%d\n",
                    omp_get_max_active_levels(), omp_get_num_threads(),
                    omp_get_max_threads());
@@ -37,10 +35,8 @@ int main (void)
      #pragma omp barrier
      #pragma omp single
      {
         /*
          * The following should print:
          * Outer: max_act_lev=8, num_thds=2, max_thds=3
          */
         // The following should print:
         // Outer: max_act_lev=8, num_thds=2, max_thds=3
         printf ("Outer: max_act_lev=%d, num_thds=%d, max_thds=%d\n",
                 omp_get_max_active_levels(), omp_get_num_threads(),
                 omp_get_max_threads());
@@ -7,15 +7,13 @@
 */
#include <omp.h>

omp_lock_t *new_locks()
{
omp_lock_t *new_locks() {
   int i;
   omp_lock_t *lock = new omp_lock_t[1000];

   #pragma omp parallel for private(i)
   for (i=0; i<1000; i++)
   {
      omp_init_lock(&lock[i]);
   }
   { omp_init_lock(&lock[i]); }

   return lock;
}
@@ -6,7 +6,6 @@
      FUNCTION NEW_LOCKS()
        USE OMP_LIB        ! or INCLUDE "omp_lib.h"
        INTEGER(OMP_LOCK_KIND), DIMENSION(1000) :: NEW_LOCKS

        INTEGER I

!$OMP   PARALLEL DO PRIVATE(I)
@@ -16,7 +16,8 @@ omp_lock_t *new_locks()
   for (i=0; i<1000; i++)
   {
      omp_init_lock_with_hint(&lock[i],
                   omp_lock_hint_contended | omp_lock_hint_speculative);
                   static_cast<omp_lock_hint_t>(omp_lock_hint_contended |
                                                omp_lock_hint_speculative));
   }
   return lock;
}
sources/Example_loop.1.c (new file)
@@ -0,0 +1,22 @@
/*
* @@name: loop.2c
* @@type: C
* @@compilable: yes, omp_5.0
* @@linkable: yes
* @@expect: success
*/
#include <stdio.h>
#define N 100
int main()
{
   float x[N], y[N];
   float a = 2.0;
   for(int i=0;i<N;i++){ x[i]=i; y[i]=0;} // initialize

   #pragma omp parallel
   {
      #pragma omp loop
      for(int i = 0; i < N; ++i) y[i] = a*x[i] + y[i];
   }
   if(y[N-1] != (N-1)*2.0) printf("Error: 2*(N-1) != y[N-1]=%f",y[N-1]);
}
sources/Example_loop.1.f90 (new file)
@@ -0,0 +1,19 @@
! @@name: loop.2f90
! @@type: F-free
! @@compilable: yes, omp_5.0
! @@linkable: yes
! @@expect: success
program main
   integer, parameter :: N=100
   real :: x(N), y(N)
   real :: a = 2.0e0

   x=[ (i, i=1,N) ]; y=0.0e0     !! initialize

   !$omp parallel
   !$omp loop
   do i=1,N; y(i) = a*x(i) + y(i); enddo
   !$omp end parallel

   if(y(N) /= N*2.0e0) print*,"Error: 2*N /= y(N); y(N)=",y(N)
end program
@@ -16,9 +16,9 @@
         !Flush of FLAG is implied by the atomic directive
      ELSE IF(OMP_GET_THREAD_NUM() .EQ. 1) THEN
         ! Loop until we see that FLAG reaches 1
!$OMP    FLUSH(FLAG, DATA)
!$OMP    FLUSH(FLAG)
         DO WHILE(FLAG .LT. 1)
!$OMP       FLUSH(FLAG, DATA)
!$OMP       FLUSH(FLAG)
         ENDDO

         PRINT *, 'Thread 1 awoken'
@@ -29,9 +29,9 @@
         !Flush of FLAG is implied by the atomic directive
      ELSE IF(OMP_GET_THREAD_NUM() .EQ. 2) THEN
         ! Loop until we see that FLAG reaches 2
!$OMP    FLUSH(FLAG, DATA)
!$OMP    FLUSH(FLAG)
         DO WHILE(FLAG .LT. 2)
!$OMP       FLUSH(FLAG, DATA)
!$OMP       FLUSH(FLAG)
         ENDDO

         PRINT *, 'Thread 2 awoken'
sources/Example_metadirective.1.c (new file)
@@ -0,0 +1,26 @@
/*
* @@name: metadirective.1c
* @@type: C
* @@compilable: yes, omp_5.0
* @@linkable: yes
* @@expect: success
*/

#define N 100
#include <stdio.h>

int main()
{
   int v1[N], v2[N], v3[N];
   for(int i=0; i<N; i++){ v1[i]=(i+1); v2[i]=-(i+1); }

   #pragma omp target map(to:v1,v2) map(from:v3) device(0)
   #pragma omp metadirective \
               when( device={arch("nvptx")}: teams loop) \
               default( parallel loop)
   for (int i= 0; i< N; i++) v3[i] = v1[i] * v2[i];

   printf(" %d %d\n",v3[0],v3[N-1]); //output: -1 -10000

   return 0;
}
sources/Example_metadirective.1.f90 (new file)
@@ -0,0 +1,20 @@
! @@name: metadirective.2f90
! @@type: F-free
! @@compilable: yes, omp_5.0
! @@linkable: yes
! @@expect: success

program main
   integer, parameter :: N= 100
   integer :: v1(N), v2(N), v3(N)

   do i=1,N; v1(i)=i; v2(i)=-i; enddo   ! initialize

   !$omp target map(to:v1,v2) map(from:v3) device(0)
   !$omp metadirective &
   !$omp&   when( device={arch("nvptx")}: teams loop) &
   !$omp&   default( parallel loop)
   do i= 1,N; v3(i) = v1(i) * v2(i); enddo
   !$omp end target

   print *, v3(1),v3(N)   !!output: -1 -10000
end program
Some files were not shown because too many files have changed in this diff.