From 3346a30ce2414def7b42140d207eb282186f47ea Mon Sep 17 00:00:00 2001 From: Henry Jin Date: Wed, 13 Nov 2024 11:07:08 -0800 Subject: [PATCH] v6.0 release --- Chap_data_environment.tex | 1 + Chap_devices.tex | 2 + Chap_directives.tex | 23 ++- Chap_loop_transformations.tex | 3 +- Chap_program_control.tex | 1 + Chap_tasking.tex | 1 + Contributions.md | 10 ++ Deprecated_Features.tex | 30 ++++ Foreword_Chapt.tex | 27 +-- History.tex | 68 +++++++ Makefile | 29 +-- STYLE_GUIDE.md | 46 +++++ Title_Page.tex | 2 +- affinity/affinity.tex | 12 +- affinity/affinity_display.tex | 13 +- data_environment/associate.tex | 10 +- data_environment/carrays_fpriv.tex | 4 +- data_environment/copyprivate.tex | 4 +- data_environment/cpp_reference.tex | 4 +- data_environment/default_none.tex | 4 +- data_environment/fort_loopvar.tex | 4 +- data_environment/fort_sa_private.tex | 5 +- data_environment/fort_shared_var.tex | 4 +- data_environment/fort_sp_common.tex | 4 +- data_environment/induction.tex | 67 +++++++ data_environment/reduction.tex | 35 +++- data_environment/scan.tex | 17 ++ data_environment/sources/induction.1.c | 48 +++++ data_environment/sources/induction.1.f90 | 48 +++++ data_environment/sources/induction.2.cpp | 47 +++++ data_environment/sources/induction.2.f90 | 66 +++++++ data_environment/sources/priv_reduction.1.c | 35 ++++ data_environment/sources/priv_reduction.1.f90 | 35 ++++ data_environment/sources/priv_reduction.2.cpp | 34 ++++ data_environment/sources/priv_reduction.2.f90 | 34 ++++ data_environment/sources/priv_reduction.3.c | 24 +++ data_environment/sources/priv_reduction.3.f90 | 20 +++ data_environment/sources/scan.3.c | 40 +++++ data_environment/sources/scan.3.f90 | 37 ++++ data_environment/sources/udr.1.c | 10 +- data_environment/sources/udr.1.f90 | 8 +- data_environment/sources/udr.2.c | 18 +- data_environment/sources/udr.2.f90 | 14 +- data_environment/sources/udr.3.c | 8 +- data_environment/sources/udr.3.f90 | 8 +- data_environment/sources/udr.4.f90 | 7 +- 
data_environment/sources/udr.5.cpp | 4 +- data_environment/sources/udr.6.cpp | 17 +- data_environment/threadprivate.tex | 12 +- data_environment/udr.tex | 83 +++++---- devices/C++_virtual_functions.tex | 8 +- devices/array_shaping.tex | 8 +- devices/async_target_nowait_arg.tex | 11 ++ devices/declare_target.tex | 8 +- devices/device_env_traits.tex | 65 +++++++ devices/lambda_expressions.tex | 5 +- devices/sources/async_target.5.c | 42 +++++ devices/sources/async_target.5.f90 | 41 +++++ devices/sources/teams.7.c | 27 +++ devices/sources/teams.7.f90 | 25 +++ devices/sources/usm_scalar_ptr_ref_asc.1.f90 | 2 +- .../target_fort_allocatable_array_mapping.tex | 11 +- devices/teams.tex | 12 +- devices/usm.tex | 9 +- directives/attributes.tex | 50 ++++-- directives/fixed_format_comments.tex | 6 +- directives/free_format_comments.tex | 10 +- directives/pragmas.tex | 4 +- .../sources/directive_syntax_attribute.1.cpp | 4 +- .../sources/directive_syntax_attribute.2.cpp | 36 ++++ introduction/Examples.tex | 11 +- loop_transformations/apply.tex | 150 ++++++++++++++++ loop_transformations/sources/apply_nested.1.c | 14 ++ .../sources/apply_nested.1.f90 | 15 ++ .../sources/apply_nested_equivalent.1.c | 39 ++++ .../sources/apply_nested_equivalent.1.f90 | 46 +++++ loop_transformations/sources/apply_span.1.c | 16 ++ loop_transformations/sources/apply_span.1.f90 | 18 ++ .../sources/apply_span_equivalent.1.c | 52 ++++++ .../sources/apply_span_equivalent.1.f90 | 64 +++++++ loop_transformations/sources/apply_syntax.1.c | 21 +++ .../sources/apply_syntax.1.f90 | 27 +++ loop_transformations/sources/apply_syntax.2.c | 19 ++ .../sources/apply_syntax.2.f90 | 21 +++ loop_transformations/sources/apply_syntax.3.c | 16 ++ .../sources/apply_syntax.3.f90 | 19 ++ .../sources/apply_syntax_equivalent.1.c | 16 ++ .../sources/apply_syntax_equivalent.1.f90 | 18 ++ .../sources/apply_syntax_equivalent.2.c | 25 +++ .../sources/apply_syntax_equivalent.2.f90 | 29 +++ .../sources/apply_syntax_equivalent.3.c | 27 
+++ .../sources/apply_syntax_equivalent.3.f90 | 37 ++++ memory_model/fort_race.tex | 6 +- openmp-examples.tex | 2 +- openmp.sty | 166 +++++++++++++----- parallel_execution/fort_do.tex | 4 +- parallel_execution/loop.tex | 6 +- parallel_execution/pra_iterator.tex | 4 +- parallel_execution/workshare.tex | 4 +- program_control/assumption.tex | 20 ++- program_control/cancellation.tex | 16 +- program_control/cond_comp.tex | 8 +- program_control/dispatch.tex | 60 +++++++ program_control/icv.tex | 55 +++++- program_control/sources/assumption.1.c | 3 - program_control/sources/assumption.1.f90 | 3 - program_control/sources/assumption.2.c | 34 ++++ program_control/sources/assumption.2.f90 | 40 +++++ program_control/sources/cancellation.2.c | 5 +- program_control/sources/cancellation.2.f90 | 24 +-- program_control/sources/declare_variant.3.c | 2 +- program_control/sources/dispatch.1.c | 59 +++++++ program_control/sources/dispatch.1.f90 | 62 +++++++ program_control/sources/icv.2.c | 87 +++++++++ program_control/sources/icv.2.f90 | 77 ++++++++ program_control/sources/requires.1.f90 | 1 + program_control/sources/selector_scoring.1.c | 18 +- program_control/sources/selector_scoring.2.c | 30 ++-- .../sources/selector_scoring.2.f90 | 14 +- program_control/sources/standalone.1.f90 | 1 - .../sources/target_offload_control.1.c | 11 +- program_control/standalone.tex | 2 +- synchronization/atomic_cas.tex | 6 +- synchronization/atomic_restrict.tex | 4 +- synchronization/sources/atomic.4.c | 1 + synchronization/sources/atomic.4.f90 | 4 +- tasking/sources/task_dep.14.c | 55 ++++++ tasking/sources/task_dep.14.f90 | 49 ++++++ tasking/sources/taskloop_dep.1.c | 35 ++++ tasking/sources/taskloop_dep.1.f90 | 36 ++++ tasking/sources/taskloop_dep.2.c | 29 +++ tasking/sources/taskloop_dep.2.f90 | 29 +++ tasking/task_dep.tex | 32 ++++ tasking/taskloop_dep.tex | 51 ++++++ util/latexdiff/latexdiff | 4 +- util/latexdiff/latexdiff-fast | 4 +- util/latexdiff/latexdiff-vc | 2 +- versioninfo | 12 +- 138 
files changed, 3009 insertions(+), 339 deletions(-) create mode 100644 STYLE_GUIDE.md create mode 100644 data_environment/induction.tex create mode 100644 data_environment/sources/induction.1.c create mode 100644 data_environment/sources/induction.1.f90 create mode 100644 data_environment/sources/induction.2.cpp create mode 100644 data_environment/sources/induction.2.f90 create mode 100644 data_environment/sources/priv_reduction.1.c create mode 100644 data_environment/sources/priv_reduction.1.f90 create mode 100644 data_environment/sources/priv_reduction.2.cpp create mode 100644 data_environment/sources/priv_reduction.2.f90 create mode 100644 data_environment/sources/priv_reduction.3.c create mode 100644 data_environment/sources/priv_reduction.3.f90 create mode 100644 data_environment/sources/scan.3.c create mode 100644 data_environment/sources/scan.3.f90 create mode 100644 devices/async_target_nowait_arg.tex create mode 100644 devices/device_env_traits.tex create mode 100644 devices/sources/async_target.5.c create mode 100644 devices/sources/async_target.5.f90 create mode 100644 devices/sources/teams.7.c create mode 100644 devices/sources/teams.7.f90 create mode 100644 directives/sources/directive_syntax_attribute.2.cpp create mode 100644 loop_transformations/apply.tex create mode 100644 loop_transformations/sources/apply_nested.1.c create mode 100644 loop_transformations/sources/apply_nested.1.f90 create mode 100644 loop_transformations/sources/apply_nested_equivalent.1.c create mode 100644 loop_transformations/sources/apply_nested_equivalent.1.f90 create mode 100644 loop_transformations/sources/apply_span.1.c create mode 100644 loop_transformations/sources/apply_span.1.f90 create mode 100644 loop_transformations/sources/apply_span_equivalent.1.c create mode 100644 loop_transformations/sources/apply_span_equivalent.1.f90 create mode 100644 loop_transformations/sources/apply_syntax.1.c create mode 100644 loop_transformations/sources/apply_syntax.1.f90 create mode 
100644 loop_transformations/sources/apply_syntax.2.c create mode 100644 loop_transformations/sources/apply_syntax.2.f90 create mode 100644 loop_transformations/sources/apply_syntax.3.c create mode 100644 loop_transformations/sources/apply_syntax.3.f90 create mode 100644 loop_transformations/sources/apply_syntax_equivalent.1.c create mode 100644 loop_transformations/sources/apply_syntax_equivalent.1.f90 create mode 100644 loop_transformations/sources/apply_syntax_equivalent.2.c create mode 100644 loop_transformations/sources/apply_syntax_equivalent.2.f90 create mode 100644 loop_transformations/sources/apply_syntax_equivalent.3.c create mode 100644 loop_transformations/sources/apply_syntax_equivalent.3.f90 create mode 100644 program_control/dispatch.tex create mode 100644 program_control/sources/assumption.2.c create mode 100644 program_control/sources/assumption.2.f90 create mode 100644 program_control/sources/dispatch.1.c create mode 100644 program_control/sources/dispatch.1.f90 create mode 100644 program_control/sources/icv.2.c create mode 100644 program_control/sources/icv.2.f90 create mode 100644 tasking/sources/task_dep.14.c create mode 100644 tasking/sources/task_dep.14.f90 create mode 100644 tasking/sources/taskloop_dep.1.c create mode 100644 tasking/sources/taskloop_dep.1.f90 create mode 100644 tasking/sources/taskloop_dep.2.c create mode 100644 tasking/sources/taskloop_dep.2.f90 create mode 100644 tasking/taskloop_dep.tex diff --git a/Chap_data_environment.tex b/Chap_data_environment.tex index 245a12a..b353e0c 100644 --- a/Chap_data_environment.tex +++ b/Chap_data_environment.tex @@ -86,6 +86,7 @@ in the \docref{\kcode{map} Clause} subsection of the OpenMP Specifications docum \input{data_environment/lastprivate} \input{data_environment/reduction} \input{data_environment/udr} +\input{data_environment/induction} \input{data_environment/scan} \input{data_environment/copyin} \input{data_environment/copyprivate} diff --git a/Chap_devices.tex b/Chap_devices.tex 
index 0f4f0d0..fcd7871 100644 --- a/Chap_devices.tex +++ b/Chap_devices.tex @@ -73,5 +73,7 @@ clause introduced in OpenMP 4.5. \input{devices/async_target_with_tasks} \input{devices/async_target_nowait} \input{devices/async_target_nowait_depend} +\input{devices/async_target_nowait_arg} \input{devices/device} +\input{devices/device_env_traits} diff --git a/Chap_directives.tex b/Chap_directives.tex index 7572009..7f840e6 100644 --- a/Chap_directives.tex +++ b/Chap_directives.tex @@ -3,8 +3,8 @@ \index{directive syntax} OpenMP \plc{directives} use base-language mechanisms to specify OpenMP program behavior. -In C code, the directives are formed exclusively with pragmas, whereas in C++ -code, directives are formed from either pragmas or attributes. +In C/C++ code, the directives are formed with +either pragmas or attributes. Fortran directives are formed with comments in free form and fixed form sources (codes). All of these mechanisms allow the compilation to ignore the OpenMP directives if OpenMP is not supported or enabled. @@ -20,18 +20,27 @@ The formats for combining a base-language mechanism and a \plc{directive-specifi C/C++ pragmas \begin{indentedcodelist} -\kcode{\#pragma omp} \plc{directive-specification} +#pragma omp \plc{directive-specification} \end{indentedcodelist} -C++ attributes +C/C++ attribute specifiers \begin{indentedcodelist} -\kcode{[[omp :: directive( \plc{directive-specification} )]]} -\kcode{[[using omp : directive( \plc{directive-specification} )]]} +[[omp :: directive( \plc{directive-specification} )]] +[[omp :: decl( \plc{directive-specification} )]] \end{indentedcodelist} +C++ attribute specifiers +\begin{indentedcodelist} +[[using omp : directive( \plc{directive-specification} )]] +[[using omp : decl( \plc{directive-specification} )]] +\end{indentedcodelist} + +where the \kcode{decl} attribute may be used for declarative +directives alternatively. 
+ Fortran comments \begin{indentedcodelist} -\scode{!$omp} \plc{directive-specification} +!$omp \plc{directive-specification} \end{indentedcodelist} where \scode{c$omp} and \scode{*$omp} may be used in Fortran fixed form sources. diff --git a/Chap_loop_transformations.tex b/Chap_loop_transformations.tex index de7d5ad..f73aef2 100644 --- a/Chap_loop_transformations.tex +++ b/Chap_loop_transformations.tex @@ -21,6 +21,7 @@ whereby specific hot spots can be affected by transformation directives. %===== Examples Sections ===== \input{loop_transformations/tile} -\input{loop_transformations/unroll} \input{loop_transformations/partial_tile} +\input{loop_transformations/unroll} +\input{loop_transformations/apply} diff --git a/Chap_program_control.tex b/Chap_program_control.tex index 8461b0d..6835b13 100644 --- a/Chap_program_control.tex +++ b/Chap_program_control.tex @@ -105,6 +105,7 @@ chapter in the OpenMP Specifications document. \input{program_control/cancellation} \input{program_control/requires} \input{program_control/context_based_variants} +\input{program_control/dispatch} \input{program_control/nested_loop} \input{program_control/nesting_restrict} \input{program_control/target_offload} diff --git a/Chap_tasking.tex b/Chap_tasking.tex index 24d5a6a..655a309 100644 --- a/Chap_tasking.tex +++ b/Chap_tasking.tex @@ -59,4 +59,5 @@ can be found in the \docref{Tasking Constructs} chapter of the OpenMP Specificat \input{tasking/taskyield} \input{tasking/taskloop} \input{tasking/parallel_masked_taskloop} +\input{tasking/taskloop_dep} diff --git a/Contributions.md b/Contributions.md index 5620f9e..144c9b7 100644 --- a/Contributions.md +++ b/Contributions.md @@ -158,9 +158,19 @@ The following describes LaTeX macros defined specifically for examples. \cppspecificstart, \cppspecificend \ccppspecificstart, \ccppspecificend \fortranspecificstart, \fortranspecificend + \begin{cspecific}[s] ... \end{cspecific} + \begin{cppspecific}[s] ... 
\end{cppspecific} + \begin{ccppspecific}[s] ... \end{ccppspecific} + \begin{fortranspecific}[s] ... \end{fortranspecific} \topmarker{Lang} ``` + Use of the structured `\begin{} .. \end{}` environments is the preferred + way of specifying language-dependent text over the unstructured approach + of using `\*specificstart` and `\*specificend`. + The option `[s]` to each of the environments can specify a vertical shift + for the beginning rule, such as when followed by a section header. + The macro `\topmarker` puts a dashed blue line floater at top of a page for "Lang (cont.)" where `Lang` can be `C/C++`, `C++`, `Fortran`. diff --git a/Deprecated_Features.tex b/Deprecated_Features.tex index 09ebe24..0f84611 100644 --- a/Deprecated_Features.tex +++ b/Deprecated_Features.tex @@ -58,6 +58,9 @@ accordingly and listed in Section~\ref{sec:Updated Examples}. \tablelasttail{\hline\\[-2ex]} \tablecaption{Deprecated Features and Their Replacements\label{tab:Deprecated Features}} \begin{supertabular}{p{0.4in} p{2.3in} p{2.2in}} +6.0 & \kcode{declare reduction(}\plc{reduction-id}: \plc{typename-list}: \plc{combiner}\kcode{)} + & \kcode{declare reduction(}\plc{reduction-id}: \plc{typename-list}\kcode{)} \kcode{combiner(\plc{combiner-exp})} \\ +\hline 5.2 & \kcode{default} clause on metadirectives & \kcode{otherwise} clause \\ 5.2 & delimited \kcode{declare target} directive for C/C++ @@ -98,6 +101,32 @@ the tables shows the version tag of the earlier version. It also shows the prior name of an example when it has been renamed. +Table~\ref{tab:Updated Examples 6.0} lists the updated examples for +features deprecated in OpenMP 6.0 +in the Examples Document Version +\href{https://github.com/OpenMP/Examples/tree/v6.0}{6.0}. +The \emph{Earlier Version} column of the table lists the earlier version +tags of the examples that can be found in +the Examples Document Version +\href{https://github.com/OpenMP/Examples/tree/v5.2}{5.2}. 
+ +\index{clauses!combiner@\kcode{combiner}} +\index{combiner clause@\kcode{combiner} clause} + +\nolinenumbers +\dpftable{6.0} +\begin{supertabular}{p{1.7in} p{1.1in} p{2.2in}} + \hexentry{udr.1}[f90]{4.0} & + \plc{combiner} expression in \kcode{declare} \\ + \hexentry{udr.2}[f90]{4.0} & + \kcode{reduction} directive changed to use \\ + \hexentry{udr.3}[f90]{4.0} & \kcode{combiner} clause \\ + \hexentry[f90]{udr.4}{4.0} & \\ + \hexentry[cpp]{udr.5}{4.0} & \\ + \hexentry[cpp]{udr.6}{4.0} & \\[2pt] +\end{supertabular} + +\linenumbers Table~\ref{tab:Updated Examples 5.2} lists the updated examples for features deprecated in OpenMP 5.2 in the Examples Document Version \examplesref{5.2}. @@ -195,6 +224,7 @@ the Examples Document Version \examplesref{5.1}. \end{supertabular} \linenumbers +\newpage Table~\ref{tab:Updated Examples 5.1} lists the updated examples for features deprecated in OpenMP 5.1 in the Examples Document Version \examplesref{5.1}. diff --git a/Foreword_Chapt.tex b/Foreword_Chapt.tex index 6499f31..e468fd7 100644 --- a/Foreword_Chapt.tex +++ b/Foreword_Chapt.tex @@ -2,20 +2,27 @@ \label{chap:foreword} The OpenMP Examples document has been updated with new features -found in the OpenMP \PVER\ Specification. The additional examples and updates -are referenced in the Document Revision History of the Appendix on page~\pageref{chap:history}. +found in the OpenMP \SVER\ Specification. +In order to provide users with new feature examples concurrently +with the release of the OpenMP 6.0 Specification, +the 6.0 Examples document is being released early +with a caveat that some of the 6.0 features +(such as \kcode{workdistribute} construct, \kcode{taskgraph} construct, +\kcode{threadset} clause and free-agent threads) will be covered +in the next release of the document. +For a list of the new examples and updates in this release, +please refer to the Document Revision History of the Appendix on page~\pageref{chap:history}. 
-Text describing an example with a \PVER\ feature specifically states -that the feature support begins in the OpenMP \PVER\ Specification. Also, -an \kcode{\small{}omp_\PVER} keyword is included in the metadata of the source code. - -These distinctions are presented to remind readers that a \PVER\ compliant +Text describing an example with a \SVER\ feature specifically states +that the feature support begins in the OpenMP \SVER\ Specification. Also, +an \kcode{\small{}omp_\SVER} keyword is included in the metadata of the source code. +These distinctions are presented to remind readers that a \SVER\ compliant OpenMP implementation is necessary to use these features in codes. -Examples for most of the \PVER\ features are included in this document, -and incremental releases will become available as more feature examples +%Examples for most of the \SVER\ features are included in this document, +%and +Incremental releases will become available as more feature examples and updates are submitted and approved by the OpenMP Examples Subcommittee. - Examples are accepted for this document after discussions, revisions and reviews in the Examples Subcommittee, and two reviews/discussions and two votes in the OpenMP Language Committee. diff --git a/History.tex b/History.tex index 30bc30c..67323ab 100644 --- a/History.tex +++ b/History.tex @@ -1,6 +1,74 @@ \cchapter{Document Revision History}{history} \label{chap:history} +%===================================== +\section{Changes from 5.2.2 to 6.0} +\label{sec:history_522_to_60} + +\begin{itemize} +\item General changes: +\begin{itemize} + \item Added a set of structured LaTeX environments for specifying + language-dependent text. This allows extracting language-specific + content of the Examples document. Refer to the content of + \examplesblob{v6.0/Contributions.md} for details. 
+\end{itemize} + +\item Added the following examples for the 6.0 features: +\begin{itemize} + \item \kcode{omp::decl} attribute for declarative directives in C/C++ + (\specref{sec:attributes}) + \item \kcode{transparent} clause on the \kcode{task} construct to enable dependences + between non-sibling tasks (\specref{subsec:depend_trans_task}) + \item Task dependences for \kcode{taskloop} construct + (\specref{sec:taskloop_depend}) + \item \kcode{num_threads} clause that appears inside \kcode{target} region + (\specref{subsec:target_teams_num_teams}) + \item \kcode{nowait} clause with argument on the \kcode{target} construct to control deferment + of target task (\specref{subsec:async_target_nowait_arg}) + \item Traits for specifying devices (\specref{sec:device_env_traits}) + \item \kcode{apply} clause with modifier argument to + support selective loop transformations + (\specref{sec:apply_clause}) + \item Reduction on private variables in a \kcode{parallel} region + (\specref{subsec:priv_reduction}) + \item \kcode{induction} clause (\specref{subsec:induction}) + and user-defined induction (\specref{subsec:user-defined-induction}) + \item \kcode{init_complete} clause for \kcode{scan} directive to + support initialization phase in scan operation + (\specref{sec:scan}) + \item \kcode{assume} construct with \kcode{no_openmp} and \kcode{no_parallelism} clauses (\specref{sec:assumption}) + \item \kcode{num_threads} clause with a list + (\specref{subsec:icv_nthreads}) + \item \kcode{dispatch} construct to control variant substitution + for a procedure call (\specref{sec:dispatch}) +\end{itemize} + +\item Other changes: +\begin{itemize} + \item Changed attribute specifier as a directive form from C++ only to C/C++ + (\specref{chap:directive_syntax}) + \item Added missing \bcode{include <omp.h>} in Example \example{atomic.4.c} + and \bcode{use omp_lib} in Example \example{atomic.4.f90} + (\specref{sec:atomic_hint}) + \item Fixed the function declaration order for variant 
functions in + Examples \example{selector_scoring.[12].c} and Fortran pointer + initialization in Example \example{selector_scoring.2.f90} + (\specref{subsec:context_selector_scoring}) + \item Replaced the deprecated use of \plc{combiner-exp} + in \kcode{declare reduction} directive with \kcode{combiner} clause + (\specref{subsec:UDR} and \specref{sec:Updated Examples}) + \item Fixed the initialization of Fortran pointers + in Example \example{cancellation.2.f90} and changed to + use \kcode{atomic write} for performing atomic writes + (\specref{sec:cancellation}) + \item Added missing \kcode{declare target} directive for external procedure + called inside \kcode{target} region in Example + \example{requires.1.f90} (\specref{sec:requires}) +\end{itemize} + +\end{itemize} + %===================================== \section{Changes from 5.2.1 to 5.2.2} \label{sec:history_521_to_522} diff --git a/Makefile b/Makefile index 5944af8..ae96c3c 100644 --- a/Makefile +++ b/Makefile @@ -4,15 +4,21 @@ include versioninfo default: openmp-examples.pdf -diff: openmp-diff-abridged.pdf +diff: clean openmp-diff-abridged.pdf -book: BOOK_BUILD="\\\\def\\\\bookbuild{1}" -book: VERSIONSTR="$(version_date)" -book: clean openmp-examples.pdf - mv openmp-examples-${version}.pdf openmp-examples-${version}-book.pdf release: VERSIONSTR="$(version_date)" release: clean openmp-examples.pdf +book: BOOK_BUILD="\\\\def\\\\bookbuild{1}" +book: clean release + mv openmp-examples-${version}.pdf openmp-examples-${version}-book.pdf + +ccpp-only: LANG_OPT="\\\\ccpptrue\\\\fortranfalse" +ccpp-only: clean release + +fortran-only: LANG_OPT="\\\\ccppfalse\\\\fortrantrue" +fortran-only: clean release + CHAPTERS=Title_Page.tex \ Foreword_Chapt.tex \ Chap_*.tex \ @@ -41,8 +47,9 @@ LATEXDCMD=$(LATEXCMD) -draftmode # check for branches names with "name_XXX" DIFF_TICKET_ID=$(shell git rev-parse --abbrev-ref HEAD) -GITREV=$(shell git rev-parse --short HEAD) +GITREV=$(shell git rev-parse --short HEAD || echo "??") 
VERSIONSTR="GIT rev $(GITREV)" +LANG_OPT="\\\\ccpptrue\\\\fortrantrue" openmp-examples.pdf: $(CHAPTERS) $(SOURCES) openmp.sty openmp-examples.tex openmp-logo.png generated-include.tex rm -f $(INTERMEDIATE_FILES) @@ -75,15 +82,15 @@ endif ifdef DIFF_FROM VC_DIFF_FROM := -r ${DIFF_FROM} else - VC_DIFF_FROM := -r main + VC_DIFF_FROM := -r work_6.0 endif DIFF_TO:=HEAD -DIFF_FROM:=main +DIFF_FROM:=work_6.0 DIFF_TYPE:=UNDERLINE COMMON_DIFF_OPTS:=--math-markup=whole \ - --append-safecmd=plc,code,hcode,scode,pcode,splc \ + --append-safecmd=plc,code,kcode,scode,ucode,vcode,splc,bcode,pvar,pout,example \ --append-textcmd=subsubsubsection VC_DIFF_OPTS:=${COMMON_DIFF_OPTS} --force -c latexdiff.cfg --flatten --type="${DIFF_TYPE}" --git --pdf ${VC_DIFF_FROM} ${VC_DIFF_TO} --subtype=ZLABEL --graphics-markup=none @@ -94,8 +101,10 @@ generated-include.tex: echo "$(BOOK_BUILD)" echo "$(BOOK_BUILD)" > $@ echo "\def\VER{${version}}" >> $@ - echo "\def\PVER{${version_spec}}" >> $@ + echo "\def\SVER{${version_spec}}" >> $@ echo "\def\VERDATE{${VERSIONSTR}}" >> $@ + echo "\\\\newif\ifccpp\\\\newif\iffortran" >> $@ + echo "$(LANG_OPT)" >> $@ util/list_tags -vtag */sources/* >> $@ %.tmpdir: $(wildcard *.sty) $(wildcard *.png) $(wildcard *.aux) openmp-examples.pdf diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md new file mode 100644 index 0000000..8abb056 --- /dev/null +++ b/STYLE_GUIDE.md @@ -0,0 +1,46 @@ +### OpenMP Examples Coding Style Guide + +Must Dos: +- Indents and Braces + - Code: Follow common base language practices. + - Where indents are normally used, use 2 spaces instead of tabs. + - Comments: Follow the indent of the base language for which the comment applies. + - OpenMP directives should be indented as if it's base language code where possible. + - Braces `{}` around structured blocks following directives must be on a new line and must follow base language indent. 
+ - For C/C++ examples, for code blocks with multiple lines, the if-else statements must follow the following format: + ``` + if { + } else { + } + ``` +- All Section and sub-section headings must be in Title case. For example: "This is a Useful Example of X Directive". + +- Comments + - Comments go on a new line before the relevant code/code block. + - Expected results may go on the same line. + - Keep comments terse; detailed explanations go in the text. + +- Output + - If there is a deterministic output, provide it. + - It can be done in one of the following ways: + - Specify the correct value in a comment. + - Code prints out "expected" and "run" values. + - Test for the correctness of a value in a conditional. + - If the test is expected to execute, return values must be used to indicate success or failure. + - For tests that produce incorrect results, use: + - `return(1)` for C/C++ + - `stop 1` for Fortran (do not exit) + - For tests that need to discontinue execution, use: + - `exit(1)` for C/C++ + - `error stop` for Fortran + - Validation messages such as "Pass" / "Fail" are not mandatory. + - A single "pass" or "fail" is sufficient for a multi-case test. + +- To Verify Metadata: + - A tool in the repository at the top level, "make check", scans all sources for version tags and ensures line length is 75 characters max. + - Inside `util`, there is `chk_tags` (see different options) that can accept 1 file and scan for all specified values. + +Don’ts: +- Do not use fixed-format Fortran for new examples unless required by the feature. +- Do not use all-caps for emphasis in the document. 
+ diff --git a/Title_Page.tex b/Title_Page.tex index 6d28390..f32d32d 100644 --- a/Title_Page.tex +++ b/Title_Page.tex @@ -12,7 +12,7 @@ \textsf{OpenMP\\Application Programming\\Interface} % An optional subtitle can go here: - \vspace{0.5in}\textsf{Examples}\vspace{-0.7in} + \vspace{0.5in}\textsf{\langselect Examples}\vspace{-0.7in} \normalsize \vspace{1.0in} diff --git a/affinity/affinity.tex b/affinity/affinity.tex index 54e5e97..fdd6e5f 100644 --- a/affinity/affinity.tex +++ b/affinity/affinity.tex @@ -23,12 +23,14 @@ starting from 0, such that the hardware threads 0,1 form the first physical core The following equivalent place list declarations consist of eight places (which we designate as p0 to p7): - -\kcode{OMP_PLACES}=\verb+"{0,1},{2,3},{4,5},{6,7},{8,9},{10,11},{12,13},{14,15}"+ - +\begin{boxeducode} +\kcode{export OMP_PLACES=}"{0,1},{2,3},{4,5},{6,7},{8,9},{10,11},{12,13}, +{14,15}" +\end{boxeducode} or - -\kcode{OMP_PLACES}=\verb+"{0:2}:8:2"+ +\begin{boxeducode} +\kcode{export OMP_PLACES=}"{0:2}:8:2" +\end{boxeducode} \subsection{Spread Affinity Policy} \label{subsec:affinity_spread} diff --git a/affinity/affinity_display.tex b/affinity/affinity_display.tex index 0b06920..6660574 100644 --- a/affinity/affinity_display.tex +++ b/affinity/affinity_display.tex @@ -47,12 +47,13 @@ a nested parallel region runs half of the available threads on each socket. 
These OpenMP environment variables have been set: -\begin{compactitem} -\item \kcode{OMP_PROC_BIND}=\verb+"TRUE"+ -\item \kcode{OMP_NUM_THREADS}=\verb+"2,4"+ -\item \kcode{OMP_PLACES}=\verb+"{0,2,4,6},{1,3,5,7}"+ -\item \kcode{OMP_AFFINITY_FORMAT}=\verb+"nest_level= %L, parent_thrd_num= %a,+ \verb+thrd_num= %n, thrd_affinity= %A"+ -\end{compactitem} +\begin{boxeducode} +\kcode{export OMP_PROC_BIND=}"TRUE" +\kcode{export OMP_NUM_THREADS=}"2,4" +\kcode{export OMP_PLACES=}"{0,2,4,6},{1,3,5,7}" +\kcode{export OMP_AFFINITY_FORMAT=}"nest_level= %L, parent_thrd_num= %a, +thrd_num= %n, thrd_affinity= %A" +\end{boxeducode} where the numbers correspond to core ids for the system. Note, \kcode{OMP_DISPLAY_AFFINITY} is not set and is \vcode{FALSE} by default. This example shows how to use API routines to diff --git a/data_environment/associate.tex b/data_environment/associate.tex index 3b75b3f..7cc7953 100644 --- a/data_environment/associate.tex +++ b/data_environment/associate.tex @@ -1,6 +1,6 @@ %\pagebreak +\begin{fortranspecific}[4ex] \section{Fortran \bcode{ASSOCIATE} Construct} -\fortranspecificstart \label{sec:associate} \index{ASSOCIATE construct, Fortran@\bcode{ASSOCIATE} construct, Fortran} @@ -12,14 +12,13 @@ name \ucode{b} is associated with the shared variable \ucode{a}. With the predet attribute rule, the associate name \ucode{b} is not allowed to be specified on the \kcode{private} clause. -\pagebreak +%\pagebreak \fnexample[4.0]{associate}{1} In next example, within the \kcode{parallel} construct, the association name \ucode{thread_id} is associated with the private copy of \ucode{i}. The print statement should output the unique thread number. -\topmarker{Fortran} \fnexample[4.0]{associate}{2} The following example illustrates the effect of specifying a selector name on a data-sharing @@ -30,9 +29,10 @@ The association between \ucode{u} and the original \ucode{v} is retained (see th Attribute Rules} section in the OpenMP 4.0 API Specification). 
Inside the \kcode{parallel} region, \ucode{v} has the value of -1 and \ucode{u} has the value of the original \ucode{v}. +\topmarker{Fortran} \ffreenexample[4.0]{associate}{3} -\topmarker{Fortran} +%\topmarker{Fortran} \label{sec:associate_target} \bigskip @@ -63,5 +63,5 @@ an explicit mapping for the same \kcode{target} construct, hence the code block is non-conforming. \ffreenexample[5.1]{associate}{4} -\fortranspecificend +\end{fortranspecific} diff --git a/data_environment/carrays_fpriv.tex b/data_environment/carrays_fpriv.tex index 607ff0d..ebe268a 100644 --- a/data_environment/carrays_fpriv.tex +++ b/data_environment/carrays_fpriv.tex @@ -1,6 +1,6 @@ %\pagebreak +\begin{ccppspecific}[4ex] \section{C/C++ Arrays in a \kcode{firstprivate} Clause} -\ccppspecificstart \label{sec:carrays_fpriv} \index{clauses!firstprivate@\kcode{firstprivate}} \index{firstprivate clause@\kcode{firstprivate} clause!C/C++ arrays in} @@ -34,6 +34,6 @@ array is assigned to the corresponding element of the new array. Those of pointe type are initialized as if by assignment from the original item to the new item. \cnexample{carrays_fpriv}{1} -\ccppspecificend +\end{ccppspecific} diff --git a/data_environment/copyprivate.tex b/data_environment/copyprivate.tex index 463995a..f37a90c 100644 --- a/data_environment/copyprivate.tex +++ b/data_environment/copyprivate.tex @@ -42,7 +42,7 @@ that \kcode{parallel} region. \cexample{copyprivate}{3} -\fortranspecificstart +\begin{fortranspecific} \fnexample{copyprivate}{3} Note that the effect of the \kcode{copyprivate} clause on a variable with the @@ -52,6 +52,6 @@ the pointer \ucode{B} is copied (as if by pointer assignment) to the correspondi list items in the other implicit tasks belonging to the \kcode{parallel} region. 
\fnexample{copyprivate}{4} -\fortranspecificend +\end{fortranspecific} diff --git a/data_environment/cpp_reference.tex b/data_environment/cpp_reference.tex index e77c0a9..5187b32 100644 --- a/data_environment/cpp_reference.tex +++ b/data_environment/cpp_reference.tex @@ -1,5 +1,5 @@ +\begin{cppspecific}[4ex] \section{C++ Reference in Data-Sharing Clauses} -\cppspecificstart \label{sec:cpp_reference} \index{clauses!data-sharing, C++ reference in} \index{data-sharing clauses, C++ reference in} @@ -13,4 +13,4 @@ Additionally it shows how the data-sharing of formal arguments with a C++ refere \cppnexample[4.5]{cpp_reference}{1} -\cppspecificend +\end{cppspecific} diff --git a/data_environment/default_none.tex b/data_environment/default_none.tex index da244c4..30b148c 100644 --- a/data_environment/default_none.tex +++ b/data_environment/default_none.tex @@ -7,14 +7,14 @@ The following example distinguishes the variables that are affected by the \kcode{default(none)} clause from those that are not. -\ccppspecificstart +\begin{ccppspecific} Beginning with OpenMP 4.0, variables with \bcode{const}-qualified type and no mutable member are no longer predetermined shared. Thus, these variables (variable \ucode{c} in the example) need to be explicitly listed in data-sharing attribute clauses when the \kcode{default(none)} clause is specified. 
\cnexample{default_none}{1} -\ccppspecificend +\end{ccppspecific} \fexample{default_none}{1} diff --git a/data_environment/fort_loopvar.tex b/data_environment/fort_loopvar.tex index 52af4bb..8a0f2b7 100644 --- a/data_environment/fort_loopvar.tex +++ b/data_environment/fort_loopvar.tex @@ -1,7 +1,7 @@ %\pagebreak +\begin{fortranspecific}[4ex] \section{Fortran Private Loop Iteration Variables} \label{sec:fort_loopvar} -\fortranspecificstart \index{loop variables, Fortran} In general loop iteration variables will be private, when used in the \plc{do-loop} @@ -21,5 +21,5 @@ example: Note however that the use of shared loop iteration variables can easily lead to race conditions. -\fortranspecificend +\end{fortranspecific} diff --git a/data_environment/fort_sa_private.tex b/data_environment/fort_sa_private.tex index 2c3af7f..0599bbc 100644 --- a/data_environment/fort_sa_private.tex +++ b/data_environment/fort_sa_private.tex @@ -1,4 +1,5 @@ %\pagebreak +\begin{fortranspecific}[4ex] \section{Fortran Restrictions on Storage Association with the \kcode{private} Clause} \label{sec:fort_sa_private} \index{clauses!private@\kcode{private}} @@ -8,9 +9,9 @@ The following non-conforming examples illustrate the implications of the \kcode{ clause rules with regard to storage association. \pagebreak -\fortranspecificstart \fnexample{fort_sa_private}{1} +\topmarker{Fortran} \fnexample{fort_sa_private}{2} \fnexample{fort_sa_private}{3} @@ -19,5 +20,5 @@ clause rules with regard to storage association. 
\topmarker{Fortran} \fnexample[5.1]{fort_sa_private}{5} -\fortranspecificend +\end{fortranspecific} diff --git a/data_environment/fort_shared_var.tex b/data_environment/fort_shared_var.tex index 57faa54..a20d4e7 100644 --- a/data_environment/fort_shared_var.tex +++ b/data_environment/fort_shared_var.tex @@ -1,6 +1,6 @@ %\pagebreak +\begin{fortranspecific}[4ex] \section{Passing Shared Variable to Procedure in Fortran} -\fortranspecificstart \label{sec:fort_shared_var} \index{clauses!shared@\kcode{shared}} \index{shared clause@\kcode{shared} clause!storage association, Fortran} @@ -41,5 +41,5 @@ not well defined. \topmarker{Fortran} \ffreenexample{fort_shared_var}{1} -\fortranspecificend +\end{fortranspecific} diff --git a/data_environment/fort_sp_common.tex b/data_environment/fort_sp_common.tex index 1fbebe7..28dae4d 100644 --- a/data_environment/fort_sp_common.tex +++ b/data_environment/fort_sp_common.tex @@ -1,6 +1,6 @@ %\pagebreak +\begin{fortranspecific}[4ex] \section{Fortran Restrictions on \kcode{shared} and \kcode{private} Clauses with Common Blocks} -\fortranspecificstart \label{sec:fort_sp_common} \index{clauses!private@\kcode{private}} \index{clauses!shared@\kcode{shared}} @@ -35,6 +35,6 @@ The following example is non-conforming because a common block may not be declar both shared and private: \fnexample{fort_sp_common}{5} -\fortranspecificend +\end{fortranspecific} diff --git a/data_environment/induction.tex b/data_environment/induction.tex new file mode 100644 index 0000000..61845e3 --- /dev/null +++ b/data_environment/induction.tex @@ -0,0 +1,67 @@ +%\pagebreak + +\section{Induction} +\label{sec:induction} + +This section covers ways to perform inductions in \kcode{distribute}, worksharing-loop, \kcode{taskloop}, and SIMD regions. 
+ +\subsection{\kcode{induction} Clause} +\label{subsec:induction} +\index{clauses!induction@\kcode{induction}} +\index{induction clause@\kcode{induction} clause} +\index{inductions!induction clause@\kcode{induction} clause} +\index{inductions!closed form} + +The following example demonstrates the basic use of the \kcode{induction} clause +in Case 1 for variable \ucode{xi} in a loop in routine \ucode{comp_poly} to +evaluate the polynomial of variable \ucode{x}. +For this case, the induction operation is performed +with the inductor `\scode{*}' and induction step \ucode{x}. +The intermediate value of \ucode{xi} is used in producing +the reduction sum \ucode{result}. +The last value of \ucode{xi} is well defined after the loop and +is printed out together with the final value of \ucode{result}. +An alternative approach is to use an \plc{inscan} reduction +as illustrated in Case 2, but this may not be as efficient as Case 1. +Equivalent code without the \kcode{induction} clause is given in Case 3 +where a non-recursive closed form of the induction operation is used to +compute the intermediate value of \ucode{xi}. +The last value of \ucode{xi} is returned with the \kcode{lastprivate} clause +for this case.
+ +\cexample[6.0]{induction}{1} + +\ffreeexample[6.0]{induction}{1} + +\subsection{User-defined Induction} +\label{subsec:user-defined-induction} + +\index{directives!declare induction@\kcode{declare induction}} +\index{declare induction directive@\kcode{declare induction} directive} +\index{inductions!declare induction directive@\kcode{declare induction} directive} +\index{inductions!inductor clause@\kcode{inductor} clause} +\index{inductions!collector clause@\kcode{collector} clause} +\index{inductions!user-defined} +\index{OpenMP variable identifiers!omp_var@\kcode{omp_var}} +\index{OpenMP variable identifiers!omp_step@\kcode{omp_step}} +\index{OpenMP variable identifiers!omp_idx@\kcode{omp_idx}} + +The following is a user-defined induction example that uses the +\kcode{declare induction} directive and the \kcode{induction} clause. +The example processes in parallel $N$ points along a line of a given slope +starting from a given point, and where adjacent points are separated by +a fixed distance. +The induction variable \ucode{P} represents a point, and +the step expression is the distance. The induction identifier \ucode{next} +is defined in the \kcode{declare induction} directive with an +appropriate \plc{inductor} via the \kcode{inductor} clause and +\plc{collector} via the \kcode{collector} clause. +This identifier together with the \kcode{step(\ucode{Separation})} +modifier is specified in the \kcode{induction} clause +for the \kcode{parallel for}/\kcode{do} construct +in routine \ucode{processPointsInLine}. 
+ +\cppexample[6.0]{induction}{2} + +\ffreeexample[6.0]{induction}{2} + diff --git a/data_environment/reduction.tex b/data_environment/reduction.tex index ceeebcb..8fc2678 100644 --- a/data_environment/reduction.tex +++ b/data_environment/reduction.tex @@ -26,7 +26,7 @@ written as follows: \cexample{reduction}{2} -\fortranspecificstart +\begin{fortranspecific} \ffreenexample{reduction}{2} The following program is non-conforming because the reduction is on the @@ -47,7 +47,7 @@ The following conforming program performs the reduction using to \ucode{MIN}. \ffreenexample{reduction}{5} -\fortranspecificend +\end{fortranspecific} %\pagebreak The following example is non-conforming because the initialization (\ucode{a = @@ -395,3 +395,34 @@ without using a worksharing-loop construct. \ffreeexample[5.1]{scope_reduction}{1} +\subsection{Reduction on Private Variables in a \kcode{parallel} Region} +\label{subsec:priv_reduction} +\index{reduction clause@\kcode{reduction} clause!on private variables} +\index{reduction clause@\kcode{reduction} clause!original(private) modifier@\kcode{original(private)} modifier} + +The following example shows a reduction on a private variable (\ucode{sum_v}) +for an orphaned worksharing loop in routine \ucode{do_red}, +which is called in a \kcode{parallel} region. +At the end of the loop, the private variable of each thread should have the same combined value. +\cexample[6.0]{priv_reduction}{1} +\ffreeexample[6.0]{priv_reduction}{1} + +The following example is a slight modification of the previous example, +in which the \kcode{original(private)} modifier is explicitly specified +for variable \ucode{sum_v} in the \kcode{reduction} clause. +This modifier indicates that variable \ucode{sum_v} is private +for reduction as opposed to shared by default for a variable +passed as a procedure argument. +\cppexample[6.0]{priv_reduction}{2} +\ffreeexample[6.0]{priv_reduction}{2} + +The following example shows the effect of nested constructs with \kcode{reduction} clauses.
+For the \kcode{parallel} construct, the reduction is on the shared variable +\ucode{x}. For the worksharing loop nested inside the \kcode{parallel} +region, the reduction is performed on the private copy of \ucode{x} +for each thread. +With 4 threads assigned for the \kcode{parallel} region +(enforced by the \kcode{strict} modifier in the \kcode{num_threads} clause), +the code should print 40 at the end. +\cexample[6.0]{priv_reduction}{3} +\ffreeexample[6.0]{priv_reduction}{3} diff --git a/data_environment/scan.tex b/data_environment/scan.tex index 4de05bd..17d770b 100644 --- a/data_environment/scan.tex +++ b/data_environment/scan.tex @@ -46,3 +46,20 @@ of the prefix sum \ucode{b[k]} (\ucode{b(k)} in Fortran) for iteration \ucode{k} \cexample[5.0]{scan}{2} \ffreeexample[5.0]{scan}{2} + +In OpenMP 6.0, the \kcode{scan} directive was extended to support +the concept of an \plc{initialization} phase where a private variable +can be set for later use in the \plc{input} phase of +an \plc{exclusive} scan operation. +The following example is a rewrite of the previous exclusive scan +example, which uses the \kcode{scan init_complete} directive to separate +the initialization phase from the other phases of the scan operation. +The private variable \ucode{tmp} is set in the initialization phase +and used later in the input phase to update the prefix sum stored +in variable \ucode{x}. +This case allows the same array \ucode{c} to be used for +both input and output of the scan results. 
+ +\cexample[6.0]{scan}{3} + +\ffreeexample[6.0]{scan}{3} diff --git a/data_environment/sources/induction.1.c b/data_environment/sources/induction.1.c new file mode 100644 index 0000000..6e058ac --- /dev/null +++ b/data_environment/sources/induction.1.c @@ -0,0 +1,48 @@ +/* +* @@name: induction.1 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +#include <stdio.h> +#include <math.h> + +void comp_poly(int N, double x, double c[]) { + // x: input: value of x for which to eval the polynomial + // c[N]: input: the coefficients + double x0 = 1.0; // initial value x^0 == 1 + double xi; // x^i + double result; // accumulator for the result + + // Case 1: induction clause + xi = x0; + result = 0.0; + #pragma omp parallel for reduction(+: result) induction(step(x),*: xi) + for (int i = 0; i < N; i++) { + result += c[i] * xi; + xi *= x; + } + printf("C1: result = %f, xn = %f\n", result, xi); + + // Case 2: inscan reduction + xi = x0; + result = 0.0; + #pragma omp parallel for reduction(+: result) reduction(inscan,*: xi) + for (int i = 0; i < N; i++) { + result += c[i] * xi; + #pragma omp scan exclusive(xi) + xi *= x; + } + printf("C2: result = %f, xn = %f\n", result, xi); + + // Case 3: closed form + result = 0.0; + #pragma omp parallel for reduction(+: result) lastprivate(xi) + for (int i = 0; i < N; i++) { + xi = x0 * pow(x, i); // induction operation in closed form + result += c[i] * xi; + xi *= x; + } + printf("C3: result = %f, xn = %f\n", result, xi); +} diff --git a/data_environment/sources/induction.1.f90 b/data_environment/sources/induction.1.f90 new file mode 100644 index 0000000..23bc670 --- /dev/null +++ b/data_environment/sources/induction.1.f90 @@ -0,0 +1,48 @@ +! @@name: induction.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_6.0 +subroutine comp_poly(N, x, c) + implicit none + ! x: input: value of x for which to eval the polynomial + !
c(N): input: the coefficients + integer :: N + double precision :: x, c(*) + + double precision :: x0 = 1.0 ! initial value x^0 == 1 + double precision :: xi ! x^i + double precision :: result ! accumulator for the result + integer :: i + + !! Case 1: induction clause + xi = x0 + result = 0.0 + !$omp parallel do reduction(+: result) induction(step(x),*: xi) + do i = 1, N + result = result + c(i) * xi + xi = xi * x + end do + print *, 'C1: result =', result, ', xn =', xi + + !! Case 2: inscan reduction + xi = x0 + result = 0.0 + !$omp parallel do reduction(+: result) reduction(inscan,*: xi) + do i = 1, N + result = result + c(i) * xi + !$omp scan exclusive(xi) + xi = xi * x + end do + print *, 'C2: result =', result, ', xn =', xi + + !! Case 3: closed form + result = 0.0 + !$omp parallel do reduction(+: result) lastprivate(xi) + do i = 1, N + xi = x0 * (x ** (i-1)) ! induction operation in closed form + result = result + c(i) * xi + xi = xi * x + end do + print *, 'C3: result =', result, ', xn =', xi +end subroutine diff --git a/data_environment/sources/induction.2.cpp b/data_environment/sources/induction.2.cpp new file mode 100644 index 0000000..ec5af77 --- /dev/null +++ b/data_environment/sources/induction.2.cpp @@ -0,0 +1,47 @@ +/* +* @@name: induction.2 +* @@type: C++ +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +#include + +class Point { + float x, y, m; + char color; +public: + Point(float x, float y, float m) : x(x), y(y), m(m) { + color = (int)(x+y) % 256; + } + Point nextPoint(float distance) { + // return a Point that is 'distance' away along slope m + // in the x direction + float deltaX = distance/(sqrtf(1.0f + m * m)); + float deltaY = m * deltaX; + Point NewPoint(x+deltaX, y+deltaY, m); + return NewPoint; + } +}; + +#pragma omp declare induction(next : (Point, float)) \ + inductor (omp_var = omp_var.nextPoint(omp_step)) \ + collector(omp_step * omp_idx) + +extern void process(Point P); + +void processPointsInLine(Point Start, 
int NumberOfPoints, + float Separation) { + Point P = Start; + #pragma omp parallel for induction(step(Separation), next : P) + for (int i = 0; i < NumberOfPoints; ++i) { + process(P); + P = P.nextPoint(Separation); + } +} + +int main() { + Point Start(1.0f, -2.0f, 0.5f); + processPointsInLine(Start, 100, 0.25f); + return 0; +} diff --git a/data_environment/sources/induction.2.f90 b/data_environment/sources/induction.2.f90 new file mode 100644 index 0000000..4e96c52 --- /dev/null +++ b/data_environment/sources/induction.2.f90 @@ -0,0 +1,66 @@ +! @@name: induction.2 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_6.0 +module udi + integer, parameter :: I2 = selected_int_kind(3) ! enough for 256 + type Point + real x, y, m + integer(I2) color + contains + procedure initPoint, nextPoint + end type + + !$omp declare induction(next : (Point, real)) & + !$omp& inductor (omp_var = omp_var%nextPoint(omp_step)) & + !$omp& collector(omp_step * omp_idx) + + contains + subroutine initPoint(this, x1, y1, m1) + implicit none + class(Point) this + real x1, y1, m1 + this%x = x1; this%y = y1; this%m = m1 + this%color = mod(int(x1+y1), 256) + end subroutine + + function nextPoint(this, distance) result(NewPoint) + ! 
return a Point that is 'distance' away along slope m in the x direction + implicit none + class(Point) this + real distance + type(Point) NewPoint + + real deltaX, deltaY + deltaX = distance/(sqrt(1.0 + this%m * this%m)) + deltaY = this%m * deltaX + call NewPoint%initPoint(this%x+deltaX, this%y+deltaY, this%m) + end function +end module + +subroutine processPointsInLine(Start, NumberOfPoints, Separation) + use udi + implicit none + type(Point) Start + integer NumberOfPoints + real Separation + type(Point) P + integer i + + P = Start + !$omp parallel do induction(step(Separation), next : P) + do i = 1, NumberOfPoints + call process(P) + P = P%nextPoint(Separation) + end do +end subroutine + +program main + use udi + implicit none + type(Point) Start + + call Start%initPoint(1.0, -2.0, 0.5) + call processPointsInLine(Start, 100, 0.25) +end program diff --git a/data_environment/sources/priv_reduction.1.c b/data_environment/sources/priv_reduction.1.c new file mode 100644 index 0000000..296ceaf --- /dev/null +++ b/data_environment/sources/priv_reduction.1.c @@ -0,0 +1,35 @@ +/* +* @@name: priv_reduction.1 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_6.0 +*/ +#include <stdio.h> +#include <omp.h> +#define N 100 + +int do_red(int n, int *v) +{ + int sum_v = 0; // sum_v is private + + #pragma omp for reduction(+: sum_v) + for (int i = 0; i < n; i++) { + sum_v += v[i]; + } + return sum_v; +} + +int main(void) +{ + int v[N]; + for (int i = 0; i < N; i++) + v[i] = i; + + #pragma omp parallel + { + int s_v = do_red(N, v); + printf("myid %d: sum of v = %d\n", omp_get_thread_num(), s_v); + } + return 0; +} diff --git a/data_environment/sources/priv_reduction.1.f90 b/data_environment/sources/priv_reduction.1.f90 new file mode 100644 index 0000000..ac3acfb --- /dev/null +++ b/data_environment/sources/priv_reduction.1.f90 @@ -0,0 +1,35 @@ +! @@name: priv_reduction.1 +! @@type: F-free +! @@operation: run +! @@expect: success +!
@@version: omp_6.0 +function do_red(n, v) result(sum_v) + implicit none + integer :: n, v(*) + integer :: sum_v ! sum_v is private + integer :: i + + sum_v = 0 + !$omp do reduction(+: sum_v) + do i = 1, n + sum_v = sum_v + v(i) + end do +end function + +program priv_red + use :: omp_lib, only : omp_get_thread_num + implicit none + integer, parameter :: N = 100 + integer :: i, v(N), s_v + integer, external :: do_red + + do i = 1, N + v(i) = i - 1 + end do + + !$omp parallel private(s_v) + s_v = do_red(N, v) + print 10, omp_get_thread_num(), s_v + 10 format("myid ", i0, ": sum of v = ", i0) + !$omp end parallel +end program diff --git a/data_environment/sources/priv_reduction.2.cpp b/data_environment/sources/priv_reduction.2.cpp new file mode 100644 index 0000000..cb55ae2 --- /dev/null +++ b/data_environment/sources/priv_reduction.2.cpp @@ -0,0 +1,34 @@ +/* +* @@name: priv_reduction.2 +* @@type: C++ +* @@operation: run +* @@expect: success +* @@version: omp_6.0 +*/ +#include +#include +#define N 100 + +void do_red(int n, int *v, int &sum_v) +{ + sum_v = 0; // sum_v is private + #pragma omp for reduction(original(private),+: sum_v) + for (int i = 0; i < n; i++) { + sum_v += v[i]; + } +} + +int main(void) +{ + int v[N]; + for (int i = 0; i < N; i++) + v[i] = i; + + #pragma omp parallel + { + int s_v; // s_v is private + do_red(N, v, s_v); + printf("myid %d: sum of v = %d\n", omp_get_thread_num(), s_v); + } + return 0; +} diff --git a/data_environment/sources/priv_reduction.2.f90 b/data_environment/sources/priv_reduction.2.f90 new file mode 100644 index 0000000..6d42678 --- /dev/null +++ b/data_environment/sources/priv_reduction.2.f90 @@ -0,0 +1,34 @@ +! @@name: priv_reduction.2 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_6.0 +subroutine do_red(n, v, sum_v) + implicit none + integer :: n, v(*) + integer :: sum_v + integer :: i + + sum_v = 0 ! 
sum_v is private + !$omp do reduction(original(private),+: sum_v) + do i = 1, n + sum_v = sum_v + v(i) + end do +end subroutine + +program priv_red + use :: omp_lib, only : omp_get_thread_num + implicit none + integer, parameter :: N = 100 + integer :: i, v(N), s_v + + do i = 1, N + v(i) = i - 1 + end do + + !$omp parallel private(s_v) + call do_red(N, v, s_v) + print 10, omp_get_thread_num(), s_v + 10 format("myid ", i0, ": sum of v = ", i0) + !$omp end parallel +end program diff --git a/data_environment/sources/priv_reduction.3.c b/data_environment/sources/priv_reduction.3.c new file mode 100644 index 0000000..27ea159 --- /dev/null +++ b/data_environment/sources/priv_reduction.3.c @@ -0,0 +1,24 @@ +/* +* @@name: priv_reduction.3 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_6.0 +*/ +#include <stdio.h> + +int main(void) +{ + int x; + + x = 0; + // parallel reduction on shared x + #pragma omp parallel reduction(+: x) num_threads(strict: 4) + { + #pragma omp for reduction(+: x) // reduction on private x + for (int i = 0; i < 10; i++) + x++; + } + printf("x = %d\n", x); // should print 40, with 4 threads + return 0; +} diff --git a/data_environment/sources/priv_reduction.3.f90 b/data_environment/sources/priv_reduction.3.f90 new file mode 100644 index 0000000..36bdedf --- /dev/null +++ b/data_environment/sources/priv_reduction.3.f90 @@ -0,0 +1,20 @@ +! @@name: priv_reduction.3 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_6.0 +program nest_red + implicit none + integer :: x, i + + x = 0 + ! parallel reduction on shared x + !$omp parallel reduction(+: x) num_threads(strict: 4) + !$omp do reduction(+: x) ! reduction on private x + do i = 1, 10 + x = x + 1 + end do + !$omp end do + !$omp end parallel + print *, "x =", x !
should print 40, with 4 threads +end program diff --git a/data_environment/sources/scan.3.c b/data_environment/sources/scan.3.c new file mode 100644 index 0000000..e301310 --- /dev/null +++ b/data_environment/sources/scan.3.c @@ -0,0 +1,40 @@ +/* +* @@name: scan.3 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_6.0 +*/ +#include <stdio.h> +#define N 100 + +int main(void) +{ + int c[N], tmp; + int x = 0; + + // initialization + for (int k = 0; k < N; k++) + c[k] = k + 1; + + // c[k] is used for both input and output of scan results + #pragma omp parallel for simd reduction(inscan,+: x) private(tmp) + for (int k = 0; k < N; k++) { + // initialization phase + tmp = c[k]; + #pragma omp scan init_complete + + // scan (output) phase - cannot use tmp here + c[k] = x; + + #pragma omp scan exclusive(x) + + // input phase - can use tmp here + x += tmp; + } + + printf("x = %d, c[0:3] = %d %d %d\n", x, c[0], c[1], c[2]); + // 5050, 0 1 3 + + return 0; +} diff --git a/data_environment/sources/scan.3.f90 b/data_environment/sources/scan.3.f90 new file mode 100644 index 0000000..4b753a1 --- /dev/null +++ b/data_environment/sources/scan.3.f90 @@ -0,0 +1,37 @@ +! @@name: scan.3 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_6.0 +program exclusive_scan + implicit none + integer, parameter :: n = 100 + integer c(n), tmp + integer x, k + + ! initialization + x = 0 + do k = 1, n + c(k) = k + end do + + ! c(k) is used for both input and output of scan results + !$omp parallel do simd reduction(inscan,+: x) private(tmp) + do k = 1, n + ! initialization phase + tmp = c(k) + !$omp scan init_complete + + ! scan (output) phase - cannot use tmp here + c(k) = x + + !$omp scan exclusive(x) + + ! input phase - can use tmp here + x = x + tmp + end do + + print *,'x =', x, ', c(1:3) =', c(1:3) + !
5050, 0 1 3 + +end program diff --git a/data_environment/sources/udr.1.c b/data_environment/sources/udr.1.c index a2fc5c6..11a0c58 100644 --- a/data_environment/sources/udr.1.c +++ b/data_environment/sources/udr.1.c @@ -3,7 +3,7 @@ * @@type: C * @@operation: compile * @@expect: success -* @@version: omp_4.0 +* @@version: omp_6.0 */ #include #include @@ -25,12 +25,12 @@ void maxproc ( struct point *out, struct point *in ) if ( in->y > out->y ) out->y = in->y; } -#pragma omp declare reduction(min : struct point : \ - minproc(&omp_out, &omp_in)) \ +#pragma omp declare reduction(min : struct point) \ + combiner( minproc(&omp_out, &omp_in) ) \ initializer( omp_priv = { INT_MAX, INT_MAX } ) -#pragma omp declare reduction(max : struct point : \ - maxproc(&omp_out, &omp_in)) \ +#pragma omp declare reduction(max : struct point) \ + combiner( maxproc(&omp_out, &omp_in) ) \ initializer( omp_priv = { 0, 0 } ) void find_enclosing_rectangle ( int n, struct point points[] ) diff --git a/data_environment/sources/udr.1.f90 b/data_environment/sources/udr.1.f90 index 7c21720..8c786d4 100644 --- a/data_environment/sources/udr.1.f90 +++ b/data_environment/sources/udr.1.f90 @@ -2,7 +2,7 @@ ! @@type: F-free ! @@operation: compile ! @@expect: success -! @@version: omp_4.0 +! 
@@version: omp_6.0 module data_type type :: point @@ -18,10 +18,12 @@ subroutine find_enclosing_rectangle ( n, points ) integer :: n type(point) :: points(*) - !$omp declare reduction(min : point : minproc(omp_out, omp_in)) & + !$omp declare reduction(min : point) & + !$omp& combiner( minproc(omp_out, omp_in) ) & !$omp& initializer( omp_priv = point( HUGE(0), HUGE(0) ) ) - !$omp declare reduction(max : point : maxproc(omp_out, omp_in)) & + !$omp declare reduction(max : point) & + !$omp& combiner( maxproc(omp_out, omp_in) ) & !$omp& initializer( omp_priv = point( 0, 0 ) ) type(point) :: minp = point( HUGE(0), HUGE(0) ), maxp = point( 0, 0 ) diff --git a/data_environment/sources/udr.2.c b/data_environment/sources/udr.2.c index 7fbbedd..a4a9ac5 100644 --- a/data_environment/sources/udr.2.c +++ b/data_environment/sources/udr.2.c @@ -3,7 +3,7 @@ * @@type: C * @@operation: compile * @@expect: success -* @@version: omp_4.0 +* @@version: omp_6.0 */ #include #include @@ -13,15 +13,15 @@ struct point { int y; }; -#pragma omp declare reduction(min : struct point : \ - omp_out.x = omp_in.x > omp_out.x ? omp_out.x : omp_in.x, \ - omp_out.y = omp_in.y > omp_out.y ? omp_out.y : omp_in.y ) \ - initializer( omp_priv = { INT_MAX, INT_MAX } ) +#pragma omp declare reduction(min : struct point) \ + combiner( omp_out.x = omp_in.x > omp_out.x ? omp_out.x : omp_in.x, \ + omp_out.y = omp_in.y > omp_out.y ? omp_out.y : omp_in.y ) \ + initializer( omp_priv = { INT_MAX, INT_MAX } ) -#pragma omp declare reduction(max : struct point : \ - omp_out.x = omp_in.x < omp_out.x ? omp_out.x : omp_in.x, \ - omp_out.y = omp_in.y < omp_out.y ? omp_out.y : omp_in.y ) \ - initializer( omp_priv = { 0, 0 } ) +#pragma omp declare reduction(max : struct point) \ + combiner( omp_out.x = omp_in.x < omp_out.x ? omp_out.x : omp_in.x, \ + omp_out.y = omp_in.y < omp_out.y ? 
omp_out.y : omp_in.y ) \ + initializer( omp_priv = { 0, 0 } ) void find_enclosing_rectangle ( int n, struct point points[] ) { diff --git a/data_environment/sources/udr.2.f90 b/data_environment/sources/udr.2.f90 index f334b25..2705d5e 100644 --- a/data_environment/sources/udr.2.f90 +++ b/data_environment/sources/udr.2.f90 @@ -2,7 +2,7 @@ ! @@type: F-free ! @@operation: compile ! @@expect: success -! @@version: omp_4.0 +! @@version: omp_6.0 module data_type type :: point @@ -18,14 +18,14 @@ subroutine find_enclosing_rectangle ( n, points ) integer :: n type(point) :: points(*) - !$omp declare reduction( min : point : & - !$omp& omp_out = point(min( omp_out%x, omp_in%x ), & - !$omp& min( omp_out%y, omp_in%y )) ) & + !$omp declare reduction( min : point ) & + !$omp& combiner( omp_out = point(min( omp_out%x, omp_in%x ), & + !$omp& min( omp_out%y, omp_in%y )) ) & !$omp& initializer( omp_priv = point( HUGE(0), HUGE(0) ) ) - !$omp declare reduction( max : point : & - !$omp& omp_out = point(max( omp_out%x, omp_in%x ), & - !$omp& max( omp_out%y, omp_in%y )) ) & + !$omp declare reduction( max : point ) & + !$omp& combiner( omp_out = point(max( omp_out%x, omp_in%x ), & + !$omp& max( omp_out%y, omp_in%y )) ) & !$omp& initializer( omp_priv = point( 0, 0 ) ) type(point) :: minp = point( HUGE(0), HUGE(0) ), maxp = point( 0, 0 ) diff --git a/data_environment/sources/udr.3.c b/data_environment/sources/udr.3.c index aee8efc..17e3d47 100644 --- a/data_environment/sources/udr.3.c +++ b/data_environment/sources/udr.3.c @@ -3,7 +3,7 @@ * @@type: C * @@operation: run * @@expect: success -* @@version: omp_4.0 +* @@version: omp_6.0 */ #include #define N 100 @@ -18,9 +18,9 @@ struct mx_s { void mx_combine(struct mx_s *out, struct mx_s *in); void mx_init(struct mx_s *priv, struct mx_s *orig); -#pragma omp declare reduction(maxloc: struct mx_s: \ - mx_combine(&omp_out, &omp_in)) \ - initializer(mx_init(&omp_priv, &omp_orig)) +#pragma omp declare reduction(maxloc: struct mx_s) \ + combiner( 
mx_combine(&omp_out, &omp_in) ) \ + initializer( mx_init(&omp_priv, &omp_orig) ) void mx_combine(struct mx_s *out, struct mx_s *in) { diff --git a/data_environment/sources/udr.3.f90 b/data_environment/sources/udr.3.f90 index 412d5a9..e6831f6 100644 --- a/data_environment/sources/udr.3.f90 +++ b/data_environment/sources/udr.3.f90 @@ -2,7 +2,7 @@ ! @@type: F-free ! @@operation: run ! @@expect: success -! @@version: omp_4.0 +! @@version: omp_6.0 program max_loc implicit none type :: mx_s @@ -10,9 +10,9 @@ program max_loc integer index end type - !$omp declare reduction(maxloc: mx_s: & - !$omp& mx_combine(omp_out, omp_in)) & - !$omp& initializer(mx_init(omp_priv, omp_orig)) + !$omp declare reduction(maxloc: mx_s) & + !$omp& combiner( mx_combine(omp_out, omp_in) ) & + !$omp& initializer( mx_init(omp_priv, omp_orig) ) integer, parameter :: N = 100 type(mx_s) :: mx diff --git a/data_environment/sources/udr.4.f90 b/data_environment/sources/udr.4.f90 index f136b39..5d4ad89 100644 --- a/data_environment/sources/udr.4.f90 +++ b/data_environment/sources/udr.4.f90 @@ -2,7 +2,7 @@ ! @@type: F-free ! @@operation: run ! @@expect: success -! @@version: omp_4.0 +! @@version: omp_6.0 module data_red ! Declare data type. type dt @@ -16,8 +16,9 @@ module data_red end interface ! Declare the user-defined reduction operator .add. -!$omp declare reduction(.add.:dt:omp_out=omp_out.add.omp_in) & -!$omp& initializer(dt_init(omp_priv)) +!$omp declare reduction(.add. : dt) & +!$omp& combiner( omp_out=omp_out.add.omp_in ) & +!$omp& initializer( dt_init(omp_priv) ) contains ! Declare the initialization routine. 
diff --git a/data_environment/sources/udr.5.cpp b/data_environment/sources/udr.5.cpp index bde4dee..af03272 100644 --- a/data_environment/sources/udr.5.cpp +++ b/data_environment/sources/udr.5.cpp @@ -3,7 +3,7 @@ * @@type: C++ * @@operation: compile * @@expect: success -* @@version: omp_4.0 +* @@version: omp_6.0 */ class V { float *p; @@ -16,6 +16,6 @@ public: V& operator+= ( const V& ); - #pragma omp declare reduction( + : V : omp_out += omp_in ) \ + #pragma omp declare reduction( + : V ) combiner( omp_out += omp_in ) \ initializer(omp_priv(omp_orig)) }; diff --git a/data_environment/sources/udr.6.cpp b/data_environment/sources/udr.6.cpp index 76a7b52..87b1d4d 100644 --- a/data_environment/sources/udr.6.cpp +++ b/data_environment/sources/udr.6.cpp @@ -3,18 +3,19 @@ * @@type: C++ * @@operation: view * @@expect: unspecified -* @@version: omp_4.0 +* @@version: omp_6.0 */ #include #include #include -#pragma omp declare reduction( + : std::vector : \ - std::transform (omp_out.begin(), omp_out.end(), \ - omp_in.begin(), omp_in.end(),std::plus())) +#pragma omp declare reduction( + : std::vector ) \ + combiner( std::transform (omp_out.begin(), omp_out.end(), \ + omp_in.begin(), omp_in.end(),std::plus()) ) -#pragma omp declare reduction( merge : std::vector : \ - omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) +#pragma omp declare reduction( merge : std::vector ) \ + combiner( omp_out.insert(omp_out.end(), omp_in.begin(), \ + omp_in.end()) ) -#pragma omp declare reduction( merge : std::list : \ - omp_out.merge(omp_in)) +#pragma omp declare reduction( merge : std::list ) \ + combiner( omp_out.merge(omp_in) ) diff --git a/data_environment/threadprivate.tex b/data_environment/threadprivate.tex index 1bc19b2..1ceb6b0 100644 --- a/data_environment/threadprivate.tex +++ b/data_environment/threadprivate.tex @@ -12,7 +12,7 @@ The following examples demonstrate how to use the \kcode{threadprivate} directiv \fexample{threadprivate}{1} \pagebreak -\ccppspecificstart 
+\begin{ccppspecific} The following example uses \kcode{threadprivate} on a static variable: \cnexample{threadprivate}{2} @@ -26,12 +26,12 @@ region could be either 1 or 2. This problem is avoided for \ucode{b}, which uses an auxiliary \bcode{const} variable and a copy-constructor. \cppnexample{threadprivate}{3} -\ccppspecificend +\end{ccppspecific} The following examples show non-conforming uses and correct uses of the \kcode{threadprivate} directive. -\fortranspecificstart +\begin{fortranspecific} The following example is non-conforming because the common block is not declared local to the subroutine that refers to it: @@ -84,9 +84,9 @@ The following is an example of the use of \kcode{threadprivate} for module varia \topmarker{Fortran} \fnexample{threadprivate}{6} -\fortranspecificend +\end{fortranspecific} -\cppspecificstart +\begin{cppspecific} The following example illustrates initialization of \kcode{threadprivate} variables for class-type \ucode{T}. \ucode{t1} is default constructed, \ucode{t2} is constructed taking a constructor accepting one argument of integer type, \ucode{t3} is copy @@ -99,5 +99,5 @@ class members. The \kcode{threadprivate} directive for a static class member mus be placed inside the class definition. \cppnexample{threadprivate}{5} -\cppspecificend +\end{cppspecific} diff --git a/data_environment/udr.tex b/data_environment/udr.tex index 210a11c..a9b6b50 100644 --- a/data_environment/udr.tex +++ b/data_environment/udr.tex @@ -28,8 +28,10 @@ the rectangle that encloses a set of 2-D points. Each \kcode{declare reduction} directive defines new reduction identifiers, \ucode{min} and \ucode{max}, to be used in a \kcode{reduction} clause. The next item in the -declaration list is the data type (\ucode{struct point}) used in the reduction, -followed by the combiner, here the functions \ucode{minproc} and \ucode{maxproc} perform +declaration list is the data type (\ucode{struct point}) used in the +reduction. 
+The \kcode{combiner} clause specifies the functions \ucode{minproc} and +\ucode{maxproc} to perform the min and max operations, respectively, on the user data (of type \ucode{struct point}). In the function argument list are two special OpenMP variable identifiers, \kcode{omp_in} and \kcode{omp_out}, that denote the two values to be combined in the ``real'' function; @@ -39,7 +41,7 @@ The initializer of the \kcode{declare reduction} directive specifies the initial value for the private variable of each implicit task. The \kcode{omp_priv} identifier is used to denote the private variable. -\cexample[4.0]{udr}{1} +\cexample[6.0]{udr}{1} %\clearpage The following example shows the corresponding code in Fortran. @@ -47,49 +49,53 @@ The \kcode{declare reduction} directives are specified as part of the declaration in subroutine \ucode{find_enclosing_rectangle} and the procedures that perform the min and max operations are specified as subprograms. -\ffreeexample[4.0]{udr}{1} +\ffreeexample[6.0]{udr}{1} The following example shows the same computation as \example{udr.1} but it illustrates that you can craft complex expressions in the user-defined reduction declaration. In this case, instead of calling the \ucode{minproc} and \ucode{maxproc} functions we inline the code in a single expression. -\cexample[4.0]{udr}{2} +\cexample[6.0]{udr}{2} The corresponding code of the same example in Fortran is very similar -except that the assignment expression in the \kcode{declare reduction} +except that the assignment expression in the \kcode{combiner} clause for +the \kcode{declare reduction} directive can only be used for a single variable, in this case through a type structure constructor \ucode{point($\ldots$)}. 
-\ffreeexample[4.0]{udr}{2} +\ffreeexample[6.0]{udr}{2} \index{OpenMP variable identifiers!omp_orig@\kcode{omp_orig}} The following example shows the use of special variables in arguments for -combiner (\kcode{omp_in} and \kcode{omp_out}) and initializer (\kcode{omp_priv} -and \kcode{omp_orig}) routines. This example returns the maximum value of an -array and the corresponding index value. The \kcode{declare reduction} -directive specifies a user-defined reduction operation \ucode{maxloc} for -data type \ucode{struct mx_s}. The function \ucode{mx_combine} is the combiner -and the function \ucode{mx_init} is the initializer. +combiner (\kcode{omp_in} and \kcode{omp_out}) and initializer +(\kcode{omp_priv} and \kcode{omp_orig}) routines. This example returns +the maximum value of an array and the corresponding index value. The +\kcode{declare reduction} directive specifies a user-defined +reduction operation \ucode{maxloc} for data type \ucode{struct mx_s}. +The function \ucode{mx_combine} is the combiner and the function \ucode{mx_init} +is the initializer. -\cexample[4.0]{udr}{3} +\cexample[6.0]{udr}{3} -Below is the corresponding Fortran version of the above example. The -\kcode{declare reduction} directive specifies the user-defined operation -\ucode{maxloc} for user-derived type \ucode{mx_s}. The combiner -\ucode{mx_combine} and the initializer \ucode{mx_init} are specified as -subprograms. +Below is the corresponding Fortran version of the above example. +The \kcode{declare reduction} directive specifies the user-defined +operation \ucode{maxloc} for user-derived type \ucode{mx_s}. +The combiner \ucode{mx_combine} and the initializer \ucode{mx_init} are +specified as subprograms. -\ffreeexample[4.0]{udr}{3} +\ffreeexample[6.0]{udr}{3} The following example explains a few details of the user-defined reduction -in Fortran through modules. The \kcode{declare reduction} directive is declared in a module (\ucode{data_red}). +in Fortran through modules. 
The \kcode{declare reduction} directive +is declared in a module (\ucode{data_red}). The reduction-identifier \ucode{.add.} is a user-defined operator that is to allow accessibility in the scope that performs the reduction operation. -The user-defined operator \ucode{.add.} and the subroutine \ucode{dt_init} specified in the \kcode{initializer} clause are defined in the same subprogram. +The user-defined operator \ucode{.add.} and the subroutine \ucode{dt_init} +specified in the \kcode{initializer} clause are defined in the same subprogram. The reduction operation (that is, the \kcode{reduction} clause) is in the main program. The reduction identifier \ucode{.add.} is accessible by use association. @@ -101,28 +107,27 @@ has the \kcode{initializer} clause, the subroutine specified on the clause must be accessible in the current scoping unit. In this case, the subroutine \ucode{dt_init} is accessible by use association. -\ffreeexample[4.0]{udr}{4} +\ffreeexample[6.0]{udr}{4} The following example uses user-defined reductions to declare a plus (\kcode{+}) -reduction for a C++ class. As the \kcode{declare reduction} directive is inside -the context of the \ucode{V} class the expressions in the \kcode{declare -reduction} directive are resolved in the context of the class. Also, note that -the \kcode{initializer} clause uses a copy constructor to initialize the -private variables of the reduction and it uses as parameter to its original -variable by using the special variable \kcode{omp_orig}. +reduction for a C++ class. As the \kcode{declare reduction} directive +is inside the context of the \ucode{V} class the expressions in the +\kcode{declare reduction} directive are resolved in the context of +the class. Also, note that the \kcode{initializer} clause uses a copy +constructor to initialize the private variables of the reduction and it uses +as parameter to its original variable by using the special variable +\kcode{omp_orig}. 
-\cppexample[4.0]{udr}{5} +\cppexample[6.0]{udr}{5} The following examples shows how user-defined reductions can be defined for -some STL containers. The first \kcode{declare reduction} defines the plus -(\kcode{+}) -operation for \ucode{std::vector} by making use of the -\ucode{std::transform} algorithm. The second and third define the merge -(or concatenation) operation for \ucode{std::vector} and -\ucode{std::list}. -%It shows how the same user-defined reduction operation can be defined to be done differently depending on the specified data type. -It shows how the user-defined reduction operation can be applied to specific data types of an STL. +some STL containers. The first \kcode{declare reduction} defines the +plus (\kcode{+}) operation for \ucode{std::vector} by making use of the +\ucode{std::transform} algorithm. The second and third define the merge (or +concatenation) operation for \ucode{std::vector} and \ucode{std::list}. +It shows how the user-defined reduction operation can be applied to specific +data types of an STL. -\cppexample[4.0]{udr}{6} +\cppexample[6.0]{udr}{6} diff --git a/devices/C++_virtual_functions.tex b/devices/C++_virtual_functions.tex index 5819b5a..874ca72 100644 --- a/devices/C++_virtual_functions.tex +++ b/devices/C++_virtual_functions.tex @@ -1,4 +1,5 @@ %\pagebreak +\begin{cppspecific}[4ex] \section{C++ Virtual Functions} \label{sec:virtual_functions} @@ -31,7 +32,8 @@ That is, the behavior of the implicit map of \ucode{ar} is non-conforming -- its static type \ucode{A} doesn't match its dynamic type \ucode{D}. Hence the behavior of the access to the virtual functions is unspecified. -\cppexample[5.2]{virtual_functions}{1} +\topmarker{C++} +\cppnexample[5.2]{virtual_functions}{1} The second example illustrates the restriction: @@ -47,4 +49,6 @@ In the second case, the object \ucode{ap} is instantiated on the host; access of the next \kcode{target} region is permitted. 
(Unified Shared Memory is used here to minimize mapping concerns.) -\cppexample[5.2]{virtual_functions}{2} +\topmarker{C++} +\cppnexample[5.2]{virtual_functions}{2} +\end{cppspecific} diff --git a/devices/array_shaping.tex b/devices/array_shaping.tex index cfdb1e3..172db59 100644 --- a/devices/array_shaping.tex +++ b/devices/array_shaping.tex @@ -11,7 +11,7 @@ \index{directives!begin declare target@\kcode{begin declare target}} \index{begin declare target directive@\kcode{begin declare target} directive} -\ccppspecificstart +\begin{ccppspecific} A pointer variable can be shaped to a multi-dimensional array to facilitate data access. This is achieved by a \plc{shape-operator} casted in front of a pointer (lvalue expression): @@ -35,13 +35,15 @@ around the shape-operator and \ucode{a} to ensure the correct precedence over array-section operations. \cnexample[5.1]{array_shaping}{1} -\ccppspecificend +\end{ccppspecific} %\clearpage +\begin{fortranspecific} The shape operator is not defined for Fortran. Explicit array shaping of procedure arguments can be used instead to achieve a similar goal. Below is the Fortran equivalent of the above example that illustrates the support of transferring two rows of noncontiguous boundary data in the \kcode{target update} directive. 
-\ffreeexample[5.2]{array_shaping}{1} +\ffreenexample[5.2]{array_shaping}{1} +\end{fortranspecific} diff --git a/devices/async_target_nowait_arg.tex b/devices/async_target_nowait_arg.tex new file mode 100644 index 0000000..5c2a7f9 --- /dev/null +++ b/devices/async_target_nowait_arg.tex @@ -0,0 +1,11 @@ +\subsection{Conditionally Asynchronous \kcode{target} Using the \kcode{nowait} Clause} +\label{subsec:async_target_nowait_arg} +\index{target construct@\kcode{target} construct!nowait clause@\kcode{nowait} clause} +\index{nowait clause@\kcode{nowait} clause} +\index{clauses!nowait@\kcode{nowait}} + +In OpenMP 6.0, \kcode{nowait} takes an OpenMP logical type argument to specify if the generated \plc{task} is an included task or a deferred task. In the following example, the \kcode{nowait} clause is used with an argument on the \kcode{target} directive. In a practical situation, the value of \ucode{is_deferred} can be chosen based on the time taken for some work on host or device that can be performed asynchronously after the target task is scheduled. If the target task is deferred, it must be synchronized by a \kcode{taskwait} before the value of \ucode{x} is used. Prior to 6.0, the same effect would require the use of a \plc{metadirective} or an \bcode{if-else} statement that duplicates the \kcode{target} construct. + +\cexample[6.0]{async_target}{5} + +\ffreeexample[6.0]{async_target}{5} diff --git a/devices/declare_target.tex b/devices/declare_target.tex index f1e5a7c..dcb28d0 100644 --- a/devices/declare_target.tex +++ b/devices/declare_target.tex @@ -95,7 +95,7 @@ end of the affected declarations, as introduced in OpenMP 5.1. The \kcode{begin declare target} directive was defined to symmetrically complement the terminating (``end'') directive. 
-\cppspecificstart +\begin{cppspecific} The example also shows 3 different ways to use a \kcode{declare target} directive for a class and an external member-function definition (for the \ucode{XOR1}, \ucode{XOR2}, @@ -138,10 +138,10 @@ separately and linking them, will create appropriate executable device functions \smallskip \cppnexample[5.1]{declare_target}{2b_main}[1] -%\cppspecificend +%\end{cppspecific} \topmarker{C++} -%\cppspecificstart +%\begin{cppspecific} The following example shows how the \kcode{begin declare target} and \kcode{end declare target} directives are used to enclose the declaration of a variable \ucode{varY} with a class type \ucode{typeY}. %Prior to OpenMP 5.0, the member function \code{typeY::foo()} cannot @@ -157,7 +157,7 @@ and will successfully execute the function on the device. See previous examples %as if it were included in list or block of a declare target directive, \cppnexample[5.1]{declare_target}{2c} -\cppspecificend +\end{cppspecific} \subsection{Declare Target Directive for Variables} \label{subsec:declare_target_variables} diff --git a/devices/device_env_traits.tex b/devices/device_env_traits.tex new file mode 100644 index 0000000..d7ed1c0 --- /dev/null +++ b/devices/device_env_traits.tex @@ -0,0 +1,65 @@ +\pagebreak +\section{Traits for Specifying Devices} +\label{sec:device_env_traits} + +\index{environment variables!OMP_AVAILABLE_DEVICES@\kcode{OMP_AVAILABLE_DEVICES}} +\index{OMP_AVAILABLE_DEVICES@\kcode{OMP_AVAILABLE_DEVICES}} +\index{environment variables!OMP_DEFAULT_DEVICE@\kcode{OMP_DEFAULT_DEVICE}} +\index{OMP_DEFAULT_DEVICE@\kcode{OMP_DEFAULT_DEVICE}} + +Environment variables \kcode{OMP_AVAILABLE_DEVICES} and +\kcode{OMP_DEFAULT_DEVICE} can take traits to specify the available +devices and the default device, respectively. +In addition, \kcode{OMP_DEFAULT_DEVICE} can also take an integer +as a device number to specify the default device. 
+ +The following examples show how traits are used to specify devices +for these environment variables. + +Only GPU non-host devices are available to program: +\begin{boxedcode} +export OMP_AVAILABLE_DEVICES=\ucode{"kind(gpu)"} +\end{boxedcode} + +Order of available devices would be all vendor \ucode{A} GPUs, then +the rest of the non-host devices as specified by "\ucode{*}": +\begin{boxedcode} +export OMP_AVAILABLE_DEVICES=\ucode{"kind(gpu)&&vendor(A),*"} +\end{boxedcode} + +Available devices would be all non-gpu devices from vendor \ucode{A}: +\begin{boxedcode} +export OMP_AVAILABLE_DEVICES=\ucode{"!kind(gpu)&&vendor(A)"} +\end{boxedcode} + +Available devices start with 1 vendor \ucode{A} GPU device, then +2 vendor \ucode{B} GPU devices, and then the rest of the non-host devices: +\begin{boxedcode} +export OMP_AVAILABLE_DEVICES=\ucode{"(kind(gpu)&&vendor(A))[0],} + \ucode{(kind(gpu)&&vendor(B))[0:2],*"} +\end{boxedcode} +The device number range is specified by the C/C++ array section syntax +\ucode{[0:2]} where "\ucode{0}" is the first index and "\ucode{2}" +is the length. + +Three available devices are re-ordered with "\ucode{uid-gpu3}" corresponding +to device 0, "\ucode{uid-gpu2}" to device 1 and "\ucode{uid-gpu1}" +to device 2: +\begin{boxedcode} +export OMP_AVAILABLE_DEVICES=\ucode{"uid(uid-gpu3),uid(uid-gpu2),} + \ucode{uid(uid-gpu1)"} +\end{boxedcode} + +The default device will be some visible vendor \ucode{A} GPU device. +If not available, then set to initial device: +\begin{boxedcode} +export OMP_DEFAULT_DEVICE=\ucode{"kind(gpu)&&vendor(A),initial"} +\end{boxedcode} + +The default device will be some visible vendor \ucode{A} GPU device. 
+If not available, then set to invalid device so that upon first use of default +device the program will error out: +\begin{boxedcode} +export OMP_DEFAULT_DEVICE=\ucode{"kind(gpu)&&vendor(A),invalid"} +\end{boxedcode} + diff --git a/devices/lambda_expressions.tex b/devices/lambda_expressions.tex index fabcc7d..5e50f25 100644 --- a/devices/lambda_expressions.tex +++ b/devices/lambda_expressions.tex @@ -1,11 +1,11 @@ %\pagebreak +\begin{cppspecific}[4ex] \section{Lambda Expressions} \label{sec:lambda_expressions} \index{lambda expressions} -\cppspecificstart The following example illustrates the usage of lambda expressions and their corresponding closure objects within a \kcode{target} region. @@ -48,5 +48,6 @@ results from the \kcode{declare target} directive. The \kcode{always} modifier is used on the \kcode{map} clause to transfer the updated values for the structure back to the host device. +\topmarker{C++} \cppnexample[5.0]{lambda_expressions}{1} -\cppspecificend +\end{cppspecific} diff --git a/devices/sources/async_target.5.c b/devices/sources/async_target.5.c new file mode 100644 index 0000000..58b71e4 --- /dev/null +++ b/devices/sources/async_target.5.c @@ -0,0 +1,42 @@ +/* +* @@name: async_target.5 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_6.0 +*/ +#include +#include +#include + +#pragma omp begin declare target +void update(int* num) { + + *num = (*num) * 3; +} +#pragma omp end declare target + +int main(int argc, char*argv[]){ + int x = 2 ; + int is_deferred = time(NULL) % 2; + + #pragma omp target nowait(is_deferred) map(tofrom: x) + { + update(&x); + } + + // Perform other tasks in parallel while the + // target region is executing + + if(is_deferred){ + #pragma omp taskwait + } + + if( x == 6){ + printf("Passed\n"); + return 0; + } else { + printf("Failed\n"); + return 1; + } +} diff --git a/devices/sources/async_target.5.f90 b/devices/sources/async_target.5.f90 new file mode 100644 index 0000000..1cc5bf3 --- /dev/null +++ 
b/devices/sources/async_target.5.f90 @@ -0,0 +1,41 @@ +! @@name: async_target.5 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_6.0 +program async_target_nowait_arg + implicit none + integer :: x + logical :: is_deferred + real :: rand_no + + x = 2 + ! Determine if computation is deferred + call random_number(rand_no) + is_deferred=mod(int(rand_no*10), 2) == 1 + + !$omp target map(tofrom: x) nowait(is_deferred) + call update(x) + !$omp end target + + ! Perform other tasks in parallel while the target region is executing + + if (is_deferred) then + !$omp taskwait + endif + + if (x == 6) then + stop "Passed" + else + error stop "Failed" + endif + +contains + + subroutine update(num) + integer, intent(inout) :: num + !$omp declare target + num = num * 3 + end subroutine update + +end program async_target_nowait_arg diff --git a/devices/sources/teams.7.c b/devices/sources/teams.7.c new file mode 100644 index 0000000..1615c41 --- /dev/null +++ b/devices/sources/teams.7.c @@ -0,0 +1,27 @@ +/* +* @@name: teams.7 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_6.0 +*/ +#include +#include + +int x; +#pragma omp declare target local(x) + +int main() { + x = 128; + #pragma omp target + x = 256; + + #pragma omp target + #pragma omp teams num_teams(x) // Undefined behavior due to value of "x" + if (omp_get_team_num() == 0){ + printf("%d\n", omp_get_num_teams()); + } + + return 0; +} + diff --git a/devices/sources/teams.7.f90 b/devices/sources/teams.7.f90 new file mode 100644 index 0000000..e095bcc --- /dev/null +++ b/devices/sources/teams.7.f90 @@ -0,0 +1,25 @@ +! @@name: teams.7 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_6.0 +PROGRAM main + USE omp_lib + INTEGER :: x + !$OMP DECLARE TARGET LOCAL(x) + + x = 128 + + !$OMP TARGET + x = 256 + !$OMP END TARGET + + !$OMP TARGET + !$OMP TEAMS NUM_TEAMS(x) ! 
Undefined behavior due to value of 'x' + IF (omp_get_team_num() == 0) THEN + PRINT *, omp_get_num_teams() + END IF + !$OMP END TEAMS + !$OMP END TARGET + +END PROGRAM main diff --git a/devices/sources/usm_scalar_ptr_ref_asc.1.f90 b/devices/sources/usm_scalar_ptr_ref_asc.1.f90 index fbb4e71..a39c2f2 100644 --- a/devices/sources/usm_scalar_ptr_ref_asc.1.f90 +++ b/devices/sources/usm_scalar_ptr_ref_asc.1.f90 @@ -1,4 +1,4 @@ -! @@name: usm_scalar_ptr_ref_ax.1 +! @@name: usm_scalar_ptr_ref_asc.1 ! @@type: F-free ! @@operation: compile ! @@expect: success diff --git a/devices/target_fort_allocatable_array_mapping.tex b/devices/target_fort_allocatable_array_mapping.tex index 3a159e5..8a3e91e 100644 --- a/devices/target_fort_allocatable_array_mapping.tex +++ b/devices/target_fort_allocatable_array_mapping.tex @@ -1,4 +1,5 @@ %\pagebreak +\begin{fortranspecific}[4ex] \section{Fortran Allocatable Array Mapping} \label{sec:fort_allocatable_array_mapping} \index{mapping!allocatable array, Fortran} @@ -36,8 +37,8 @@ or disassociated status, and associated storage can be mapped and attached as ne For allocatable variables, the update of the allocation status to allocated (allowing reference to allocated storage) on the device, is similar to pointer attachment. - -\ffreeexample[5.1]{target_fort_allocatable_map}{1} +\topmarker{Fortran} +\ffreenexample[5.1]{target_fort_allocatable_map}{1} Once an allocatable variable has been allocated on the host, its allocation status may not be changed in a \kcode{target} region, either @@ -48,7 +49,8 @@ and allocation) in a \kcode{target} region is not conforming. Also, an initial intrinsic assignment of an allocatable variable requires deallocation before the \kcode{target} region ends. 
-\ffreeexample[5.1]{target_fort_allocatable_map}{2} +\topmarker{Fortran} +\ffreenexample[5.1]{target_fort_allocatable_map}{2} \newpage The next example illustrates a corner case of this restriction (allocatable status @@ -62,4 +64,5 @@ the compiler will deallocate the associated actual argument when the subroutine (However, the allocation on procedure entry can be avoided by specifying the intent as \bcode{intent(inout)}, making the intended use conforming.) -\ffreeexample[5.1]{target_fort_allocatable_map}{3} +\ffreenexample[5.1]{target_fort_allocatable_map}{3} +\end{fortranspecific} diff --git a/devices/teams.tex b/devices/teams.tex index 669bea4..afbb736 100644 --- a/devices/teams.tex +++ b/devices/teams.tex @@ -64,7 +64,7 @@ each primary thread's private copy of \ucode{sum} is reduced into the final \uco implicitly mapped into the \kcode{target} region. \cexample[4.0]{teams}{2} -\clearpage +%\clearpage \ffreeexample[4.0]{teams}{2} @@ -144,3 +144,13 @@ thread uses SIMD parallelism. \ffreeexample[4.0]{teams}{6} +\subsection{Evaluation of \kcode{num_teams} Clause that Appears inside \kcode{target} Region} +\label{subsec:target_teams_num_teams} + +The following example shows the evaluation of the \kcode{num_teams} clause when the \kcode{teams} construct is closely nested inside a \kcode{target} construct. The code is non-conforming since the value of \ucode{x} for the clause may differ between devices. As of OpenMP 6.0, it is the user's responsibility to ensure identical values for the clause expression for nested as well as combined directive cases for \kcode{target} and \kcode{teams} constructs. This permits implementations to evaluate the \kcode{num_teams} argument on the host rather than the target device. For the program to be conforming, the program must update the host value so that \ucode{x} will have the same value when evaluated on the host or target device. 
+ +\cexample[6.0]{teams}{7} + +\ffreeexample[6.0]{teams}{7} + + diff --git a/devices/usm.tex b/devices/usm.tex index 56399cc..5253578 100644 --- a/devices/usm.tex +++ b/devices/usm.tex @@ -14,6 +14,7 @@ unified shared memory (USM) is required throughout the scope of the program by t \kcode{unified_shared_memory} clause in a \kcode{requires} directive. USM assumes a unified address space. +\begin{cppspecific} In the C++ code of the first example, a scalar (\ucode{x}), a pointer (\ucode{ptr}), and a reference (\ucode{ref}) are used in a \kcode{target} construct in Cases 1, 2 and 3, respectively. For the scalar variable \ucode{x}, the predetermined data-sharing attribute is still @@ -24,6 +25,10 @@ the \kcode{target} construct, as seen in Case 2. For the reference \ucode{ref}, the object to which it refers is mapped for the \kcode{target} construct, as seen in Case 3. +\cppnexample[5.2]{usm_scalar_ptr_ref_asc}{1} +\end{cppspecific} + +\begin{fortranspecific} In Case 1 of the Fortran example, the scalar \ucode{x} is firstprivate under the USM requirement in the \kcode{target} construct, and modification of the local variable on the device is never updated to the host data environment. @@ -35,5 +40,5 @@ but implicitly mapped. Hence, updates to the value of \ucode{y} appear in the h %Hence, updates to \ucode{y} in the \kcode{target} construct appear in the data environment of the host. 
%\pagebreak -\cppexample[5.2]{usm_scalar_ptr_ref_asc}{1} -\ffreeexample[5.2]{usm_scalar_ptr_ref_asc}{1} +\ffreenexample[5.2]{usm_scalar_ptr_ref_asc}{1} +\end{fortranspecific} diff --git a/directives/attributes.tex b/directives/attributes.tex index 6d09b48..4e5ab09 100644 --- a/directives/attributes.tex +++ b/directives/attributes.tex @@ -1,22 +1,23 @@ -\section{C++ Attributes} +\begin{ccppspecific}[4ex] +\section{C/C++ Attributes} \label{sec:attributes} -\index{directive syntax!attribute, C++} -\index{attribute syntax, C++} +\index{directive syntax!attribute, C/C++} +\index{attribute syntax, C/C++} -OpenMP directives for C++ can also be specified with -%the implementation-defined -the \kcode{directive} extension for the C++11 standard \plc{attributes}. +OpenMP directives for C/C++ can also be specified with +the \kcode{directive} extension for the C23 and C++11 standard \plc{attributes}. %https://en.cppreference.com/w/cpp/language/attributes -The C++ example below shows two ways to parallelize a \bcode{for} loop using the \kcode{\#pragma} syntax. +The example below shows two ways to parallelize a \bcode{for} loop using the \kcode{\#pragma} syntax. The first pragma uses the combined \kcode{parallel for} directive, and the second applies the uncombined closely nested directives, \kcode{parallel} and \kcode{for}, directly to the same statement. These are labeled PRAG 1-3. Using the attribute syntax, the same construct in PRAG 1 -is applied two different ways in attribute form, as shown in the ATTR 1 and ATTR 2 sections. +is applied in two different ways in attribute form, as shown in the ATTR 1 and ATTR 2 sections. In ATTR 1 the attribute syntax is used with the \kcode{omp ::} namespace form. -In ATTR 2 the attribute syntax is used with the \kcode{using omp :} namespace form. +In ATTR 2 the attribute syntax is used with the \kcode{using omp :} namespace +form available for C++ only. 
Next, parallelization is attempted by applying directives using two different syntaxes. For ATTR 3 and PRAG 4, the loop parallelization will fail to compile because multiple directives that @@ -53,4 +54,33 @@ form of the \kcode{simd} directive is used for loops calling the \ucode{Q} funct in combination with the attribute form of the \kcode{declare simd} directives declaring the variants for \ucode{Q}. -\cppexample[5.1]{directive_syntax_attribute}{1} +\topmarker{C/C++} +\cppnexample[6.0]{directive_syntax_attribute}{1} + +\topmarker{C/C++} +The following code snippets show how to use the \kcode{omp::decl} attribute +as an alternative way for specifying declarative directives. +The \kcode{omp::decl} attribute can be embedded in the base +language declarations as shown for variables in Cases 1 and 2, +for function in Case 3, and for C++ template in Case 4. +The variable and function name lists are implied from where +the attributes are specified. + +In Case 1, the prefix attribute applies +to all variables (\ucode{u} and \ucode{v}) in the declaration; +in Case 2, the postfix attribute applies to the associated variable +(\ucode{a} as the directive argument for the \kcode{declare_target} directive, +and \ucode{b} as the clause argument for the \kcode{link} clause +on \kcode{declare_target}); +in Case 3, the prefix attribute applies to the function (\ucode{f}). +The comma to separate directive name (\kcode{declare_target}) and +clause name (\kcode{link}) in +the \kcode{omp::decl} attribute specifier in Case 2 is optional. + +Case 4 shows the use of \kcode{omp::decl(declare_target)} for +a C++ template function definition +and its equivalent using the delimited +\kcode{begin}/\kcode{end declare_target} pragma form. 
+ +\cppnexample[6.0]{directive_syntax_attribute}{2} +\end{ccppspecific} diff --git a/directives/fixed_format_comments.tex b/directives/fixed_format_comments.tex index 10cff43..59ce6a9 100644 --- a/directives/fixed_format_comments.tex +++ b/directives/fixed_format_comments.tex @@ -1,4 +1,5 @@ -%\pagebreak +\pagebreak +\begin{fortranspecific}[4ex] \section{Fortran Comments (Fixed Source Form)} \label{sec:fortran_fixed_format_comments} \index{directive syntax!fixed form, Fortran} @@ -16,5 +17,6 @@ two separate directives. Here, an \kcode{end} directive (\kcode{end parallel}) must be specified to demarcate the range (region) of the \kcode{parallel} directive. -\fexample{directive_syntax_F_fixed_comment}{1} +\fnexample{directive_syntax_F_fixed_comment}{1} +\end{fortranspecific} \clearpage diff --git a/directives/free_format_comments.tex b/directives/free_format_comments.tex index 2c84578..8187c4e 100644 --- a/directives/free_format_comments.tex +++ b/directives/free_format_comments.tex @@ -1,4 +1,5 @@ %\pagebreak +\begin{fortranspecific}[4ex] \section{Fortran Comments (Free Source Form)} \label{sec:fortran_free_format_comments} \index{directive syntax!free form, Fortran} @@ -18,7 +19,7 @@ two separate directives. Here, an \kcode{end} directive (\kcode{end parallel}) must be specified to demarcate the range (region) of the \kcode{parallel} directive. -\ffreeexample{directive_syntax_F_free_comment}{1} +\ffreenexample{directive_syntax_F_free_comment}{1} \clearpage As of OpenMP 5.1, \bcode{block} and \bcode{end block} statements can be used to designate @@ -28,7 +29,8 @@ block structure and are hence private. It was necessary to explicitly declare the \ucode{i} variable, due to the \bcode{implicit none} statement; it could have also been declared outside the structured block. 
-\ffreeexample[5.1]{directive_syntax_F_block}{1} +\topmarker{Fortran} +\ffreenexample[5.1]{directive_syntax_F_block}{1} A Fortran \bcode{BLOCK} construct may eliminate the need for a paired \kcode{end} directive for an OpenMP construct, as illustrated in the following example. @@ -48,4 +50,6 @@ a strictly structured block of an OpenMP construct is treated as the terminating of that construct. The next \kcode{end parallel} directive is required to terminate the outer \kcode{parallel} construct. -\ffreeexample[5.1]{directive_syntax_F_block}{2} +\topmarker{Fortran} +\ffreenexample[5.1]{directive_syntax_F_block}{2} +\end{fortranspecific} diff --git a/directives/pragmas.tex b/directives/pragmas.tex index 1f63eeb..0cd61d3 100644 --- a/directives/pragmas.tex +++ b/directives/pragmas.tex @@ -1,4 +1,5 @@ %\pagebreak +\begin{ccppspecific}[4ex] \section{C/C++ Pragmas} \label{sec:pragmas} \index{directive syntax!pragma, C/C++} @@ -20,4 +21,5 @@ two separate directives. The executable directives above all apply to the next statement. The \kcode{parallel} directive can be applied to a \plc{structured block} as shown in PRAG 5. 
-\cexample{directive_syntax_pragma}{1} +\cnexample{directive_syntax_pragma}{1} +\end{ccppspecific} diff --git a/directives/sources/directive_syntax_attribute.1.cpp b/directives/sources/directive_syntax_attribute.1.cpp index cabb38e..e2c9184 100644 --- a/directives/sources/directive_syntax_attribute.1.cpp +++ b/directives/sources/directive_syntax_attribute.1.cpp @@ -3,7 +3,7 @@ * @@type: C++ * @@operation: run * @@expect: success -* @@version: omp_5.1 +* @@version: omp_6.0 */ #include #include @@ -77,5 +77,5 @@ int main() { // OUTPUT: thrd no 2 // OUTPUT: thrd no 3 -// repeated 3 time: +// repeated 3 times: // OUTPUT: 656700.000000 diff --git a/directives/sources/directive_syntax_attribute.2.cpp b/directives/sources/directive_syntax_attribute.2.cpp new file mode 100644 index 0000000..bf372db --- /dev/null +++ b/directives/sources/directive_syntax_attribute.2.cpp @@ -0,0 +1,36 @@ +/* +* @@name: directive_syntax_attribute.2 +* @@type: C++ +* @@operation: view +* @@expect: none +* @@version: omp_6.0 +*/ +// Case 1 +[[ omp::decl(threadprivate) ]] int u, v; +// equivalent to +int u ,v; +#pragma omp threadprivate(u, v) + +// Case 2 +int a[100] [[ omp::decl(declare_target) ]], + b[100] [[ omp::decl(declare_target, link) ]]; +// equivalent to +int a[100], b[100]; +#pragma omp declare_target(a) +#pragma omp declare_target link(b) + +// Case 3 +[[ omp::decl(declare_target) ]] void f( int c ); +// equivalent to +void f( int c ); +#pragma omp declare_target(f) + +// Case 4 +template +[[ omp::decl(declare_target) ]] +void foo(T); +// equivalent to +#pragma omp begin declare_target +template +void foo(T); +#pragma omp end declare_target diff --git a/introduction/Examples.tex b/introduction/Examples.tex index 259fa2f..75da25a 100644 --- a/introduction/Examples.tex +++ b/introduction/Examples.tex @@ -29,15 +29,18 @@ prior to OpenMP version 3.0, such as Language markers may be used to indicate text or codes that are specific to a particular base language. 
-\ccppspecificstart +\begin{ccppspecific} This is C/C++ specific: A statement following a directive is compound only when necessary, and a non-compound statement is indented with respect to a directive preceding it. -\ccppspecificend -\fortranspecificstart +\end{ccppspecific} +\begin{fortranspecific} This is Fortran specific... -\fortranspecificend +\end{fortranspecific} +\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} +This marks the continuation of a language-specific page. +\medskip Throughout the examples document we assume that the number of threads used for a \kcode{parallel} region is the same as the number of threads requested, unless explicitly specified otherwise. diff --git a/loop_transformations/apply.tex b/loop_transformations/apply.tex new file mode 100644 index 0000000..dd951ab --- /dev/null +++ b/loop_transformations/apply.tex @@ -0,0 +1,150 @@ +\pagebreak +\section{\kcode{apply} Clause} +\label{sec:apply_clause} + +\index{unroll construct@\kcode{unroll} construct!apply clause@\kcode{apply} clause} +\index{tile construct@\kcode{tile} construct!apply clause@\kcode{apply} clause} + +\index{apply clause@\kcode{apply} clause} +\index{clauses!apply@\kcode{apply}} + +A loop transformation construct can be applied to another nested +loop transformation construct, but the application of the ``outer'' transformation +is limited to the outermost generated loop of the ``inner'' transformation. + +The \kcode{apply} clause on a loop transformation construct can specify additional +loop transformation directives that apply to generated loops other than the outermost one. +Clause modifiers are used to specify which generated loop to target. +Also, an applied directive within a clause may specify another \kcode{apply} clause. + +%The \code{apply} clause on a loop transformation construct can specify (other) +%loop transformation directives to be applied to its transformation.
+%Clause modifiers can be used to target specific generated loops, providing a mechanism +%to overcome the restriction of applying a transformation immediately to the next loop +%transformation construct. Also, an applied directive within a clause may be another +%\code{apply} clause. + +Any nested loop transformation constructs, including any constructs that +result from \kcode{apply} clauses of nested constructs, are replaced before any enclosing +loop transformation construct. This is referred to as the \plc{innermost-first order} +here. + +\subsection{Syntax and Effect} + +In the example below, the \ucode{construct_unroll} and \ucode{apply_unroll} functions +illustrate the syntax for two equivalent means of applying the \kcode{unroll} loop transformation +directive to the outermost generated (grid) loop of the \kcode{tile} construct transformation. +In function \ucode{construct_unroll}, the tile transformation creates the generated (tiled) loops +and then the \kcode{unroll} construct is applied to the outermost loop of the replacement. +In the \ucode{apply_unroll} function, the \kcode{apply} clause on the \kcode{tile} construct +is used to apply an \kcode{unroll} transformation on the \plc{grid} loop (the outermost loop +of the tile transformation) as specified by the \kcode{grid} modifier. + +\cexample[6.0]{apply_syntax}{1} +\ffreeexample[6.0]{apply_syntax}{1} + +For the two functions in the previous example, +the \ucode{equivalent} function in the next example shows equivalent +code that a user could have written without using the \kcode{tile} construct +or \kcode{apply} clause. + +\cexample[5.1]{apply_syntax_equivalent}{1} +\ffreeexample[5.1]{apply_syntax_equivalent}{1} + + +The following example shows how multiple loop transformation directives +can be applied to different generated loops resulting from a loop transformation. +For the 4x4 \kcode{tile} construct there will be two (outer) \plc{grid} loops and two (inner) \plc{intra-tile} loops.
+The first \kcode{apply} clause specifies that an \kcode{interchange} directive and a \kcode{nothing} directive +(just a placeholder to indicate no directive application) are to be applied to the two \plc{grid} loops (the two outermost generated loops). +Directives, read from left to right, are applied to the \plc{grid} loops, from outermost to innermost, respectively. +The second \kcode{apply} clause specifies that \kcode{nothing} and \kcode{interchange} directives are to be applied to the +two \plc{intratile} loops (the last two generated loops), respectively. +Note that the \ucode{A} array dimensions are \ucode{A[100][100][3]} and \ucode{A(0:2,0:99,0:99)} +in the C/C++ and Fortran codes to illustrate equivalent sequential memory access for the +\ucode{i}, \ucode{j} and \ucode{k} loops. + +\index{interchange directive@\kcode{interchange} directive} +\index{directives!interchange@\kcode{interchange}} +\index{nothing directive@\kcode{nothing} directive} +\index{directives!nothing@\kcode{nothing}} + +\cexample[6.0]{apply_syntax}{2} +\pagebreak +\ffreeexample[6.0]{apply_syntax}{2} + +For the function in the previous example, +the \ucode{equivalent} function in the next example shows a possible +equivalent tile replacement code (\kcode{tile} generated loops) and the +appropriately positioned \kcode{interchange} and \kcode{nothing} directives. + +\cexample[6.0]{apply_syntax_equivalent}{2} +\pagebreak +\ffreeexample[6.0]{apply_syntax_equivalent}{2} + + +\index{tile construct@\kcode{tile} construct!apply clause@\kcode{apply} clause} +\index{grid modifier@\kcode{grid} modifier} +\index{intratile modifier@\kcode{intratile} modifier} + +The following example illustrates the use of \kcode{apply} clause +modifiers with an argument. The index of the generated loop, instead of +a positional location, can be used for the applied-directive.
+The \kcode{grid(1)} modifier indicates the first grid loop +generated by the \kcode{tile} directive +and the \kcode{intratile(2)} modifier indicates the second tile loop +generated by the \kcode{tile} directive. + +\cexample[6.0]{apply_syntax}{3} +\pagebreak +\ffreeexample[6.0]{apply_syntax}{3} + +Without the index arguments, the \kcode{nothing} argument would +be needed as a placeholder, as illustrated by the equivalent codes +of the above example as follows. + +\cexample[6.0]{apply_syntax_equivalent}{3} +\pagebreak +\ffreeexample[6.0]{apply_syntax_equivalent}{3} + + +\subsection{Spanning Loop Associations} + +It is possible for a loop transformation directive to be applied to multiple generated loops, +and multiple directives applied to the same generated loop. +The latter is illustrated in the following example. + +\cexample[6.0]{apply_span}{1} +\ffreeexample[6.0]{apply_span}{1} + +In this example, the functions show successive steps in the application of +the previous loop transformation example as equivalent user-written code. +First, the tiling is applied in the \ucode{step1} function. +Next, loop transformations in the generated loop nest are replaced according to the innermost-first order rule. +Applying the innermost transformation, loop reversal, results in the loop nest in \ucode{step2}. +After that, the \kcode{interchange} directive is applied in the \ucode{step3} function. + +\index{reverse directive@\kcode{reverse} directive} +\index{directives!reverse@\kcode{reverse}} + +\cexample[6.0]{apply_span_equivalent}{1} +\ffreeexample[6.0]{apply_span_equivalent}{1} + + +\subsection{Nested apply} + +The following example illustrates how multiple loop transformations can be chained by nesting \kcode{apply} clauses. +In the \ucode{nested_apply} function, a loop is first tiled, then the intra-tile +loop is unrolled, and finally the iteration order of the unrolled loop is reversed.
+For C/C++ codes, reversing a loop with an unsigned type index may cause the compiler +to ensure that underflow is handled correctly. + +\cexample[6.0]{apply_nested}{1} +\ffreeexample[6.0]{apply_nested}{1} + +In this example, the \ucode{step1}, \ucode{step2} and \ucode{step3} +functions are all equivalent to the \ucode{nested_apply} function, and illustrate +a possible chain of transformations done manually by a user. + +\cexample[6.0]{apply_nested_equivalent}{1} +\ffreeexample[6.0]{apply_nested_equivalent}{1} diff --git a/loop_transformations/sources/apply_nested.1.c b/loop_transformations/sources/apply_nested.1.c new file mode 100644 index 0000000..4c12918 --- /dev/null +++ b/loop_transformations/sources/apply_nested.1.c @@ -0,0 +1,14 @@ +/* +* @@name: apply_nested.1 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +void nested_apply(double A[100]) +{ + #pragma omp tile sizes(10) \ + apply(intratile: unroll partial(2) apply(reverse)) + for (int i = 0; i < 100; ++i) + A[i] = A[i] + 1; +} diff --git a/loop_transformations/sources/apply_nested.1.f90 b/loop_transformations/sources/apply_nested.1.f90 new file mode 100644 index 0000000..c37bfd6 --- /dev/null +++ b/loop_transformations/sources/apply_nested.1.f90 @@ -0,0 +1,15 @@ +! @@name: apply_nested.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +!
@@version: omp_6.0 +subroutine nested_apply(A) + implicit none + double precision :: A(0:99) + integer :: i + + !$omp tile sizes(10) apply(intratile: unroll partial(2) apply(reverse)) + do i = 0, 99 + A(i) = A(i) + 1 + enddo +end subroutine diff --git a/loop_transformations/sources/apply_nested_equivalent.1.c b/loop_transformations/sources/apply_nested_equivalent.1.c new file mode 100644 index 0000000..f3037ae --- /dev/null +++ b/loop_transformations/sources/apply_nested_equivalent.1.c @@ -0,0 +1,39 @@ +/* +* @@name: apply_nested_equivalent.1 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +void step1(double A[100]) +{ + for (int i1 = 0; i1 < 10; ++i1) + #pragma omp unroll partial(2) apply(reverse) + for (int i2 = 0; i2 < 10; ++i2) { + int i = i1 * 10 + i2; + A[i] = A[i] + 1; + } +} + +void step2(double A[100]) +{ + for (int i1 = 0; i1 < 10; ++i1) + #pragma omp reverse + for (int i2 = 0; i2 < 5; ++i2) { + int i = i1 * 10 + i2 * 2; + A[i] = A[i] + 1; + ++i; + A[i] = A[i] + 1; + } +} + +void step3(double A[100]) +{ + for (int i1 = 0; i1 < 10; ++i1) + for (int i2 = 4; i2 >= 0; --i2) { + int i = i1 * 10 + i2 * 2; + A[i] = A[i] + 1; + ++i; + A[i] = A[i] + 1; + } +} diff --git a/loop_transformations/sources/apply_nested_equivalent.1.f90 b/loop_transformations/sources/apply_nested_equivalent.1.f90 new file mode 100644 index 0000000..e426802 --- /dev/null +++ b/loop_transformations/sources/apply_nested_equivalent.1.f90 @@ -0,0 +1,46 @@ +! @@name: apply_nested_equivalent.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +! 
@@version: omp_6.0 +subroutine step1(A) + implicit none + double precision :: A(0:99) + integer :: i,i1,i2 + + do i1 = 0, 9 + !$omp unroll partial(2) apply(reverse) + do i2 = 0, 9 + i = i1 * 10 + i2 + A(i) = A(i) + 1 + enddo; enddo +end subroutine + +subroutine step2(A) + implicit none + double precision :: A(0:99) + integer :: i,i1,i2 + + do i1 = 0, 9 + !$omp reverse + do i2 = 0, 4 + i = i1 * 10 + i2 * 2 + A(i) = A(i) + 1 + i = i + 1 + A(i) = A(i) + 1 + enddo; enddo +end subroutine + +subroutine step3(A) + implicit none + double precision :: A(0:99) + integer :: i,i1,i2 + + do i1 = 0, 9 + do i2 = 4, 0, -1 + i = i1 * 10 + i2 * 2 + A(i) = A(i) + 1 + i = i + 1 + A(i) = A(i) + 1 + enddo; enddo +end subroutine diff --git a/loop_transformations/sources/apply_span.1.c b/loop_transformations/sources/apply_span.1.c new file mode 100644 index 0000000..7c076e5 --- /dev/null +++ b/loop_transformations/sources/apply_span.1.c @@ -0,0 +1,16 @@ +/* +* @@name: apply_span.1 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +void span_apply(double A[128][128]) +{ + #pragma omp for collapse(2) + #pragma omp tile sizes(16,16) \ + apply(grid: interchange,reverse) + for (int i = 0; i < 128; ++i) + for (int j = 0; j < 128; ++j) + A[i][j] = A[i][j] + 1; +} diff --git a/loop_transformations/sources/apply_span.1.f90 b/loop_transformations/sources/apply_span.1.f90 new file mode 100644 index 0000000..493c8e4 --- /dev/null +++ b/loop_transformations/sources/apply_span.1.f90 @@ -0,0 +1,18 @@ +! @@name: apply_span.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +! 
@@version: omp_6.0 +subroutine span_apply( A ) + implicit none + double precision :: A(0:127,0:127) + integer :: i , j + + !$omp for collapse(2) + !$omp tile sizes(16,16) apply(grid: interchange,reverse) + do i = 0, 127 + do j = 0, 127 + A(j,i) = A(j,i) + 1 + enddo; enddo + +end subroutine diff --git a/loop_transformations/sources/apply_span_equivalent.1.c b/loop_transformations/sources/apply_span_equivalent.1.c new file mode 100644 index 0000000..9f7da5b --- /dev/null +++ b/loop_transformations/sources/apply_span_equivalent.1.c @@ -0,0 +1,52 @@ +/* +* @@name: apply_span_equivalent.1 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +void step1(double A[128][128]) +{ + #pragma omp for collapse(2) + #pragma omp interchange + for (int i1 = 0; i1 < 8; ++i1) + #pragma omp reverse + for (int j1 = 0; j1 < 8; ++j1) + + for (int i2 = 0; i2 < 16; ++i2) + for (int j2 = 0; j2 < 16; ++j2) { + int i = i1 * 16 + i2; + int j = j1 * 16 + j2; + A[i][j] = A[i][j] + 1; + } +} + +void step2(double A[128][128]) +{ + #pragma omp for collapse(2) + #pragma omp interchange + for (int i1 = 0; i1 < 8; ++i1) + for (int j1 = 7; j1 >= 0; --j1) + + for (int i2 = 0; i2 < 16; ++i2) + for (int j2 = 0; j2 < 16; ++j2) { + int i = i1 * 16 + i2; + int j = j1 * 16 + j2; + A[i][j] = A[i][j] + 1; + } +} + +void step3(double A[128][128]) +{ + #pragma omp for collapse(2) + for (int j1 = 7; j1 >= 0; --j1) + for (int i1 = 0; i1 < 8; ++i1) + + for (int i2 = 0; i2 < 16; ++i2) + for (int j2 = 0; j2 < 16; ++j2) { + int i = i1 * 16 + i2; + int j = j1 * 16 + j2; + A[i][j] = A[i][j] + 1; + } + +} diff --git a/loop_transformations/sources/apply_span_equivalent.1.f90 b/loop_transformations/sources/apply_span_equivalent.1.f90 new file mode 100644 index 0000000..d86dfa1 --- /dev/null +++ b/loop_transformations/sources/apply_span_equivalent.1.f90 @@ -0,0 +1,64 @@ +! @@name: apply_span_equivalent.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +! 
@@version: omp_6.0 +subroutine step1(A) + implicit none + double precision :: A(0:127, 0:127) + integer :: i,i1,i2, j,j1,j2 + + !$omp do collapse(2) + !$omp interchange + do i1 = 0, 7 + !$omp reverse + do j1 = 0, 7 + + do i2 = 0, 15 + do j2 = 0, 15 + i = i1 * 16 + i2 + j = j1 * 16 + j2 + A(j,i) = A(j,i) + 1 + enddo; enddo + enddo; enddo + +end subroutine + +subroutine step2(A) + implicit none + double precision :: A(0:127, 0:127) + integer :: i,i1,i2, j,j1,j2 + + !$omp do collapse(2) + !$omp interchange + do i1 = 0, 7 + do j1 = 7, 0, -1 + + do i2 = 0, 15 + do j2 = 0, 15 + i = i1 * 16 + i2 + j = j1 * 16 + j2 + A(j,i) = A(j,i) + 1 + enddo; enddo + enddo; enddo + +end subroutine + +subroutine step3(A) + implicit none + double precision :: A(0:127, 0:127) + integer :: i,i1,i2, j,j1,j2 + + !$omp do collapse(2) + do j1 = 7, 0, -1 + do i1 = 0, 7 + + do i2 = 0, 15 + do j2 = 0, 15 + i = i1 * 16 + i2 + j = j1 * 16 + j2 + A(j,i) = A(j,i) + 1 + enddo; enddo + enddo; enddo + +end subroutine diff --git a/loop_transformations/sources/apply_syntax.1.c b/loop_transformations/sources/apply_syntax.1.c new file mode 100644 index 0000000..4e7c241 --- /dev/null +++ b/loop_transformations/sources/apply_syntax.1.c @@ -0,0 +1,21 @@ +/* +* @@name: apply_syntax.1 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +void construct_unroll(double A[100]) +{ + #pragma omp unroll + #pragma omp tile sizes(4) + for (int i = 0; i < 100; ++i) + A[i] = A[i] + 1; +} + +void apply_unroll(double A[100]) +{ + #pragma omp tile sizes(4) apply(grid: unroll) + for (int i = 0; i < 100; ++i) + A[i] = A[i] + 1; +} diff --git a/loop_transformations/sources/apply_syntax.1.f90 b/loop_transformations/sources/apply_syntax.1.f90 new file mode 100644 index 0000000..e7e06f0 --- /dev/null +++ b/loop_transformations/sources/apply_syntax.1.f90 @@ -0,0 +1,27 @@ +! @@name: apply_syntax.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +! 
@@version: omp_6.0 +subroutine construct_unroll(A) + implicit none + integer :: i + double precision :: A(0:99) + + !$omp unroll + !$omp tile sizes(4) + do i = 0, 99 + A(i) = A(i) + 1 + end do +end subroutine + +subroutine apply_unroll(A) + implicit none + integer :: i + double precision :: A(0:99) + + !$omp tile sizes(4) apply(grid: unroll) + do i = 0, 99 + A(i) = A(i) + 1 + end do +end subroutine diff --git a/loop_transformations/sources/apply_syntax.2.c b/loop_transformations/sources/apply_syntax.2.c new file mode 100644 index 0000000..089526f --- /dev/null +++ b/loop_transformations/sources/apply_syntax.2.c @@ -0,0 +1,19 @@ +/* +* @@name: apply_syntax.2 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +void apply_assoc(double A[100][100][3]) +{ + #pragma omp tile sizes(4,4) \ + apply( grid: interchange,nothing) \ + apply(intratile: nothing,interchange) + for (int i = 0; i < 100; ++i) + for (int j = 0; j < 100; ++j) + + // k loop not associated with tile, but with interchange + for (int k = 0; k < 3; ++k) + A[i][j][k] = A[i][j][k] + 1; +} diff --git a/loop_transformations/sources/apply_syntax.2.f90 b/loop_transformations/sources/apply_syntax.2.f90 new file mode 100644 index 0000000..ad50b3e --- /dev/null +++ b/loop_transformations/sources/apply_syntax.2.f90 @@ -0,0 +1,21 @@ +! @@name: apply_syntax.2 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_6.0 +subroutine apply_assoc(A) + implicit none + double precision :: A(0:2, 0:99, 0:99) + integer :: k, j, i + + !$omp tile sizes(4,4) & + !$omp& apply( grid: interchange, nothing) & + !$omp& apply(intratile: nothing, interchange) + do i = 0, 99 + do j = 0, 99 + + do k = 0, 2 !! k loop not associated with tile, but w. 
interchange + A(k,j,i) = A(k,j,i) + 1 + enddo + enddo; enddo +end subroutine diff --git a/loop_transformations/sources/apply_syntax.3.c b/loop_transformations/sources/apply_syntax.3.c new file mode 100644 index 0000000..8ec5eb7 --- /dev/null +++ b/loop_transformations/sources/apply_syntax.3.c @@ -0,0 +1,16 @@ +/* +* @@name: apply_syntax.3 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +void apply_complexarg(double A[100*100]) +{ + #pragma omp tile sizes(4,5) \ + apply(grid(1): reverse) \ + apply(intratile(2): unroll) + for (int i = 0; i < 100; ++i) + for (int j = 0; j < 100; ++j) + A[i*100+j] += 1; +} diff --git a/loop_transformations/sources/apply_syntax.3.f90 b/loop_transformations/sources/apply_syntax.3.f90 new file mode 100644 index 0000000..f1d0a36 --- /dev/null +++ b/loop_transformations/sources/apply_syntax.3.f90 @@ -0,0 +1,19 @@ +! @@name: apply_syntax.3 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_6.0 +subroutine apply_complexarg(A) + implicit none + double precision :: A(100,100) + integer :: i, j + + !$omp tile sizes(4,5) & + !$omp& apply(grid(1): reverse) & + !$omp& apply(intratile(2): unroll) + do i = 1, 100 + do j = 1, 100 + A(j,i) = A(j,i) + 1 + end do + end do +end subroutine diff --git a/loop_transformations/sources/apply_syntax_equivalent.1.c b/loop_transformations/sources/apply_syntax_equivalent.1.c new file mode 100644 index 0000000..d20f096 --- /dev/null +++ b/loop_transformations/sources/apply_syntax_equivalent.1.c @@ -0,0 +1,16 @@ +/* +* @@name: apply_syntax_equivalent.1 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_5.1 +*/ +void equivalent(double A[100]) +{ + #pragma omp unroll + for (int i1 = 0; i1 < 25; ++i1) + for (int i2 = 0; i2 < 4; ++i2) { + int i = i1 * 4 + i2; + A[i] = A[i] + 1; + } +} diff --git a/loop_transformations/sources/apply_syntax_equivalent.1.f90 b/loop_transformations/sources/apply_syntax_equivalent.1.f90 new file mode 
100644 index 0000000..8b86b9c --- /dev/null +++ b/loop_transformations/sources/apply_syntax_equivalent.1.f90 @@ -0,0 +1,18 @@ +! @@name: apply_syntax_equivalent.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_5.1 +subroutine equivalent(A) + implicit none + double precision :: A(0:99) + integer :: i1,i2, i + + !$omp unroll + do i1=0,24 + do i2=0, 3 + i = i1 * 4 + i2 + A(i) = A(i) + 1 + enddo; enddo + +end subroutine diff --git a/loop_transformations/sources/apply_syntax_equivalent.2.c b/loop_transformations/sources/apply_syntax_equivalent.2.c new file mode 100644 index 0000000..fe6a1a6 --- /dev/null +++ b/loop_transformations/sources/apply_syntax_equivalent.2.c @@ -0,0 +1,25 @@ +/* +* @@name: apply_syntax_equivalent.2 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +void equivalent(double A[100][100][3]) +{ + #pragma omp interchange + for (int i1 = 0; i1 < 25; ++i1) + #pragma omp nothing + for (int j1 = 0; j1 < 25; ++j1) + + #pragma omp nothing + for (int i2 = 0; i2 < 4; ++i2) + #pragma omp interchange + for (int j2 = 0; j2 < 4; ++j2) + + for (int k = 0; k < 3; ++k) { + int i = i1 * 4 + i2; + int j = j1 * 4 + j2; + A[i][j][k] = A[i][j][k] + 1; + } +} diff --git a/loop_transformations/sources/apply_syntax_equivalent.2.f90 b/loop_transformations/sources/apply_syntax_equivalent.2.f90 new file mode 100644 index 0000000..39372f1 --- /dev/null +++ b/loop_transformations/sources/apply_syntax_equivalent.2.f90 @@ -0,0 +1,29 @@ +! @@name: apply_syntax_equivalent.2 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_6.0 +subroutine equivalent(A) + implicit none + double precision :: A(0:2, 0:99, 0:99) + integer :: k, j1,j2, i1,i2 + + !$omp interchange !! grid modifier + do i1 = 0, 24 + !$omp nothing !! grid modifier + do j1 = 0, 24 + + !$omp nothing !! intratile modifier + do i2 = 0, 3 + !$omp interchange !! 
intratile modifier + do j2 = 0, 3 + + do k = 0, 2 + i = i1 * 4 + i2 + j = j1 * 4 + j2 + A(k,j,i) = A(k,j,i) + 1 + enddo + + enddo; enddo + enddo; enddo +end subroutine diff --git a/loop_transformations/sources/apply_syntax_equivalent.3.c b/loop_transformations/sources/apply_syntax_equivalent.3.c new file mode 100644 index 0000000..5a1b8ae --- /dev/null +++ b/loop_transformations/sources/apply_syntax_equivalent.3.c @@ -0,0 +1,27 @@ +/* +* @@name: apply_syntax_equivalent.3 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +void apply_complexarg_equivalent1(double A[100*100]) +{ + #pragma omp tile sizes(4,5) \ + apply(grid: reverse,nothing) \ + apply(intratile: nothing,unroll) + for (int i = 0; i < 100; ++i) + for (int j = 0; j < 100; ++j) + A[i*100+j] += 1; +} + +void apply_complexarg_equivalent2(double A[100*100]) +{ + #pragma omp reverse + for (int i1 = 0; i1 < 100; i1+=4) // grid loop 1 + for (int j1 = 0; j1 < 100; j1+=5) // grid loop 2 + for (int i = i1; i < i1+4; i+=1) // tile loop 1 + #pragma omp unroll + for (int j = j1; j < j1+5; j+=1) // tile loop 2 + A[i*100+j] += 1; +} diff --git a/loop_transformations/sources/apply_syntax_equivalent.3.f90 b/loop_transformations/sources/apply_syntax_equivalent.3.f90 new file mode 100644 index 0000000..277b152 --- /dev/null +++ b/loop_transformations/sources/apply_syntax_equivalent.3.f90 @@ -0,0 +1,37 @@ +! @@name: apply_syntax_equivalent.3 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_6.0 +subroutine apply_complexarg_equivalent1(A) + implicit none + double precision :: A(100,100) + integer :: i, j + + !$omp tile sizes(4,5) & + !$omp& apply(grid: reverse,nothing) & + !$omp& apply(intratile: nothing,unroll) + do i = 1, 100 + do j = 1, 100 + A(j,i) = A(j,i) + 1 + end do + end do +end subroutine + +subroutine apply_complexarg_equivalent2(A) + implicit none + double precision :: A(100,100) + integer :: i, j, i1, j1 + + !$omp reverse + do i1 = 1, 100, 4 ! 
grid loop 1 + do j1 = 1, 100, 5 ! grid loop 2 + do i = i1, i1+3 ! tile loop 1 + !$omp unroll + do j = j1, j1+4 ! tile loop 2 + A(j,i) = A(j,i) + 1 + end do + end do + end do + end do +end subroutine diff --git a/memory_model/fort_race.tex b/memory_model/fort_race.tex index 97e914e..74f9b38 100644 --- a/memory_model/fort_race.tex +++ b/memory_model/fort_race.tex @@ -1,6 +1,6 @@ -%\pagebreak +\pagebreak +\begin{fortranspecific}[4ex] \section{Race Conditions Caused by Implied Copies of Shared Variables in Fortran} -\fortranspecificstart \label{sec:fort_race} \index{shared variables!race conditions} @@ -12,6 +12,6 @@ the call and copy from the temporary location into the original variable when th subroutine returns. This copying would cause races in the \kcode{parallel} region. \ffreenexample{fort_race}{1} -\fortranspecificend +\end{fortranspecific} diff --git a/openmp-examples.tex b/openmp-examples.tex index c1ece0c..088e907 100644 --- a/openmp-examples.tex +++ b/openmp-examples.tex @@ -50,7 +50,7 @@ \input{generated-include} % Text to appear in the footer on even-numbered pages: -\newcommand{\footerText}{OpenMP Examples Version \VER{} - \VERDATE} +\newcommand{\footerText}{OpenMP \langselect Examples Version \VER{} -- \VERDATE} % Unified style sheet for OpenMP documents: \input{openmp.sty} diff --git a/openmp.sty b/openmp.sty index 0f7ac76..c47e588 100644 --- a/openmp.sty +++ b/openmp.sty @@ -52,20 +52,20 @@ % % \specref{} % formats the cross-reference "Section X on page Y" % -% \notestart % black horizontal rule for Notes -% \noteend +% \begin{note} % black horizontal rule for Notes +% \end{note} % -% \cspecificstart % blue horizontal rule for C-specific text -% \cspecificend +% \begin{cspecific} % blue horizontal rule for C-specific text +% \end{cspecific} % -% \cppspecificstart % blue horizontal rule for C++ -specific text -% \cppspecificend +% \begin{cppspecific} % blue horizontal rule for C++ -specific text +% \end{cppspecific} % -% \ccppspecificstart % blue 
horizontal rule for C / C++ -specific text -% \ccppspecificend +% \begin{ccppspecific} % blue horizontal rule for C/C++ -specific text +% \end{ccppspecific} % -% \fortranspecificstart % blue horizontal rule for Fortran-specific text -% \fortranspecificend +% \begin{fortranspecific} % blue horizontal rule for Fortran-specific text +% \end{fortranspecific} % % \glossaryterm % for use in formatting glossary entries % \glossarydefstart @@ -302,7 +302,7 @@ % Enable \alltt{} for formatting blocks of code: \usepackage{alltt} -\usepackage{toolbox} % for \toolboxMakeSplit +\usepackage{toolbox} % for \toolboxReplace % This sets the default \code{} font to tt (monospace) and bold: \newcommand\code[1]{\texttt{\textbf{#1}}} @@ -315,20 +315,16 @@ % This is an updated set of macros for code style work % kcode - keywords, vcode - value, bcode - base language, % pvar - variables, pout - program outputs -\toolboxMakeSplit*{ }{DoSplitS}\toolboxMakeSplit*{_}{DoSplitU} -\protected\def\DoReplaceU#1{\DoSplitU{#1}\leftutext\rightutext - \leftutext% - \ifthenelse{\isundefined{\rightutext}}{}% - {\_\expandafter\DoReplaceU\expandafter{\rightutext}}} -\protected\def\DoReplaceS#1{\DoSplitS{#1}\leftstext\rightstext - \expandafter\DoReplaceU\expandafter{\leftstext}% - \ifthenelse{\isundefined{\rightstext}}{}% - {\textrm{~}\expandafter\DoReplaceS\expandafter{\rightstext}}} -\newcommand{\myreplacedmt}[1]{\protect\DoReplaceS{#1}} -\newcommand\kcode[1]{\texttt{\bfseries\upshape\myreplacedmt{#1}}} -\newcommand\bcode[1]{\texttt{\mdseries\upshape\myreplacedmt{#1}}} -\newcommand\vcode[1]{\bcode{#1}} -\newcommand\ucode[1]{\texttt{\mdseries\slshape\myreplacedmt{#1}}} +\protected\def\DoReplaceU#1{\def\utexttmp{#1}% + \toolboxReplace{_}{\_}\utexttmp\utexttmp} +\protected\def\myreplacedmt#1#2{\def\stexttmp{#1}% + \toolboxReplace{_}{\_}\stexttmp% + \toolboxReplace{ }{\rmfamily{ }\ttfamily#2}\stexttmp% + {\ttfamily#2\stexttmp}} +\newcommand\kcode[1]{\myreplacedmt{#1}{\bfseries\upshape}} 
+\newcommand\vcode[1]{\myreplacedmt{#1}{\mdseries\upshape}} +\newcommand\bcode[1]{\kcode{#1}} +\newcommand\ucode[1]{\myreplacedmt{#1}{\mdseries\slshape}} \newcommand\pvar[1]{\ucode{#1}} \newcommand\pout[1]{\vcode{#1}} \newcommand\docref[1]{\textrm{\mdseries\itshape{#1}}} @@ -340,14 +336,24 @@ \newcommand\examplesblob[1]{\href{\examplesrepo/blob/#1}{#1}} % Environment for a paragraph of literal code, single-spaced, no outline, no indenting: -\newenvironment{codepar}[1] -{\begin{alltt}\bfseries #1} -{\end{alltt}} +\usepackage{listings} +\lstnewenvironment{codepar}{% + }{} +%\newenvironment{codepar}[1] +%{\begin{alltt}\bfseries #1} +%{\end{alltt}} % For blocks of code inside a box frame: -\newenvironment{boxedcode}[1] -{\vspace{0.25em plus 5em minus 0.25em}\begin{framed}\begin{minipage}[t]{\textwidth}\begin{alltt}\bfseries #1} -{\end{alltt}\end{minipage}\end{framed}\vspace{0.25em plus 5em minus 0.25em}} +\lstnewenvironment{boxedcode}{% + \lstset{framesep=1.2ex,frame=l,framerule=3pt, + backgroundcolor=\color{white!90!black}}}{} +\lstnewenvironment{boxeducode}{% + \lstset{framesep=1.2ex,frame=l,framerule=3pt, + basicstyle=\ttfamily\mdseries\slshape, + backgroundcolor=\color{white!90!black}}}{} +%\newenvironment{boxedcode}[1] +%{\vspace{0.25em plus 5em minus %0.25em}\begin{framed}\begin{minipage}[t]{\textwidth}\begin{alltt}\bfseries #1} +%{\end{alltt}\end{minipage}\end{framed}\vspace{0.25em plus 5em minus 0.25em}} % This sets the margins in the framed box: \setlength{\FrameSep}{0.6em} @@ -355,9 +361,39 @@ % For indented lists of verbatim code at a relaxed line spacing, % e.g., for use after "where clause is one of the following:" \usepackage{setspace} -\newenvironment{indentedcodelist}{% - \begin{adjustwidth}{0.25in}{}\begin{spacing}{1.5}\begin{alltt}\bfseries} - {\end{alltt}\end{spacing}\vspace{-0.25\baselineskip}\end{adjustwidth}} +\lstnewenvironment{indentedcodelist}{% + \lstset{xleftmargin=0.25in}}{} +%\newenvironment{indentedcodelist}{% 
+%\begin{adjustwidth}{0.25in}{}\vspace{-0.2\baselineskip}\begin{spacing}{1.2}\beg%in{alltt}\bfseries} +% {\end{alltt}\end{spacing}\vspace{-0.2\baselineskip}\end{adjustwidth}} + +\lstdefinestyle{openmp}{ + showstringspaces=false, + basicstyle=\ttfamily\bfseries, + linewidth=.99\linewidth, + xleftmargin=0.01\linewidth, + columns=fullflexible, + keepspaces=true, + escapechar=@, + belowskip=\smallskipamount, + aboveskip=\smallskipamount, + morecomment=[l][\color{red}\sout]{\%DIF\ <}, % deleted empty lines + morecomment=[l][\color{blue}\uwave]{\%DIF\ >}, % added empty lines + moredelim=[il][\color{red}\sout]{\%DIF\ <\ }, % deleted lines + moredelim=[il][\color{blue}\uwave]{\%DIF\ >\ }, % added lines + moredelim=**[is][\rmfamily\mdseries\itshape]{\\plc\{}{\}}, + moredelim=**[is][\textsubscript]{\\textsubscript\{}{\}}, + moredelim=**[is][]{\\textnormal\{}{\}}, + moredelim=**[is][\rmfamily\mdseries\itshape]{\\textsl\{}{\}}, + moredelim=**[is][\ttfamily\mdseries\slshape]{\\ucode\{}{\}}, + moredelim=**[is][\ttfamily\bfseries\upshape]{\\kcode\{}{\}}, + moredelim=**[is][]{\\code\{}{\}}, + moredelim=**[is][]{\\scode\{}{\}}, + moredelim=*[is][\color{red}\sout]{*!----}{----!*}, + moredelim=*[is][\color{blue}\uwave]{*!++++}{++++!*}, + moredelim=**[is][\mdseries\rmfamily]{\\text\{}{\}}, +} +\lstset{style=openmp} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -413,31 +449,61 @@ \newcommand{\VSPb}{\vspace{0.5ex plus 5ex minus 0.25ex}} \newcommand{\VSPa}{\vspace{0.25ex plus 5ex minus 0.25ex}} +% remove language marker definition if either ccpp or fortran is undefined +\ifthenelse{\boolean{ccpp}\and\boolean{fortran}}{}% +{\renewcommand{\linewitharrows}[4]{\par}} +\newcommand{\langselect}{} +\ifccpp\else\renewcommand{\langselect}{Fortran~}\fi +\iffortran\else\renewcommand{\langselect}{C/C++~}\fi + % C +\ifccpp +\newenvironment{cspecific}[1][0ex]{\vspace{#1}\cspecificstart\vspace{-#1}}{\cspecificend} +\else +\excludecomment{cspecific} 
+\fi \newcommand{\cspecificstart}{\needspace{\sbns}\linewitharrows{-1}{solid}{C}{3em}} \newcommand{\cspecificend}{\linewitharrows{1}{solid}{C}{3em}\bigskip} % C/C++ +\ifccpp +\newenvironment{ccppspecific}[1][0ex]{\vspace{#1}\ccppspecificstart\vspace{-#1}}{\ccppspecificend} +\else +\excludecomment{ccppspecific} +\fi \newcommand{\ccppspecificstart}{\VSPb\linewitharrows{-1}{solid}{C / C++}{6em}\VSPa} \newcommand{\ccppspecificend}{\VSPb\linewitharrows{1}{solid}{C / C++}{6em}\VSPa} % C++ +\ifccpp +\newenvironment{cppspecific}[1][0ex]{\vspace{#1}\cppspecificstart\vspace{-#1}}{\cppspecificend} +\else +\excludecomment{cppspecific} +\fi \newcommand{\cppspecificstart}{\needspace{\sbns}\linewitharrows{-1}{solid}{C++}{6em}} \newcommand{\cppspecificend}{\linewitharrows{1}{solid}{C++}{6em}\bigskip} % C90 +\newenvironment{cNinetyspecific}{\cNinetyspecificstart}{\cNinetyspecificend} \newcommand{\cNinetyspecificstart}{\needspace{\sbns}\linewitharrows{-1}{solid}{C90}{4em}} \newcommand{\cNinetyspecificend}{\linewitharrows{1}{solid}{C90}{4em}\bigskip} % C99 +\newenvironment{cNinetyNinespecific}{\cNinetyNinespecificstart}{\cNinetyNinespecificend} \newcommand{\cNinetyNinespecificstart}{\needspace{\sbns}\linewitharrows{-1}{solid}{C99}{4em}} \newcommand{\cNinetyNinespecificend}{\linewitharrows{1}{solid}{C99}{4em}\bigskip} % Fortran +\iffortran +\newenvironment{fortranspecific}[1][0ex]{\vspace{#1}\fortranspecificstart\vspace{-#1}}{\fortranspecificend} +\else +\excludecomment{fortranspecific} +\fi \newcommand{\fortranspecificstart}{\VSPb\linewitharrows{-1}{solid}{Fortran}{6em}\VSPa} \newcommand{\fortranspecificend}{\VSPb\linewitharrows{1}{solid}{Fortran}{6em}\VSPa} % Note +\newenvironment{note}{\notestart}{\noteend} \newcommand{\notestart}{\VSPb\notelinewitharrows{-1}{solid}\VSPa} \newcommand{\noteend}{\VSPb\notelinewitharrows{1}{solid}\VSPa} @@ -486,7 +552,8 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Formats a cross reference label as 
"Section X on page Y". -\newcommand{\specref}[1]{Section~\ref{#1} on page~\pageref{#1}} +\newcommand{\nspecref}[2]{#1~\ref{#2} on page~\pageref{#2}} +\newcommand{\specref}[1]{\nspecref{Section}{#1}} % For caption for supertabular and figure, by yanyh15 \captionsetup[table]{labelfont={sf,sc,bf},textfont=normalfont,singlelinecheck=off,labelformat=simple,labelsep=colon,aboveskip=00pt,belowskip=10pt} @@ -501,7 +568,6 @@ % \cppexample formats blue markers, caption, and code for C++ examples % \fexample formats blue markers, caption, and code for Fortran (fixed) examples % \ffreeexample formats blue markers, caption, and code for Fortran90 (free) examples -% Thanks to Jin, Haoqiang H. for the original definitions of the following: \usepackage{color,fancyvrb} % for \VerbatimInput \usepackage{xargs} % for optional args @@ -542,7 +608,7 @@ \def\fcnt{\the\cnt} % \def\fcnt{\stagcnt} \noindent - \hypertarget{ex:\cname}{\textit{Example \ename}}\vername + \underline{\hypertarget{ex:\cname}{\textit{Example \ename}}\vername} %\vspace*{-3mm} \code{\VerbatimInput[numbers=left,numbersep=5ex,firstnumber=1,firstline=\fcnt,fontsize=\small]% {\chapdirname/sources/\cname}} @@ -569,27 +635,37 @@ } \newcommandx*\cexample[4][1=,4=0]{% -\needspace{5\baselineskip}\ccppspecificstart +\ifccpp +\needspace{5\baselineskip}\begin{ccppspecific} \cnexample[#1]{#2}{#3}[#4] -\ccppspecificend +\end{ccppspecific} +\fi } \newcommandx*\cppexample[4][1=,4=0]{% -\needspace{5\baselineskip}\cppspecificstart +\ifccpp +\needspace{5\baselineskip}\begin{cppspecific} \cppnexample[#1]{#2}{#3}[#4] -\cppspecificend +\end{cppspecific} +\fi } \newcommandx*\fexample[4][1=,4=0]{% -\needspace{5\baselineskip}\fortranspecificstart +\iffortran +\needspace{5\baselineskip} +\begin{fortranspecific} \fnexample[#1]{#2}{#3}[#4] -\fortranspecificend +\end{fortranspecific} +\fi } \newcommandx*\ffreeexample[4][1=,4=0]{% -\needspace{5\baselineskip}\fortranspecificstart +\iffortran +\needspace{5\baselineskip} +\begin{fortranspecific} 
\ffreenexample[#1]{#2}{#3}[#4] -\fortranspecificend +\end{fortranspecific} +\fi } \newcommandx*\hexentry[4][1=c,3=]{% diff --git a/parallel_execution/fort_do.tex b/parallel_execution/fort_do.tex index 9ef9bab..57a7a92 100644 --- a/parallel_execution/fort_do.tex +++ b/parallel_execution/fort_do.tex @@ -1,9 +1,9 @@ \pagebreak +\begin{fortranspecific}[4ex] \section{Fortran Restrictions on the \kcode{do} Construct} \label{sec:fort_do} \index{constructs!do@\kcode{do}} \index{do construct@\kcode{do} construct} -\fortranspecificstart If an \kcode{end do} directive follows a \plc{do-construct} in which several \bcode{DO} statements share a \bcode{DO} termination statement, then a \kcode{do} @@ -17,6 +17,6 @@ The following example is non-conforming because the matching \kcode{do} directiv for the \kcode{end do} does not precede the outermost loop: \fnexample{fort_do}{2} -\fortranspecificend +\end{fortranspecific} diff --git a/parallel_execution/loop.tex b/parallel_execution/loop.tex index d8940bc..6e865d6 100644 --- a/parallel_execution/loop.tex +++ b/parallel_execution/loop.tex @@ -21,10 +21,8 @@ from where the function \ucode{foo} is called. Binding to \kcode{teams} allows t parallelism to be available for the second \kcode{loop} construct. The loop iterations can be executed concurrently, thus allowing implementations to perform various loop nest optimizations including -reordering of the \ucode{i} and \ucode{j} loops. The \kcode{loop} construct can be -implemented using any parallelism-generating mechanism, which allows better use -of hardware resources while also allowing sequential optimizations, reordering, -tiling etc. +reordering of the \ucode{i} and \ucode{j} loops. The \kcode{loop} construct can be implemented +with the use of additional threads or some other concurrency mechanism, which allows better use of hardware resources while also allowing sequential optimizations, reordering, tiling etc. 
For example, the first \kcode{loop} construct could be implemented as if it was specified as \kcode{distribute parallel for} and the second \kcode{loop} construct as if it was specified as diff --git a/parallel_execution/pra_iterator.tex b/parallel_execution/pra_iterator.tex index fb215a6..63de292 100644 --- a/parallel_execution/pra_iterator.tex +++ b/parallel_execution/pra_iterator.tex @@ -1,12 +1,12 @@ %\pagebreak +\begin{cppspecific}[4ex] \section{Parallel Random Access Iterator Loop} -\cppspecificstart \label{sec:pra_iterator} \index{random access iterator, C++} The following example shows a parallel random access iterator loop. \cppnexample[3.0]{pra_iterator}{1} -\cppspecificend +\end{cppspecific} diff --git a/parallel_execution/workshare.tex b/parallel_execution/workshare.tex index 45ae15d..241c5d8 100644 --- a/parallel_execution/workshare.tex +++ b/parallel_execution/workshare.tex @@ -1,6 +1,6 @@ %\pagebreak +\begin{fortranspecific}[4ex] \section{\kcode{workshare} Construct} -\fortranspecificstart \label{sec:workshare} \index{constructs!workshare@\kcode{workshare}} \index{workshare construct@\kcode{workshare} construct} @@ -68,6 +68,6 @@ fragment regardless of whether the code is executed sequentially or inside an Op program with multiple threads: \fnexample{workshare}{7} -\fortranspecificend +\end{fortranspecific} diff --git a/program_control/assumption.tex b/program_control/assumption.tex index f535ab8..f585761 100644 --- a/program_control/assumption.tex +++ b/program_control/assumption.tex @@ -18,7 +18,9 @@ Assumption directives provide additional information about the expected properties of the program that may be used by an implementation for optimization. -Ignoring this information should not alter the behavior of the program. The C/C++ example +Ignoring this information should not alter the behavior of the program. + +The C/C++ example shows the use of delimited scope (Case 1) and block-associated (Case 2) assumption directives. 
A similar effect is shown for Fortran where the \kcode{assumes} directive is used in the module (Case 1) and the block-associated directive uses an \kcode{end assume} termination (Case 2). @@ -32,3 +34,19 @@ could eliminate additional checks. \cexample[5.1]{assumption}{1} \ffreeexample[5.1]{assumption}{1} + +\pagebreak +In the following example the \kcode{no_openmp} and \kcode{no_parallelism} assumption clauses are used. +The \kcode{no_openmp} clause is shorthand for the \kcode{no_openmp_constructs} and \kcode{no_openmp_routines} clauses. + +In Case 1 the \kcode{assume} directive with the \kcode{no_openmp} clause is applied to an external function call \ucode{init}. +Independent of the compiler's ability to derive necessary information about \ucode{init}, the \kcode{assume} directive guarantees +the absence of OpenMP constructs or OpenMP runtime calls so that the compiler may manage hardware and the runtime in a more optimal manner. + +In Case 2, the \kcode{assume} directive with \kcode{no_parallelism} is nested inside the \kcode{target teams loop} directive. By providing the information +that no other OpenMP parallelism generating constructs are going to be encountered in the function, +the implementation of \ucode{element_transform} may have an opportunity to optimize the code in the \kcode{loop} construct, +which may now be implemented using all additional threads available or via some other concurrency mechanism. + +\cexample[6.0]{assumption}{2} +\ffreeexample[6.0]{assumption}{2} diff --git a/program_control/cancellation.tex b/program_control/cancellation.tex index d999344..decd3dc 100644 --- a/program_control/cancellation.tex +++ b/program_control/cancellation.tex @@ -5,10 +5,16 @@ \index{constructs!cancel@\kcode{cancel}} \index{cancel construct@\kcode{cancel} construct} +The examples in this section show how the \kcode{cancel} directive can be used to terminate +an OpenMP region.
Cancellation of the binding region is activated only if the \plc{cancel-var} ICV +is true, in which case the \kcode{cancel} construct (except \kcode{taskgroup}) causes the encountering +\kcode{task} to continue execution at the end of the binding region. If the \plc{cancel-var} ICV is false, the +\kcode{cancel} construct is ignored. + \index{cancellation!for parallel region@for \kcode{parallel} region} \index{cancellation!for worksharing region} -The following example shows how the \kcode{cancel} directive can be used to terminate -an OpenMP region. Although the \kcode{cancel} construct terminates the OpenMP + +In the following example, although the \kcode{cancel} construct terminates the OpenMP worksharing region, programmers must still track the exception through the pointer \ucode{ex} and issue a cancellation for the \kcode{parallel} region if an exception has been raised. The primary thread checks the exception pointer to make sure that the @@ -45,7 +51,11 @@ levels of the tree. The following is the equivalent parallel search example in Fortran. +The code uses the \kcode{atomic write} directive for atomically +updating pointer variables -- a feature defined in OpenMP 6.0. +For earlier versions of OpenMP, the \kcode{critical} directive could +be used instead. -\ffreeexample[5.1]{cancellation}{2} +\ffreeexample[6.0]{cancellation}{2} diff --git a/program_control/cond_comp.tex b/program_control/cond_comp.tex index 79e8290..e5f3369 100644 --- a/program_control/cond_comp.tex +++ b/program_control/cond_comp.tex @@ -4,20 +4,20 @@ \index{conditional compilation!_OPENMP macro@\kcode{_OPENMP} macro} \index{conditional compilation!sentinel} -\ccppspecificstart +\begin{ccppspecific} The following example illustrates the use of conditional compilation using the OpenMP macro \kcode{_OPENMP}. With OpenMP compilation, the \kcode{_OPENMP} macro becomes defined.
\cnexample{cond_comp}{1} -\ccppspecificend +\end{ccppspecific} -\fortranspecificstart +\begin{fortranspecific} The following example illustrates the use of the conditional compilation sentinel. With OpenMP compilation, the conditional compilation sentinel \scode{!$} is recognized and treated as two spaces. In fixed form source, statements guarded by the sentinel must start after column 6. \fnexample{cond_comp}{1} -\fortranspecificend +\end{fortranspecific} diff --git a/program_control/dispatch.tex b/program_control/dispatch.tex new file mode 100644 index 0000000..3006d16 --- /dev/null +++ b/program_control/dispatch.tex @@ -0,0 +1,60 @@ +\pagebreak +\section{\kcode{dispatch} Construct} +\label{sec:dispatch} + +\index{constructs!dispatch@\kcode{dispatch}} +\index{dispatch construct@\kcode{dispatch} construct} + +\index{dispatch construct@\kcode{dispatch} construct!novariants clause@\kcode{novariants} clause} +\index{clauses!novariants@\kcode{novariants}} +\index{novariants clause@\kcode{novariants} clause} + +\index{dispatch construct@\kcode{dispatch} construct!nocontext clause@\kcode{nocontext} clause} +\index{clauses!nocontext@\kcode{nocontext}} +\index{nocontext clause@\kcode{nocontext} clause} + +The \kcode{dispatch} directive can be applied to a statement that performs a +procedure call to control variant substitution for the called procedure. + +In the example below, the \ucode{foo_variant1()} and \ucode{foo_variant2()} +procedures are declared as variants for \ucode{foo()} using the +\kcode{declare variant} directive, with matching requirements specified +by the \kcode{match} clause's context selector. To be selected for +substitution, both variants require that the condition \ucode{foo_sub} evaluates +to \plc{true}. + +In Cases 1 and 2, the calls to \ucode{foo()} are not controlled by a +\kcode{dispatch} construct. Hence, there can be no match for the +\ucode{foo_variant2()} variant.
A \ucode{foo_variant1()} call is substituted for +the call to \ucode{foo()} in Case 1, as the matching requirement +is satisfied by \ucode{foo_sub} being \plc{true}. In Case 2, there is +no variant substitution as \ucode{foo_sub} is \plc{false}. + +Cases 3 through 6 illustrate some uses of the \kcode{dispatch} construct, +including uses of the \kcode{novariants} and \kcode{nocontext} clauses on the +directive. + +In Case 3, variant substitution does not occur as \ucode{foo_sub} is \plc{false}. + +In Case 4, \ucode{foo_sub} is \plc{true} and the \kcode{dispatch} construct is +part of the OpenMP context; therefore, the matching requirements for both +variants to \ucode{foo()} are satisfied. As the matching requirements for the +\ucode{foo_variant1()} variant are a subset of the matching requirements for the +\ucode{foo_variant2()} variant (per the OpenMP specification, its computed score +for matching purposes is smaller), \ucode{foo_variant2()} is selected for +variant substitution. (Note that prior to OpenMP 6.0, which of the two variants +is selected for substitution is implementation defined since the earlier +specifications did not require an implementation to treat the \kcode{dispatch} +construct as part of the OpenMP context at the call site.) + +In Case 5, the \kcode{novariants} clause disables variant +substitution for the call to \ucode{foo()}, despite the matching requirements +being satisfied for both variants. + +In Case 6, the \kcode{nocontext} clause directs the implementation to not +include the \kcode{dispatch} construct in the OpenMP context at the call site +for \ucode{foo()}. Hence, the \ucode{foo_variant2()} variant is not considered +and \ucode{foo_variant1()} is instead selected for variant substitution.
+ + \cexample[6.0]{dispatch}{1} + \ffreeexample[6.0]{dispatch}{1} diff --git a/program_control/icv.tex b/program_control/icv.tex index 7c6b4f5..c82ee8a 100644 --- a/program_control/icv.tex +++ b/program_control/icv.tex @@ -13,8 +13,7 @@ whole program. In the following example, the \plc{nest-var}, \plc{max-active-levels-var}, \plc{dyn-var}, and \plc{nthreads-var} ICVs are modified through calls to -the runtime library routines \kcode{omp_set_nested},\\ \kcode{omp_set_max_active_levels}, \kcode{omp_set_dynamic}, -and \kcode{omp_set_num_threads} respectively. These ICVs +the runtime library routines \kcode{omp_set_nested}, \kcode{omp_set_max_active_levels}, \kcode{omp_set_dynamic}, and \kcode{omp_set_num_threads} respectively. These ICVs affect the operation of \kcode{parallel} regions. Each implicit task generated by a \kcode{parallel} region has its own copy of the \plc{nest-var}, \plc{dyn-var}, and \plc{nthreads-var} ICVs. @@ -40,7 +39,7 @@ by 3, there will be a total of 6 implicit tasks generated by the two inner \kcod regions. Each implicit task generated by an inner \kcode{parallel} region will execute -the call to\\ \kcode{omp_set_num_threads(\ucode{4})}, assigning the value 4 to its respective +the call to \kcode{omp_set_num_threads(\ucode{4})}, assigning the value 4 to its respective copy of \plc{nthreads-var}. The print statement in the outer \kcode{parallel} region is executed by only one @@ -56,3 +55,53 @@ region. \fexample{icv}{1} +\pagebreak +\subsection{\kcode{num_threads} Clause with a List} +\label{subsec:icv_nthreads} +\index{clauses!num_threads@\kcode{num_threads}} +\index{num_threads clause@\kcode{num_threads} clause} + +Prior to OpenMP 6.0, only a single argument can be specified in the +\kcode{num_threads} clause of a \kcode{parallel} construct. 
+In this case, the clause argument is used as the requested team size for +that \kcode{parallel} region only and does not affect the value of the +\plc{nthreads-var} ICV in any generated implicit tasks for nested +\kcode{parallel} regions. +That value is instead inherited from the value of the \plc{nthreads-var} +ICV in the task that encountered the \kcode{parallel} construct, +stripping away the first integer, if the value of that ICV is a list of +multiple integers. + +In OpenMP 6.0, the \kcode{num_threads} clause permits more than one argument. +In this case, the first argument is still used as the requested team size for +the \kcode{parallel} region. The difference is that the \plc{nthreads-var} ICVs of +the generated implicit tasks are set to the list of values given by the +remaining clause arguments, rather than inheriting the value of the +encountering task's \plc{nthreads-var} ICV. Consequently, a +\kcode{num_threads} clause with an argument list may be used to control not +only the team size for a given \kcode{parallel} region, but also the +requested team size of any nested \kcode{parallel} regions. + +The following example illustrates the effect of the \kcode{num_threads} clause +for nested \kcode{parallel} regions. The program starts with the environment +variable \kcode{OMP_NUM_THREADS} set to \ucode{"4,5,6"}, which initializes the +\plc{nthreads-var} ICV of the initial task to the list \{\vcode{4,5,6}\}. Case 1 shows +how this ICV is used to control the requested team size for a nest of three +\kcode{parallel} regions. As indicated by the comments, with each +successive nesting level the \plc{nthreads-var} ICV inherits all but the first +integer in the \plc{nthreads-var} ICV of the task that encounters the +\kcode{parallel} construct. This pattern continues until the \plc{nthreads-var} +ICV contains only a single integer, at which point that value persists for any +further nesting levels.
In Case 2, a \kcode{num_threads(\ucode{8})} clause appears on +the outermost \kcode{parallel} construct. This only has the effect of altering +the requested team size for that \kcode{parallel} region. Note that the value of +the \plc{nthreads-var} ICVs inside the \kcode{parallel} region are the same as +for Case 1. In Case 3, the \kcode{num_threads} clause is specified with +multiple arguments \kcode{(\ucode{8,2})}. This sets the \plc{nthreads-var} ICV value in each of +the generated implicit tasks to \{\vcode{2}\}, in accordance with the inheritance rules +for the \plc{nthreads-var} ICV described above. + +\cexample[6.0]{icv}{2}[2] + +\ffreeexample[6.0]{icv}{2}[2] + diff --git a/program_control/sources/assumption.1.c b/program_control/sources/assumption.1.c index e75dcda..5e528e1 100644 --- a/program_control/sources/assumption.1.c +++ b/program_control/sources/assumption.1.c @@ -5,7 +5,6 @@ * @@expect: success * @@version: omp_5.1 */ - #include #include @@ -35,7 +34,6 @@ int main() { } // Case 2: Block associated - #pragma omp assume holds (N % 8 == 0 && N > 0) #pragma omp simd for (int i = 0; i < N; ++i){ @@ -44,4 +42,3 @@ int main() { return 0; } - diff --git a/program_control/sources/assumption.1.f90 b/program_control/sources/assumption.1.f90 index 0dfaff7..b06e828 100644 --- a/program_control/sources/assumption.1.f90 +++ b/program_control/sources/assumption.1.f90 @@ -3,7 +3,6 @@ ! @@operation: compile ! @@expect: success ! @@version: omp_5.1 - module m !$omp assumes no_parallelism interface @@ -32,14 +31,12 @@ program main end do !! Case 1: Delimited scope, see module interface - !$omp target teams distribute parallel do map(tofrom: A) do i = 1, N call fun(A,i) end do !! Case 2: Block associated - !$omp assume holds (8*(N/8) == N .and. N>0) !! 
N is multiple of 8 !$omp simd do i = 1, N diff --git a/program_control/sources/assumption.2.c b/program_control/sources/assumption.2.c new file mode 100644 index 0000000..066d479 --- /dev/null +++ b/program_control/sources/assumption.2.c @@ -0,0 +1,34 @@ +/* +* @@name: assumption.2 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +#include +#define N 5 + +void init(int *arr, int len); +int element_transform(int a); + +int main() { + int arr[N], arr_bang[N]; + +//Case 1: Use in sequential code + #pragma omp assume no_openmp + { + init(arr,N); + } + +//Case 2: Use inside openmp construct + #pragma omp target teams loop map(to: arr) map(from: arr_bang) + for(int i = 0; i < N; i++) { + #pragma omp assume no_parallelism + { + arr_bang[i] = element_transform(arr[i]); + } + } + printf("%d, %d\n", arr_bang[0], arr_bang[N-1]); + + return 0; +} \ No newline at end of file diff --git a/program_control/sources/assumption.2.f90 b/program_control/sources/assumption.2.f90 new file mode 100644 index 0000000..78241b9 --- /dev/null +++ b/program_control/sources/assumption.2.f90 @@ -0,0 +1,40 @@ +! @@name: assumption.2 +! @@type: F-free +! @@operation: compile +! @@expect: success +! 
@@version: omp_6.0 + +module mm + interface + subroutine init(arr, n) + integer :: arr(*) + integer :: n + end subroutine + function element_transform(a) result(r) + !$omp declare target + integer :: a, r + end function + end interface +end module + +program main + use mm + integer, parameter :: N=5 + integer :: arr(N), arr_bang(N) + +!!Case 1: Use in sequential code + !$omp assume no_openmp + call init(arr,N) + !$omp end assume + +!!Case 2: Use inside openmp construct + !$omp target teams loop map(to: arr) map(from: arr_bang) + do i=1,N + !$omp assume no_parallelism + arr_bang(i) = element_transform(arr(i)) + !$omp end assume + enddo + + print *, arr_bang(1), arr_bang(N) + +end program main \ No newline at end of file diff --git a/program_control/sources/cancellation.2.c b/program_control/sources/cancellation.2.c index 86a2589..83dbba2 100644 --- a/program_control/sources/cancellation.2.c +++ b/program_control/sources/cancellation.2.c @@ -21,7 +21,7 @@ binary_tree_t *search_tree(binary_tree_t *tree, int value, int level) { else { #pragma omp task shared(found) if(level < 10) { - binary_tree_t *found_left = NULL; + binary_tree_t *found_left; found_left = search_tree(tree->left, value, level + 1); if (found_left) { #pragma omp atomic write @@ -31,7 +31,7 @@ binary_tree_t *search_tree(binary_tree_t *tree, int value, int level) { } #pragma omp task shared(found) if(level < 10) { - binary_tree_t *found_right = NULL; + binary_tree_t *found_right; found_right = search_tree(tree->right, value, level + 1); if (found_right) { #pragma omp atomic write @@ -44,6 +44,7 @@ binary_tree_t *search_tree(binary_tree_t *tree, int value, int level) { } return found; } + binary_tree_t *search_tree_parallel(binary_tree_t *tree, int value) { binary_tree_t *found = NULL; #pragma omp parallel shared(found, tree, value) diff --git a/program_control/sources/cancellation.2.f90 b/program_control/sources/cancellation.2.f90 index be65d1a..eea6e37 100644 --- 
a/program_control/sources/cancellation.2.f90 +++ b/program_control/sources/cancellation.2.f90 @@ -2,7 +2,7 @@ ! @@type: F-free ! @@operation: compile ! @@expect: success -! @@version: omp_5.1 +! @@version: omp_6.0 module parallel_search type binary_tree integer :: value @@ -11,34 +11,34 @@ module parallel_search end type contains - recursive subroutine search_tree(tree, value, level, found) + recursive function search_tree(tree, value, level) result(found) type(binary_tree), intent(in), pointer :: tree integer, intent(in) :: value, level type(binary_tree), pointer :: found - type(binary_tree), pointer :: found_left => NULL(), & - found_right => NULL() + type(binary_tree), pointer :: found_left, found_right + found => NULL() if (associated(tree)) then if (tree%value .eq. value) then found => tree else !$omp task shared(found) if(level<10) - call search_tree(tree%left, value, level+1, found_left) + found_left => search_tree(tree%left, value, level+1) if (associated(found_left)) then -!$omp critical +!$omp atomic write found => found_left -!$omp end critical +!$omp end atomic !$omp cancel taskgroup endif !$omp end task !$omp task shared(found) if(level<10) - call search_tree(tree%right, value, level+1, found_right) + found_right => search_tree(tree%right, value, level+1) if (associated(found_right)) then -!$omp critical +!$omp atomic write found => found_right -!$omp end critical +!$omp end atomic !$omp cancel taskgroup endif @@ -47,7 +47,7 @@ contains !$omp taskwait endif endif - end subroutine + end function subroutine search_tree_parallel(tree, value, found) type(binary_tree), intent(in), pointer :: tree @@ -58,7 +58,7 @@ contains !$omp parallel shared(found, tree, value) !$omp masked !$omp taskgroup - call search_tree(tree, value, 0, found) + found => search_tree(tree, value, 0) !$omp end taskgroup !$omp end masked !$omp end parallel diff --git a/program_control/sources/declare_variant.3.c b/program_control/sources/declare_variant.3.c index 01d5a8c..d004230 100644 
--- a/program_control/sources/declare_variant.3.c +++ b/program_control/sources/declare_variant.3.c @@ -2,7 +2,7 @@ * @@name: declare_variant.3 * @@type: C * @@operation: view -* @@expect: +* @@expect: none * @@version: omp_5.1 */ diff --git a/program_control/sources/dispatch.1.c b/program_control/sources/dispatch.1.c new file mode 100644 index 0000000..9beba35 --- /dev/null +++ b/program_control/sources/dispatch.1.c @@ -0,0 +1,59 @@ +/* +* @@name: dispatch.1 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_6.0 +*/ +#include + +int foo_sub; + +void foo_variant1() +{ printf("in foo_variant1\n"); } + +void foo_variant2() +{ printf("in foo_variant2\n"); } + +#pragma omp declare variant(foo_variant1) \ + match(user={condition(foo_sub)}) +#pragma omp declare variant(foo_variant2) \ + match(construct={dispatch},user={condition(foo_sub)}) +void foo() +{ printf("in foo\n"); } + +int main() +{ + // Case 1 + foo_sub = 1; + foo(); // "in foo_variant1" + + // Case 2 + foo_sub = 0; + foo(); // "in foo" + + // Dispatch Cases + + // Case 3 + foo_sub = 0; + #pragma omp dispatch + foo(); // "in foo" + + // Case 4 + foo_sub = 1; + #pragma omp dispatch + foo(); // "in foo_variant2" + // see discussion for OpenMP 5.1/5.2 + + // Case 5 + foo_sub = 1; + #pragma omp dispatch novariants(1) + foo(); // "in foo" + + // Case 6 + foo_sub = 1; + #pragma omp dispatch nocontext(1) + foo(); // "in foo_variant1" + + return 0; +} diff --git a/program_control/sources/dispatch.1.f90 b/program_control/sources/dispatch.1.f90 new file mode 100644 index 0000000..26b483b --- /dev/null +++ b/program_control/sources/dispatch.1.f90 @@ -0,0 +1,62 @@ +! @@name: dispatch.1 +! @@type: F-free +! @@operation: run +! @@expect: success +! 
@@version: omp_6.0 +module funcs + logical :: foo_sub + +contains + subroutine foo_variant1() + print*, "in foo_variant1" + end subroutine + + subroutine foo_variant2() + print*, "in foo_variant2" + end subroutine + + subroutine foo() + !$omp declare variant(foo_variant1) & + !$omp& match(user={condition(foo_sub)}) + !$omp declare variant(foo_variant2) & + !$omp match(construct={dispatch},user={condition(foo_sub)}) + print*, "in foo" + end subroutine + +end module funcs + +program main + use funcs + + !! Case 1 + foo_sub = .TRUE. + call foo() !! "in foo_variant1" + + !! Case 2 + foo_sub = .FALSE. + call foo() !! "in foo" + + !! Dispatch Cases + + !! Case 3 + foo_sub=.FALSE. + !$omp dispatch + call foo() !! "in foo" + + !! Case 4 + foo_sub = .TRUE. + !$omp dispatch + call foo(); !! "in foo_variant2" + !! see discussion for OpenMP 5.1/5.2 + + !! Case 5 + foo_sub = .TRUE. + !$omp dispatch novariants(.true.) + call foo(); !! "in foo" + + !! Case 6 + foo_sub = .TRUE. + !$omp dispatch nocontext(.true.) + call foo(); !! 
"in foo_variant1" + +end program diff --git a/program_control/sources/icv.2.c b/program_control/sources/icv.2.c new file mode 100644 index 0000000..7b29c9c --- /dev/null +++ b/program_control/sources/icv.2.c @@ -0,0 +1,87 @@ +/* +* @@name: icv.2 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_6.0 +* @@env: OMP_NUM_THREADS="4,5,6" +* @@env: OMP_MAX_ACTIVE_LEVELS=3 +*/ +#include +#include + +void prn_info(int level) +{ + #pragma omp masked + printf("LV%d: nthrs_next=%d\n", + level, omp_get_max_threads()); +} + +// run with OMP_NUM_THREADS="4,5,6" OMP_MAX_ACTIVE_LEVELS=3 +int main (void) +{ + // nthreads-var: 4,5,6 + // max-active-levels-var: 3 + + // Case 1 + #pragma omp parallel // request 4 threads + { + prn_info(1); // LV1: nthrs_next=5 + + // nthreads-var: 5,6 + #pragma omp parallel // request 5 threads + { + prn_info(2); // LV2: nthrs_next=6 + + // nthreads-var: 6 + #pragma omp parallel // request 6 threads + { + prn_info(3); // LV3: nthrs_next=6 + + // nthreads-var: 6 + } + } + } + + // Case 2 + #pragma omp parallel num_threads(8) + { + prn_info(1); // LV1: nthrs_next=5 + + // nthreads-var: 5,6 + #pragma omp parallel // request 5 threads + { + prn_info(2); // LV2: nthrs_next=6 + + // nthreads-var: 6 + #pragma omp parallel // request 6 threads + { + prn_info(3); // LV3: nthrs_next=6 + + // nthreads-var: 6 + } + } + } + + // Case 3 + #pragma omp parallel num_threads(8,2) + { + prn_info(1); // LV1: nthrs_next=2 + + // nthreads-var: 2 + #pragma omp parallel // request 2 threads + { + prn_info(2); // LV2: nthrs_next=2 + + // nthreads-var: 2 + #pragma omp parallel // request 2 threads + { + prn_info(3); // LV3: nthrs_next=2 + + // nthreads-var: 2 + } + } + } + + return 0; +} diff --git a/program_control/sources/icv.2.f90 b/program_control/sources/icv.2.f90 new file mode 100644 index 0000000..6e7eba5 --- /dev/null +++ b/program_control/sources/icv.2.f90 @@ -0,0 +1,77 @@ +! @@name: icv.2 +! @@type: F-free +! @@operation: run +! 
@@expect: success +! @@version: omp_6.0 +! @@env: OMP_NUM_THREADS="4,5,6" +! @@env: OMP_MAX_ACTIVE_LEVELS=3 +subroutine prn_info(level) + use omp_lib, only : omp_get_max_threads + implicit none + integer level + + !$omp masked + print 10, level, omp_get_max_threads() + !$omp end masked + 10 format("LV",i0,": nthrs_next=",i0) +end subroutine + +program main + implicit none + + !! run with OMP_NUM_THREADS="4,5,6" OMP_MAX_ACTIVE_LEVELS=3 + !! nthreads-var: 4,5,6 + !! max-active-levels-var: 3 + + !! Case 1 + !$omp parallel ! request 4 threads + call prn_info(1) ! LV1: nthrs_next=5 + + !! nthreads-var: 5,6 + !$omp parallel ! request 5 threads + call prn_info(2) ! LV2: nthrs_next=6 + + !! nthreads-var: 6 + !$omp parallel ! request 6 threads + call prn_info(3) ! LV3: nthrs_next=6 + + !! nthreads-var: 6 + !$omp end parallel + !$omp end parallel + !$omp end parallel + + !! Case 2 + !$omp parallel num_threads(8) + call prn_info(1) ! LV1: nthrs_next=5 + + !! nthreads-var: 5,6 + !$omp parallel ! request 5 threads + call prn_info(2) ! LV2: nthrs_next=6 + + !! nthreads-var: 6 + !$omp parallel ! request 6 threads + call prn_info(3) ! LV3: nthrs_next=6 + + !! nthreads-var: 6 + !$omp end parallel + !$omp end parallel + !$omp end parallel + + !! Case 3 + !$omp parallel num_threads(8,2) + call prn_info(1) ! LV1: nthrs_next=2 + + !! nthreads-var: 2 + !$omp parallel ! request 2 threads + call prn_info(2) ! LV2: nthrs_next=2 + + !! nthreads-var: 2 + !$omp parallel ! request 2 threads + call prn_info(3) ! LV3: nthrs_next=2 + + !! 
nthreads-var: 2 + !$omp end parallel + !$omp end parallel + !$omp end parallel + +end program diff --git a/program_control/sources/requires.1.f90 b/program_control/sources/requires.1.f90 index a94fb23..b4b6b18 100644 --- a/program_control/sources/requires.1.f90 +++ b/program_control/sources/requires.1.f90 @@ -27,6 +27,7 @@ end program subroutine do_something_with_p(p,q) use data + !$omp declare target type(mypoints) :: p integer :: q diff --git a/program_control/sources/selector_scoring.1.c b/program_control/sources/selector_scoring.1.c index 742b84f..44570a5 100644 --- a/program_control/sources/selector_scoring.1.c +++ b/program_control/sources/selector_scoring.1.c @@ -9,15 +9,6 @@ #include #include -#pragma omp declare variant(fx1) match(construct={target}) -#pragma omp declare variant(fx2) match(construct={teams,parallel,for}) -#pragma omp declare variant(fx3) match(device={kind(gpu),isa(sm_70)}) -#pragma omp declare variant(fx4) match(device={arch(nvptx),isa(sm_70)}) -void f(int *a, int i) -{ - *a = i; -} - void fx1(int *a, int i) { *a = i; @@ -38,6 +29,15 @@ void fx4(int *a, int i) *a = 4*i; } +#pragma omp declare variant(fx1) match(construct={target}) +#pragma omp declare variant(fx2) match(construct={teams,parallel,for}) +#pragma omp declare variant(fx3) match(device={kind(gpu),isa(sm_70)}) +#pragma omp declare variant(fx4) match(device={arch(nvptx),isa(sm_70)}) +void f(int *a, int i) +{ + *a = i; +} + int main() { #define N 4 diff --git a/program_control/sources/selector_scoring.2.c b/program_control/sources/selector_scoring.2.c index 9f5964d..dfc596c 100644 --- a/program_control/sources/selector_scoring.2.c +++ b/program_control/sources/selector_scoring.2.c @@ -13,21 +13,6 @@ const int version = 2; -#pragma declare variant(kernel_target_ua) \ - match(implementation={requires(unified_address)}) -#pragma declare variant(kernel_target_usm) \ - match(implementation={requires(unified_shared_memory)}) -#pragma declare variant(kernel_target_usm_v2) \ - 
match(implementation={requires(unified_shared_memory)}, \ - user={condition(score(1): version==2)}) -void kernel(int *a, int n) -{ - #pragma omp parallel for - for (int i = 0; i < n; i++) { - a[i] = i*i; - } -} - void kernel_target_ua(int *a, int n) { #pragma omp target data map(a[:n]) use_device_ptr(a) @@ -53,6 +38,21 @@ void kernel_target_usm_v2(int *a, int n) } } +#pragma omp declare variant(kernel_target_ua) \ + match(implementation={requires(unified_address)}) +#pragma omp declare variant(kernel_target_usm) \ + match(implementation={requires(unified_shared_memory)}) +#pragma omp declare variant(kernel_target_usm_v2) \ + match(implementation={requires(unified_shared_memory)}, \ + user={condition(score(1): version==2)}) +void kernel(int *a, int n) +{ + #pragma omp parallel for + for (int i = 0; i < n; i++) { + a[i] = i*i; + } +} + int main() { int a[1000]; diff --git a/program_control/sources/selector_scoring.2.f90 b/program_control/sources/selector_scoring.2.f90 index 4239275..fb17fb6 100644 --- a/program_control/sources/selector_scoring.2.f90 +++ b/program_control/sources/selector_scoring.2.f90 @@ -21,7 +21,7 @@ contains !$omp match(implementation={requires(unified_shared_memory)}, & !$omp user={condition(score(1): version==2)}) - integer :: a(*) + integer, target :: a(n) integer, value :: n integer :: i !$omp parallel do @@ -32,25 +32,27 @@ contains subroutine kernel_target_ua(a, n) use iso_c_binding - integer, target :: a(*) + integer, target :: a(n) integer, value :: n type(c_ptr) :: c_ap integer, pointer :: ap(:) integer :: i c_ap = c_loc(a) + ap => null() !$omp target data map(a(:n)) use_device_ptr(c_ap) - !$omp target - call c_f_pointer(c_ap, ap) + !$omp target + call c_f_pointer(c_ap, ap, [n]) !$omp parallel do do i = 1, n ap(i) = 2*i*i end do + ap => null() ! 
reset pointer association status !$omp end target !$omp end target data end subroutine subroutine kernel_target_usm(a, n) - integer :: a(*) + integer, target :: a(n) integer, value :: n integer :: i !$omp target parallel do @@ -60,7 +62,7 @@ contains end subroutine subroutine kernel_target_usm_v2(a, n) - integer :: a(*) + integer, target :: a(n) integer, value :: n integer :: i !$omp target teams loop diff --git a/program_control/sources/standalone.1.f90 b/program_control/sources/standalone.1.f90 index e938f7e..9ea3e90 100644 --- a/program_control/sources/standalone.1.f90 +++ b/program_control/sources/standalone.1.f90 @@ -4,7 +4,6 @@ ! @@expect: ct-error ! @@version: omp_3.1 SUBROUTINE STANDALONE_WRONG() - INTEGER A A = 1 diff --git a/program_control/sources/target_offload_control.1.c b/program_control/sources/target_offload_control.1.c index c581827..c9e9de8 100644 --- a/program_control/sources/target_offload_control.1.c +++ b/program_control/sources/target_offload_control.1.c @@ -6,11 +6,12 @@ * @@version: omp_5.0 * @@env: OMP_TARGET_OFFLOAD=default */ -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include typedef enum offload_policy {MANDATORY, DISABLED, DEFAULT, UNKNOWN, NOTSET} offload_policy_t; diff --git a/program_control/standalone.tex b/program_control/standalone.tex index 65e6f0c..7cc9eec 100644 --- a/program_control/standalone.tex +++ b/program_control/standalone.tex @@ -25,11 +25,11 @@ target. \ffreeexample[3.1]{standalone}{1} +\pagebreak The following version of the above example is conforming because the \kcode{flush}, \kcode{barrier}, \kcode{taskwait}, and \kcode{taskyield} directives are enclosed in a compound statement. 
-\pagebreak \cexample[3.1]{standalone}{2} The following example is conforming because the \kcode{flush}, \kcode{barrier}, diff --git a/synchronization/atomic_cas.tex b/synchronization/atomic_cas.tex index c1315bd..99b07b5 100644 --- a/synchronization/atomic_cas.tex +++ b/synchronization/atomic_cas.tex @@ -1,4 +1,5 @@ %\pagebreak +\begin{ccppspecific}[4ex] \section{Atomic Compare} \label{sec:cas} @@ -21,7 +22,7 @@ The ``greater than'' and ``less than'' forms are not available with the Fortran clause. One can use the \vcode{max} and \vcode{min} functions with the \kcode{atomic update} construct to perform the C/C++ example operations. -\cexample[5.1]{cas}{1} +\cnexample[5.1]{cas}{1} %\ffreeexample[5.1]{cas}{1} In OpenMP 5.1 the \kcode{compare} clause was also added to support \emph{Compare And @@ -38,5 +39,6 @@ Since the equivalence of Fortran pointers can be determined only with a function no Fortran version is provided here. The use of the associated function in an \kcode{atomic compare} syntax is being considered in a future release. -\cexample[5.1]{cas}{2} +\cnexample[5.1]{cas}{2} %\ffreeexample[5.1]{cas}{2} +\end{ccppspecific} diff --git a/synchronization/atomic_restrict.tex b/synchronization/atomic_restrict.tex index 50efed8..a03f9a8 100644 --- a/synchronization/atomic_restrict.tex +++ b/synchronization/atomic_restrict.tex @@ -13,7 +13,7 @@ construct. \cexample[3.1]{atomic_restrict}{2} -\fortranspecificstart +\begin{fortranspecific} The following example is non-conforming because \ucode{I} and \ucode{R} reference the same location but have different types. 
@@ -23,5 +23,5 @@ Although the following example might work on some implementations, this is also non-conforming: \fnexample[3.1]{atomic_restrict}{3} -\fortranspecificend +\end{fortranspecific} diff --git a/synchronization/sources/atomic.4.c b/synchronization/sources/atomic.4.c index 7232040..4dbf40d 100644 --- a/synchronization/sources/atomic.4.c +++ b/synchronization/sources/atomic.4.c @@ -5,6 +5,7 @@ * @@expect: success * @@version: omp_5.0 */ +#include void calc_val(float *val); diff --git a/synchronization/sources/atomic.4.f90 b/synchronization/sources/atomic.4.f90 index 7c4f354..b43dbad 100644 --- a/synchronization/sources/atomic.4.f90 +++ b/synchronization/sources/atomic.4.f90 @@ -3,8 +3,8 @@ ! @@operation: compile ! @@expect: success ! @@version: omp_5.0 - subroutine boxster(box_totals, vals, box, N) + use omp_lib external calc_val real, intent(inout) :: box_totals(:) real, intent(in) :: vals(:) @@ -18,4 +18,4 @@ subroutine boxster(box_totals, vals, box, N) box_totals( box(idx) ) = box_totals( box(idx) ) + vals(idx) enddo -end subroutine \ No newline at end of file +end subroutine diff --git a/tasking/sources/task_dep.14.c b/tasking/sources/task_dep.14.c new file mode 100644 index 0000000..4a89d62 --- /dev/null +++ b/tasking/sources/task_dep.14.c @@ -0,0 +1,55 @@ +/* +* @@name: task_dep.14 +* @@type: C +* @@operation: run +* @@expect: success +* @@version: omp_6.0 +*/ +#include + +void my_func(int *M, int *v); + +#define N_ROWS 20 +#define N_COLS 20 +#define NUM_VS 5 +#define ROWS_PER_TASK 5 +int M[N_ROWS*N_COLS], v[NUM_VS][N_COLS]; + +int main() +{ + for (int i = 0; i < N_ROWS*N_COLS; i++) + M[i] = 1; + + for (int i = 0; i < NUM_VS; i++) + for (int j = 0; j < N_COLS; j++) + v[i][j] = 2; + + #pragma omp parallel single + for (int h = 0; h < NUM_VS; h++) { + // Generate transparent task to establish dependences + // between child tasks that don't share the same parent. 
+ #pragma omp task depend(inout:M[:]) transparent(omp_impex) + my_func(M, v[h]); + } + + int check_value = 1; + for (int i = 0; i < NUM_VS; i++) + check_value *= 2; + for (int i = 0; i < N_ROWS*N_COLS; i++) + if (M[i] != check_value) + return 1; + + return 0; +} + +void my_func(int *M, int *v) +{ + for (int i = 0; i < N_ROWS; i += ROWS_PER_TASK) { + // This task is dependency-ordered with respect to the corresponding + // task in iteration i generated by other transparent tasks. + #pragma omp task depend(inout:M[i*N_COLS]) + for (int j = 0; j < ROWS_PER_TASK; j++) + for (int k = 0; k < N_COLS; k++) + M[(i+j)*N_COLS + k] *= v[k]; + } +} diff --git a/tasking/sources/task_dep.14.f90 b/tasking/sources/task_dep.14.f90 new file mode 100644 index 0000000..0b0793b --- /dev/null +++ b/tasking/sources/task_dep.14.f90 @@ -0,0 +1,49 @@ +! @@name: task_dep.14 +! @@type: F-free +! @@operation: run +! @@expect: success +! @@version: omp_6.0 +program main + use omp_lib + integer, parameter :: N_ROWS = 20 + integer, parameter :: N_COLS = 20 + integer, parameter :: NUM_VS = 5 + integer, parameter :: ROWS_PER_TASK = 5 + integer :: h + integer :: M(0:N_ROWS*N_COLS-1), v(0:N_COLS-1,0:NUM_VS-1) + integer :: check_value + + M(:) = 1 + v(:,:) = 2 + + !$omp parallel single + do h = 0, NUM_VS-1 + ! Generate transparent task to establish dependences + ! between child tasks that don't share the same parent. + !$omp task depend(inout:M) transparent(omp_impex) + call my_func(M, v(:,h)) + !$omp end task + end do + !$omp end parallel single + + check_value = 2**NUM_VS + if (any(M /= check_value)) error stop + +contains + subroutine my_func(M, v) + integer :: M(0:), v(0:) + integer :: i,j,k + + do i = 0, N_ROWS-1, ROWS_PER_TASK + ! This task is dependency-ordered with respect to the corresponding + ! task in iteration i generated by other transparent tasks. 
+ !$omp task depend(inout:M(i*N_COLS)) + do j = 0, ROWS_PER_TASK-1 + do k = 0, N_COLS-1 + M((i+j)*N_COLS+k) = M((i+j)*N_COLS+k) * v(k) + end do + end do + !$omp end task + end do + end subroutine +end program diff --git a/tasking/sources/taskloop_dep.1.c b/tasking/sources/taskloop_dep.1.c new file mode 100644 index 0000000..55872a6 --- /dev/null +++ b/tasking/sources/taskloop_dep.1.c @@ -0,0 +1,35 @@ +/* +* @@name: taskloop_dep.1 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +#include + +void process_work_a(int n, float *A) +{ + // Dependences for taskloop iterations and across taskloops + + // TL1 taskloop + // nogroup removes the implicit taskgroup + #pragma omp taskloop nogroup + for (int i = 1; i < n; i++) + { + #pragma omp task_iteration depend(inout: A[i]) depend(in: A[i-1]) + A[i] += A[i] * A[i-1]; + } + + // TL2 taskloop + grainsize + #pragma omp taskloop grainsize(strict: 4) nogroup + for (int i = 1; i < n; i++) + { + #pragma omp task_iteration depend(inout: A[i]) depend(in: A[i-4]) \ + if ((i % 4) == 0 || i == n-1) + A[i] += A[i] * A[i-1]; + } + + // T3 other task + #pragma omp task depend(in: A[n-1]) + printf("A[n-1] = %f\n", A[n-1]); +} diff --git a/tasking/sources/taskloop_dep.1.f90 b/tasking/sources/taskloop_dep.1.f90 new file mode 100644 index 0000000..d707008 --- /dev/null +++ b/tasking/sources/taskloop_dep.1.f90 @@ -0,0 +1,36 @@ +! @@name: taskloop_dep.1 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_6.0 +subroutine process_work_a(n, A) + implicit none + integer :: n + real :: A(*) + integer :: i + + ! Dependences for taskloop iterations and across taskloops + + ! TL1 taskloop + ! nogroup removes the implicit taskgroup + !$omp taskloop nogroup + do i = 2, n + !$omp task_iteration depend(inout: A(i)) depend(in: A(i-1)) + A(i) = A(i) + A(i) * A(i-1) + end do + !$omp end taskloop + + ! 
TL2 taskloop + grainsize + !$omp taskloop grainsize(strict: 4) nogroup + do i = 2, n + !$omp task_iteration depend(inout: A(i)) depend(in: A(i-4)) & + !$omp& if (mod(i, 4) == 1 .or. i == n) + A(i) = A(i) + A(i) * A(i-1) + end do + !$omp end taskloop + + ! T3 other task + !$omp task depend(in: A(n)) + print *, "A(n) =", A(n) + !$omp end task +end subroutine diff --git a/tasking/sources/taskloop_dep.2.c b/tasking/sources/taskloop_dep.2.c new file mode 100644 index 0000000..b10e988 --- /dev/null +++ b/tasking/sources/taskloop_dep.2.c @@ -0,0 +1,29 @@ +/* +* @@name: taskloop_dep.2 +* @@type: C +* @@operation: compile +* @@expect: success +* @@version: omp_6.0 +*/ +#include + +void process_work_b(int n, float *B[n]) +{ + // Dependences for taskloop iterations in multi-dimensional loop nest + + // TL4 taskloop + collapse + #pragma omp taskloop collapse(2) nogroup + for (int i = 1; i < n; i++) + { + for (int j = 1; j < n; j++) + { + #pragma omp task_iteration depend(inout: B[i][j]) \ + depend(in: B[i-1][j], B[i][j-1]) + B[i][j] += B[i][j] * B[i-1][j] * B[i][j-1]; + } + } + + // T5 other task + #pragma omp task depend(in: B[n-1][n-1]) + printf("B[n-1][n-1] = %f\n", B[n-1][n-1]); +} diff --git a/tasking/sources/taskloop_dep.2.f90 b/tasking/sources/taskloop_dep.2.f90 new file mode 100644 index 0000000..2a787da --- /dev/null +++ b/tasking/sources/taskloop_dep.2.f90 @@ -0,0 +1,29 @@ +! @@name: taskloop_dep.2 +! @@type: F-free +! @@operation: compile +! @@expect: success +! @@version: omp_6.0 +subroutine process_work_b(n, B) + implicit none + integer :: n + real :: B(n,*) + integer :: i, j + + ! Dependences for taskloop iterations in multi-dimensional loop nest + + ! TL4 taskloop + collapse + !$omp taskloop collapse(2) nogroup + do j = 2, n + do i = 2, n + !$omp task_iteration depend(inout: B(i,j)) & + !$omp& depend(in: B(i-1,j), B(i,j-1)) + B(i,j) = B(i,j) + B(i,j) * B(i-1,j) * B(i,j-1) + end do + end do + !$omp end taskloop + + ! 
T5 other task + !$omp task depend(in: B(n,n)) + print *, "B(n,n) =", B(n,n) + !$omp end task +end subroutine diff --git a/tasking/task_dep.tex b/tasking/task_dep.tex index 70549ac..6b95089 100644 --- a/tasking/task_dep.tex +++ b/tasking/task_dep.tex @@ -292,3 +292,35 @@ scheduled to execute at any time, with no ordering. \cexample[5.1]{task_dep}{13} \ffreeexample[5.1]{task_dep}{13} + +\subsection{Transparent Task Dependences} +\label{subsec:depend_trans_task} +\index{task dependences!transparent tasks} + + +In the following example, each iteration of the \ucode{h}-loop updates all +elements of array \ucode{M} and task dependences are used to synchronize +updates across different iterations of the loop. The code uses two levels of +dependent tasks and assumes that +\ucode{N_ROWS} is evenly divisible by \ucode{ROWS_PER_TASK}. +The \ucode{h}-loop generates the first level of tasks, with +the \kcode{depend} clause serializing their execution and each task calling +\ucode{my_func}. A second level of tasks is generated by the \ucode{i}-loop in +\ucode{my_func}. + +However, the dependences for this second level of tasks are between tasks from +different calls to \ucode{my_func}. In order to enforce these dependences, the +first-level tasks are specified as transparent tasks with the +\kcode{transparent(omp_impex)} clause. The \kcode{omp_impex} argument (which +is the default if not explicitly specified) indicates that the task is both an +exporting and importing task. For the purposes of dependence matching, an +exporting task is one that makes its child tasks visible to its successors and +an importing task is one that makes its preceding tasks (such as earlier +sibling tasks) visible to its child tasks. 
As a result of the exposed +dependences, the task generated in the $i^{th}$ iteration of the +\ucode{h}=$h_0$ instance of \ucode{my_func} is guaranteed to be ordered before +the task generated in the $i^{th}$ iteration of the \ucode{h}=$h_1$ instance of +\ucode{my_func}, where $h_0 < h_1$. + +\cexample[6.0]{task_dep}{14} +\ffreeexample[6.0]{task_dep}{14} diff --git a/tasking/taskloop_dep.tex b/tasking/taskloop_dep.tex new file mode 100644 index 0000000..749fc80 --- /dev/null +++ b/tasking/taskloop_dep.tex @@ -0,0 +1,51 @@ +\section{Task Dependences for \kcode{taskloop} Construct} +\label{sec:taskloop_depend} +\index{dependences!taskloop dependences} + +\index{task_iteration directive@\kcode{task_iteration} directive!depend clause@\kcode{depend} clause} +\index{task_iteration directive@\kcode{task_iteration} directive} +\index{directives!task_iteration@\kcode{task_iteration}} +\index{taskloop construct@\kcode{taskloop} construct} +\index{constructs!taskloop@\kcode{taskloop}} +\index{depend clause@\kcode{depend} clause} +\index{clauses!depend@\kcode{depend}} + +Dependences for tasks generated from a \kcode{taskloop} construct can +be specified using the \kcode{task_iteration} directive nested in +the beginning of the associated loop body. + +In the following example, taskloop TL1 contains +a \kcode{task_iteration} directive with the \kcode{depend} clauses +that specify task dependences across loop iterations on variable \ucode{A} +(\ucode{A[i] $\rightarrow$ A[i-1]}). +The \kcode{nogroup} clause for the \kcode{taskloop} construct removes +the implicit taskgroup for a taskloop so that dependences across taskloops and +with other tasks can be specified. +For taskloop TL2, the dependence (\ucode{A[i] $\rightarrow$ A[i-4]}) +is specified for every 4 loop iterations +as defined by the \kcode{if} clause that matches with +the chunk size 4 specified in the \kcode{grainsize} clause for taskloop tasks. 
+The dependences are generated only for those iterations where +the \kcode{if} condition evaluates to \plc{true}. +For instance, the first task generated from TL2 will update elements +\ucode{A[1:4]} with depend clauses \kcode{depend(inout: \ucode{A[4]})} +and \kcode{depend(in: \ucode{A[0]})}. This ensures element \ucode{A[4]} +(thus elements \ucode{A[1:3]}) will be available from TL1 before executing +the task. +The last task T3 will wait for the availability of \ucode{A[n-1]} +(or \ucode{A(n)} in Fortran) before printing the result. + +\cexample[6.0]{taskloop_dep}{1} + +\ffreeexample[6.0]{taskloop_dep}{1} + +The following example shows the use of the \kcode{task_iteration} +directive for specifying task dependences in a multi-dimensional loop nest +from multiple loop iterations in taskloop TL4. +Similar to the previous example, the \kcode{nogroup} clause removes +the implicit taskgroup for the \kcode{taskloop} construct so that +dependences with other tasks (T5 in this case) can be specified. + +\cexample[6.0]{taskloop_dep}{2} + +\ffreeexample[6.0]{taskloop_dep}{2} diff --git a/util/latexdiff/latexdiff b/util/latexdiff/latexdiff index 049e870..420567e 100755 --- a/util/latexdiff/latexdiff +++ b/util/latexdiff/latexdiff @@ -2791,8 +2791,8 @@ sub postprocess { # add basicstyle color{blue} to added lstinline commands # finally add the comment to the ones not having an optional argument before ###s/\\DIFaddlstinline(?!\[)/\\lstinline\n[basicstyle=\\color{blue}]$AUXCMD\n/g; - s/(%DIF < )([^\n]*?\n)/"${1}" . escape_command(${2}, "(?:plc|textnormal|textsubscript|textsl)", "@\\DIFdel{", "}@")/esg; - s/(%DIF > )([^\n]*?\n)/"${1}" . escape_command(${2}, "(?:plc|textnormal|textsubscript|textsl)", "@\\DIFadd{", "}@")/esg; + s/(%DIF < )([^\n]*?\n)/"${1}" . escape_command(${2}, "(?:plc|textnormal|textsubscript|textsl|ucode|kcode)", "@\\DIFdel{", "}@")/esg; + s/(%DIF > )([^\n]*?\n)/"${1}" . 
escape_command(${2}, "(?:plc|textnormal|textsubscript|textsl|ucode|kcode)", "@\\DIFadd{", "}@")/esg; s/}\\glossarydef(start|end)/}\n\\glossarydef\1\n/g; s/\\\\\s+\\DIFaddend\s+\\end\{supertabular\}/\\DIFaddend \\\\\n\\end{supertabular}/gm; diff --git a/util/latexdiff/latexdiff-fast b/util/latexdiff/latexdiff-fast index 4d9599f..e21e9f6 100755 --- a/util/latexdiff/latexdiff-fast +++ b/util/latexdiff/latexdiff-fast @@ -3352,8 +3352,8 @@ sub postprocess { # add basicstyle color{blue} to added lstinline commands # finally add the comment to the ones not having an optional argument before ###s/\\DIFaddlstinline(?!\[)/\\lstinline\n[basicstyle=\\color{blue}]$AUXCMD\n/g; - s/(%DIF < )([^\n]*?\n)/"${1}" . escape_command(${2}, "(?:plc|textnormal|textsubscript|textsl)", "@\\DIFdel{", "}@")/esg; - s/(%DIF > )([^\n]*?\n)/"${1}" . escape_command(${2}, "(?:plc|textnormal|textsubscript|textsl)", "@\\DIFadd{", "}@")/esg; + s/(%DIF < )([^\n]*?\n)/"${1}" . escape_command(${2}, "(?:plc|textnormal|textsubscript|textsl|ucode|kcode)", "@\\DIFdel{", "}@")/esg; + s/(%DIF > )([^\n]*?\n)/"${1}" . 
escape_command(${2}, "(?:plc|textnormal|textsubscript|textsl|ucode|kcode)", "@\\DIFadd{", "}@")/esg; s/}\\glossarydef(start|end)/}\n\\glossarydef\1\n/g; s/\\\\\s+\\DIFaddend\s+\\end\{supertabular\}/\\DIFaddend \\\\\n\\end{supertabular}/gm; diff --git a/util/latexdiff/latexdiff-vc b/util/latexdiff/latexdiff-vc index 099febc..817b4c6 100755 --- a/util/latexdiff/latexdiff-vc +++ b/util/latexdiff/latexdiff-vc @@ -514,7 +514,7 @@ foreach $diff ( @difffiles ) { # final compilation system("$CFG{LATEX} --interaction=batchmode \"$diff\";"); # needed if cross-refs - system("$CFG{LATEX} \"$diff\";"); # final, with possible error messages + system("$CFG{LATEX} --interaction=batchmode \"$diff\";"); if ( $rundvi2 ) { my $dvi="$diffbase.dvi"; diff --git a/versioninfo b/versioninfo index a70cecf..83f58a7 100644 --- a/versioninfo +++ b/versioninfo @@ -1,8 +1,8 @@ -# Examples Document Version -version=5.2.2 +# Examples Document Version (VER) +version=6.0 -# Supported Spec Version -version_spec=5.2 +# Supported Spec Version (SVER) +version_spec=6.0 -# Document Release Date -version_date=April 2024 +# Document Release Date (VERDATE) +version_date=November 2024