From eaec9ede6477a2c016e7c50150498a93f0f1811c Mon Sep 17 00:00:00 2001 From: Henry Jin Date: Fri, 8 Nov 2019 13:01:11 -0800 Subject: [PATCH] synced with v5.0.0 of the examples-internal repo --- Changes.log | 5 + Chap_SIMD.tex | 2 +- Chap_data_environment.tex | 4 +- Chap_memory_model.tex | 97 ++++++---- Chap_program_control.tex | 2 +- Chap_synchronization.tex | 38 ++-- Examples_SIMD.tex | 10 +- Examples_acquire_release.tex | 141 ++++++++++++++ Examples_affinity_display.tex | 104 +++++++++++ Examples_affinity_query.tex | 4 +- Examples_allocators.tex | 63 +++++++ Examples_array_sections.tex | 5 +- Examples_array_shaping.tex | 27 +++ Examples_async_target_depend.tex | 1 - Examples_async_target_with_tasks.tex | 1 + Examples_cancellation.tex | 2 + Examples_depobj.tex | 49 +++++ Examples_fort_sp_common.tex | 6 +- Examples_get_nthrs.tex | 1 + Examples_host_teams.tex | 28 +++ Examples_icv.tex | 1 + Examples_loop.tex | 13 ++ Examples_mem_model.tex | 3 + Examples_metadirective.tex | 88 +++++++++ Examples_nthrs_dynamic.tex | 1 + Examples_parallel_master_taskloop.tex | 33 ++++ Examples_reduction.tex | 176 +++++++++++++++++- Examples_requires.tex | 31 +++ Examples_standalone.tex | 2 + Examples_target.tex | 32 ++++ Examples_target_data.tex | 22 +-- Examples_target_mapper.tex | 86 +++++++++ Examples_target_offload.tex | 46 +++++ Examples_target_pointer_mapping.tex | 53 ++++++ Examples_target_structure_mapping.tex | 54 ++++++ Examples_target_unstructured_data.tex | 2 + Examples_task_affinity.tex | 32 ++++ Examples_task_dep.tex | 145 ++++++++++++++- Examples_taskgroup.tex | 2 +- Examples_tasking.tex | 2 +- Examples_taskloop.tex | 25 +++ Examples_udr.tex | 89 +++++++++ Examples_variant.tex | 77 ++++++++ Foreword_Chapt.tex | 23 +++ History.tex | 111 +++++++---- Introduction_Chapt.tex | 6 - Makefile | 3 +- README | 14 +- Title_Page.tex | 10 +- omp_copyright.txt | 2 +- openmp-example.tex | 83 +++++++++ openmp-examples.tex | 37 +++- sources/Example_acquire_release.1.c | 32 ++++ sources/Example_acquire_release.1.f90 | 29 +++ sources/Example_acquire_release.2.c | 32 ++++ sources/Example_acquire_release.2.f90 | 29 +++ sources/Example_acquire_release.3.c | 34 ++++ sources/Example_acquire_release.3.f90 | 31 +++ sources/Example_acquire_release_broke.4.c | 41 ++++ sources/Example_acquire_release_broke.4.f90 | 40 ++++ sources/Example_affinity.1.c | 7 +- sources/Example_affinity.6.c | 47 ++--- sources/Example_affinity.6.f90 | 48 ++--- sources/Example_affinity_display.1.c | 62 ++++++ sources/Example_affinity_display.1.f90 | 66 +++++++ sources/Example_affinity_display.2.c | 74 ++++++++ sources/Example_affinity_display.2.f90 | 76 ++++++++ sources/Example_affinity_display.3.c | 88 +++++++++ sources/Example_affinity_display.3.f90 | 77 ++++++++ sources/Example_affinity_query.1.c | 39 ++++ sources/Example_affinity_query.1.f90 | 33 ++++ sources/Example_allocators.1.c | 47 +++++ sources/Example_allocators.1.f90 | 51 +++++ sources/Example_array_shaping.1.c | 41 ++++ sources/Example_async_target.1.c | 9 +- sources/Example_async_target.2.c | 4 +- sources/Example_collapse.1.c | 4 + sources/Example_collapse.1.f | 4 + sources/Example_collapse.2.c | 1 + sources/Example_collapse.2.f | 1 + sources/Example_declare_target.6.c | 4 +- sources/Example_declare_variant.1.c | 57 ++++++ sources/Example_declare_variant.1.f90 | 69 +++++++ sources/Example_declare_variant.2.c | 50 +++++ sources/Example_declare_variant.2.f90 | 65 +++++++ sources/Example_depobj.1.c | 76 ++++++++ sources/Example_depobj.1.f90 | 83 +++++++++ sources/Example_device.4.c | 2 
+- sources/Example_get_nthrs.1.c | 3 +- sources/Example_host_teams.1.c | 61 ++++++ sources/Example_host_teams.1.f90 | 64 +++++++ sources/Example_icv.1.c | 14 +- sources/Example_init_lock.1.cpp | 8 +- sources/Example_init_lock.1.f | 1 - sources/Example_init_lock_with_hint.1.cpp | 3 +- sources/Example_loop.1.c | 22 +++ sources/Example_loop.1.f90 | 19 ++ sources/Example_mem_model.3.f | 8 +- sources/Example_metadirective.1.c | 26 +++ sources/Example_metadirective.1.f90 | 20 ++ sources/Example_metadirective.2.c | 33 ++++ sources/Example_metadirective.2.f90 | 32 ++++ sources/Example_metadirective.3.c | 36 ++++ sources/Example_metadirective.3.f90 | 48 +++++ sources/Example_nestable_lock.1.c | 19 +- sources/Example_nesting_restrict.1.c | 4 + sources/Example_nesting_restrict.1.f | 7 +- sources/Example_parallel_master_taskloop.1.c | 31 +++ .../Example_parallel_master_taskloop.1.f90 | 41 ++++ sources/Example_requires.1.cpp | 40 ++++ sources/Example_requires.1.f90 | 39 ++++ sources/Example_standalone.1.f90 | 5 + sources/Example_target_data.3.c | 3 + sources/Example_target_data.4.c | 13 ++ sources/Example_target_data.4.f90 | 8 + sources/Example_target_mapper.1.c | 36 ++++ sources/Example_target_mapper.1.f90 | 38 ++++ sources/Example_target_mapper.2.c | 54 ++++++ sources/Example_target_mapper.2.f90 | 48 +++++ sources/Example_target_mapper.3.c | 43 +++++ sources/Example_target_mapper.3.f90 | 38 ++++ sources/Example_target_offload_control.1.c | 78 ++++++++ sources/Example_target_offload_control.1.f90 | 81 ++++++++ sources/Example_target_ptr_map.1.c | 49 +++++ sources/Example_target_ptr_map.2.c | 52 ++++++ sources/Example_target_reverse_offload.7.c | 47 +++++ sources/Example_target_reverse_offload.7.f90 | 37 ++++ sources/Example_target_struct_map.1.c | 46 +++++ sources/Example_target_struct_map.2.c | 43 +++++ sources/Example_target_struct_map.2.cpp | 55 ++++++ sources/Example_task_dep.1.c | 3 +- sources/Example_task_dep.10.c | 24 +++ sources/Example_task_dep.10.f90 | 25 +++ sources/Example_task_dep.11.c | 39 ++++ sources/Example_task_dep.11.f90 | 43 +++++ sources/Example_task_dep.3.c | 3 +- sources/Example_task_dep.4.c | 3 +- sources/Example_task_dep.4.f90 | 7 + sources/Example_task_dep.6.c | 40 ++++ sources/Example_task_dep.6.f90 | 43 +++++ sources/Example_task_dep.7.c | 41 ++++ sources/Example_task_dep.7.f90 | 43 +++++ sources/Example_task_dep.8.c | 35 ++++ sources/Example_task_dep.8.f90 | 37 ++++ sources/Example_task_dep.9.c | 30 +++ sources/Example_task_dep.9.f90 | 31 +++ sources/Example_task_reduction.1.c | 65 +++++++ sources/Example_task_reduction.1.f90 | 72 +++++++ sources/Example_tasking.1.c | 6 +- sources/Example_tasking.1.f90 | 3 + sources/Example_tasking.3.c | 2 + sources/Example_tasking.3.f90 | 6 + sources/Example_taskloop.2.c | 33 ++++ sources/Example_taskloop.2.f90 | 36 ++++ sources/Example_taskloop_reduction.1.c | 32 ++++ sources/Example_taskloop_reduction.1.f90 | 38 ++++ sources/Example_taskloop_reduction.2.c | 40 ++++ sources/Example_taskloop_reduction.2.f90 | 46 +++++ sources/Example_taskloop_simd_reduction.1.c | 53 ++++++ sources/Example_taskloop_simd_reduction.1.f90 | 63 +++++++ sources/Example_threadprivate.4.cpp | 2 + sources/Example_udr.1.c | 48 +++++ sources/Example_udr.1.f90 | 57 ++++++ sources/Example_udr.2.c | 40 ++++ sources/Example_udr.2.f90 | 44 +++++ sources/Example_udr.3.c | 71 +++++++ sources/Example_udr.3.f90 | 64 +++++++ sources/Example_udr.4.f90 | 58 ++++++ sources/Example_udr.5.cpp | 21 +++ sources/Example_udr.6.cpp | 20 ++ 170 files changed, 5828 insertions(+), 239 
deletions(-) create mode 100644 Examples_acquire_release.tex create mode 100644 Examples_affinity_display.tex create mode 100644 Examples_allocators.tex create mode 100644 Examples_array_shaping.tex create mode 100644 Examples_depobj.tex create mode 100644 Examples_host_teams.tex create mode 100644 Examples_loop.tex create mode 100644 Examples_metadirective.tex create mode 100644 Examples_parallel_master_taskloop.tex create mode 100644 Examples_requires.tex create mode 100644 Examples_target_mapper.tex create mode 100644 Examples_target_offload.tex create mode 100644 Examples_target_pointer_mapping.tex create mode 100644 Examples_target_structure_mapping.tex create mode 100644 Examples_task_affinity.tex create mode 100644 Examples_udr.tex create mode 100644 Examples_variant.tex create mode 100644 Foreword_Chapt.tex create mode 100644 openmp-example.tex create mode 100644 sources/Example_acquire_release.1.c create mode 100644 sources/Example_acquire_release.1.f90 create mode 100644 sources/Example_acquire_release.2.c create mode 100644 sources/Example_acquire_release.2.f90 create mode 100644 sources/Example_acquire_release.3.c create mode 100644 sources/Example_acquire_release.3.f90 create mode 100644 sources/Example_acquire_release_broke.4.c create mode 100644 sources/Example_acquire_release_broke.4.f90 create mode 100644 sources/Example_affinity_display.1.c create mode 100644 sources/Example_affinity_display.1.f90 create mode 100644 sources/Example_affinity_display.2.c create mode 100644 sources/Example_affinity_display.2.f90 create mode 100644 sources/Example_affinity_display.3.c create mode 100644 sources/Example_affinity_display.3.f90 create mode 100644 sources/Example_affinity_query.1.c create mode 100644 sources/Example_affinity_query.1.f90 create mode 100644 sources/Example_allocators.1.c create mode 100644 sources/Example_allocators.1.f90 create mode 100644 sources/Example_array_shaping.1.c create mode 100644 sources/Example_declare_variant.1.c create mode 100644 sources/Example_declare_variant.1.f90 create mode 100644 sources/Example_declare_variant.2.c create mode 100644 sources/Example_declare_variant.2.f90 create mode 100644 sources/Example_depobj.1.c create mode 100644 sources/Example_depobj.1.f90 create mode 100644 sources/Example_host_teams.1.c create mode 100644 sources/Example_host_teams.1.f90 create mode 100644 sources/Example_loop.1.c create mode 100644 sources/Example_loop.1.f90 create mode 100644 sources/Example_metadirective.1.c create mode 100644 sources/Example_metadirective.1.f90 create mode 100644 sources/Example_metadirective.2.c create mode 100644 sources/Example_metadirective.2.f90 create mode 100644 sources/Example_metadirective.3.c create mode 100644 sources/Example_metadirective.3.f90 create mode 100644 sources/Example_parallel_master_taskloop.1.c create mode 100644 sources/Example_parallel_master_taskloop.1.f90 create mode 100644 sources/Example_requires.1.cpp create mode 100644 sources/Example_requires.1.f90 create mode 100644 sources/Example_target_mapper.1.c create mode 100644 sources/Example_target_mapper.1.f90 create mode 100644 sources/Example_target_mapper.2.c create mode 100644 sources/Example_target_mapper.2.f90 create mode 100644 sources/Example_target_mapper.3.c create mode 100644 sources/Example_target_mapper.3.f90 create mode 100644 sources/Example_target_offload_control.1.c create mode 100644 sources/Example_target_offload_control.1.f90 create mode 100644 sources/Example_target_ptr_map.1.c create mode 100644 sources/Example_target_ptr_map.2.c 
create mode 100644 sources/Example_target_reverse_offload.7.c create mode 100644 sources/Example_target_reverse_offload.7.f90 create mode 100644 sources/Example_target_struct_map.1.c create mode 100644 sources/Example_target_struct_map.2.c create mode 100644 sources/Example_target_struct_map.2.cpp create mode 100644 sources/Example_task_dep.10.c create mode 100644 sources/Example_task_dep.10.f90 create mode 100644 sources/Example_task_dep.11.c create mode 100644 sources/Example_task_dep.11.f90 create mode 100644 sources/Example_task_dep.6.c create mode 100644 sources/Example_task_dep.6.f90 create mode 100644 sources/Example_task_dep.7.c create mode 100644 sources/Example_task_dep.7.f90 create mode 100644 sources/Example_task_dep.8.c create mode 100644 sources/Example_task_dep.8.f90 create mode 100644 sources/Example_task_dep.9.c create mode 100644 sources/Example_task_dep.9.f90 create mode 100644 sources/Example_task_reduction.1.c create mode 100644 sources/Example_task_reduction.1.f90 create mode 100644 sources/Example_taskloop.2.c create mode 100644 sources/Example_taskloop.2.f90 create mode 100644 sources/Example_taskloop_reduction.1.c create mode 100644 sources/Example_taskloop_reduction.1.f90 create mode 100644 sources/Example_taskloop_reduction.2.c create mode 100644 sources/Example_taskloop_reduction.2.f90 create mode 100644 sources/Example_taskloop_simd_reduction.1.c create mode 100644 sources/Example_taskloop_simd_reduction.1.f90 create mode 100644 sources/Example_udr.1.c create mode 100644 sources/Example_udr.1.f90 create mode 100644 sources/Example_udr.2.c create mode 100644 sources/Example_udr.2.f90 create mode 100644 sources/Example_udr.3.c create mode 100644 sources/Example_udr.3.f90 create mode 100644 sources/Example_udr.4.f90 create mode 100644 sources/Example_udr.5.cpp create mode 100644 sources/Example_udr.6.cpp diff --git a/Changes.log b/Changes.log index 594cd19..70b2b73 100644 --- a/Changes.log +++ b/Changes.log @@ -1,3 +1,8 @@ +[02-Feb-2018] Note +This "Changes.log" is no longer updated. Please use History.tex and +the git log messages for changes. + + [20-May-2016] Version 4.5.0 Changes from 4.0.2ltx diff --git a/Chap_SIMD.tex b/Chap_SIMD.tex index efa6cbe..e7874c2 100644 --- a/Chap_SIMD.tex +++ b/Chap_SIMD.tex @@ -33,7 +33,7 @@ directive. Clauses provide argument specifications (\code{linear}, \code{uniform}, and \code{aligned}), a requested vector length (\code{simdlen}), and designate whether the function is always/never called conditionally in a loop (\code{branch}/\code{inbranch}). -The latter is for optimizing peformance. +The latter is for optimizing performance. Also, the \code{simd} construct has been combined with the worksharing loop constructs (\code{for simd} and \code{do simd}) to enable simultaneous thread diff --git a/Chap_data_environment.tex b/Chap_data_environment.tex index fdc20e4..e38d8aa 100644 --- a/Chap_data_environment.tex +++ b/Chap_data_environment.tex @@ -44,7 +44,7 @@ subsection of the OpenMP Specifications document. \bigskip DATA-MAPPING ATTRIBUTES -The \code{map} clause on a device construct explictly specifies how the list items in +The \code{map} clause on a device construct explicitly specifies how the list items in the clause are mapped from the encountering task's data environment (on the host) to the corresponding item in the device data environment (on the device). The common \plc{list items} are arrays, array sections, scalars, pointers, and @@ -55,7 +55,7 @@ within the list or block of a \code{declare target} directive. 
Also, a C/C++ poi is mapped as a zero-length array section, as is a C++ variable that is a reference to a pointer. % Waiting for response from Eric on this. -Without explict mapping, non-scalar and non-pointer variables within the scope of the \code{target} +Without explicit mapping, non-scalar and non-pointer variables within the scope of the \code{target} construct are implicitly mapped with a \plc{map-type} of \code{tofrom}. Without explicit mapping, scalar variables within the scope of the \code{target} construct are not mapped, but have an implicit firstprivate data-sharing diff --git a/Chap_memory_model.tex b/Chap_memory_model.tex index c44d53d..6447dcd 100644 --- a/Chap_memory_model.tex +++ b/Chap_memory_model.tex @@ -2,44 +2,71 @@ \chapter{Memory Model} \label{chap:memory_model} -In this chapter, examples illustrate race conditions on access to variables with -shared data-sharing attributes. A race condition can exist when two -or more threads are involved in accessing a variable in which not all -of the accesses are reads; that is, a WaR, RaW or WaW condition -exists (R=read, a=after, W=write). A RaR does not produce a race condition. - Ensuring thread execution order at -the processor level is not enough to avoid race conditions, because the -local storage at the processor level (registers, caches, etc.) -must be synchronized so that a consistent view of the variable in the -memory hierarchy can be seen by the threads accessing the variable. +OpenMP provides a shared-memory model that allows all threads on a given +device shared access to \emph{memory}. For a given OpenMP region that may be +executed by more than one thread or SIMD lane, variables in memory may be +\emph{shared} or \emph{private} with respect to those threads or SIMD lanes. A +variable's data-sharing attribute indicates whether it is shared (the +\emph{shared} attribute) or private (the \emph{private}, \emph{firstprivate}, +\emph{lastprivate}, \emph{linear}, and \emph{reduction} attributes) in the data +environment of an OpenMP region. While private variables in an OpenMP region +are new copies of the original variable (with same name) that may then be +concurrently accessed or modified by their respective threads or SIMD lanes, a +shared variable in an OpenMP region is the same as the variable of the same +name in the enclosing region. Concurrent accesses or modifications to a +shared variable may therefore require synchronization to avoid data races. -OpenMP provides a shared-memory model which allows all threads access -to \plc{memory} (shared data). Each thread also has exclusive -access to \plc{threadprivate memory} (private data). A private -variable referenced in an OpenMP directive's structured block is a -new version of the original variable (with the same name) for each -task (or SIMD lane) within the code block. A private variable is -initially undefined (except for variables in \code{firstprivate} -and \code{linear} clauses), and the original variable value is -unaltered by assignments to the private variable, (except for -\code{reduction}, \code{lastprivate} and \code{linear} clauses). +OpenMP's memory model also includes a \emph{temporary view} of memory that is +associated with each thread. Two different threads may see different values for +a given variable in their respective temporary views. Threads may employ flush +operations for the purposes of making their temporary view of a variable +consistent with the value of the variable in memory. 
The effect of a given +flush operation is characterized by its flush properties -- some combination of +\emph{strong}, \emph{release}, and \emph{acquire} -- and, for \emph{strong} +flushes, a \emph{flush-set}. -Private variables in an outer \code{parallel} region can be -shared by implicit tasks of an inner \code{parallel} region -(with a \code{share} clause on the inner \code{parallel} directive). -Likewise, a private variable may be shared in the region of an -explicit \code{task} (through a \code{shared} clause). +A \emph{strong} flush will force consistency between the temporary view and the +memory for all variables in its \emph{flush-set}. Furthermore, all strong flushes in a +program that have intersecting flush-sets will execute in some total order, and +within a thread strong flushes may not be reordered with respect to other +memory operations on variables in their flush-sets. \emph{Release} and +\emph{acquire} flushes operate in pairs. A release flush may ``synchronize'' +with an acquire flush, and when it does so the local memory operations that +precede the release flush will appear to have been completed before the local +memory operations on the same variables that follow the acquire flush. + +Flush operations arise from explicit \code{flush} directives, implicit +\code{flush} directives, and also from the execution of \code{atomic} +constructs. The \code{flush} directive forces a consistent view of local +variables of the thread executing the \code{flush}. When a list is supplied on +the directive, only the items (variables) in the list are guaranteed to be +flushed. Implied flushes exist at prescribed locations of certain constructs. +For the complete list of these locations and associated constructs, please +refer to the \plc{flush Construct} section of the OpenMP Specifications +document. + +In this chapter, examples illustrate how race conditions may arise for accesses +to variables with a \plc{shared} data-sharing attribute when flush operations +are not properly employed. A race condition can exist when two or more threads +are involved in accessing a variable in which not all of the accesses are +reads; that is, a WaR, RaW or WaW condition exists (R=read, a=after, W=write). +A RaR does not produce a race condition. In particular, a data race will arise +when conflicting accesses do not have a well-defined \emph{completion order}. +The existence of data races in OpenMP programs results in undefined behavior, +and so they should generally be avoided for programs to be correct. The +completion order of accesses to a shared variable is guaranteed in OpenMP +through a set of memory consistency rules that are described in the \plc{OpenMP +Memory Consistency} section of the OpenMP Specifications document. + +%This chapter also includes examples that exhibit non-sequentially consistent +%(\emph{non-SC}) behavior. Sequential consistency (\emph{SC}) is the desirable +%property that the results of a multi-threaded program are as if all operations +%are performed in some total order, consistent with the program order of +%operations performed by each thread. OpenMP guarantees that a correct program +%(i.e. a program that does not have a data race) will exhibit SC behavior +%so long as the only \code{atomic} constructs it uses are SC atomic directives. -The \code{flush} directive forces a consistent view of local variables -of the thread executing the \code{flush}. -When a list is supplied on the directive, only the items (variables) -in the list are guaranteed to be flushed.
- -Implied flushes exist at prescribed locations of certain constructs. -For the complete list of these locations and associated constructs, -please refer to the \plc{flush Construct} section of the OpenMP -Specifications document. % The following table lists construct in which implied flushes exist, and the % location of their execution. @@ -102,4 +129,4 @@ Specifications document. % specific storage location accessed atomically (specified as the \plc{x} variable % in \plc{atomic Construct} subsection of the OpenMP Specifications document). -Examples 1-3 show the difficulty of synchronizing threads through \code{flush} and \code{atomic} directives. +% Examples 1-3 show the difficulty of synchronizing threads through \code{flush} and \code{atomic} directives. diff --git a/Chap_program_control.tex b/Chap_program_control.tex index d45259c..0b53b97 100644 --- a/Chap_program_control.tex +++ b/Chap_program_control.tex @@ -24,7 +24,7 @@ That is, inclusion of one of the \plc{construct-type-clause} names \code{parall activates the corresponding region. The \code{cancel} construct is activated by the first encountering thread, and it continues execution at the end of the named region. -The \code{cancel} construct is also a concellation point for any other thread of the team +The \code{cancel} construct is also a cancellation point for any other thread of the team to also continue execution at the end of the named region. Also, once the specified region has been activated for cancellation any thread that encounnters diff --git a/Chap_synchronization.tex b/Chap_synchronization.tex index 3b96062..ec75388 100644 --- a/Chap_synchronization.tex +++ b/Chap_synchronization.tex @@ -19,10 +19,15 @@ mechanism. On a finer scale the \code{atomic} construct allows only a single thread at a time to have atomic access to a storage location involving a single read, write, update or capture statement, and a limited number of combinations -when specifying the \code{capture} \plc{atomic-clause} clause. The \plc{atomic-clause} clause -is required for some expression statements, but are not required for -\code{update} statements. Please see the details in the \plc{atomic Construct} -subsection of the \plc{Directives} chapter in the OpenMP Specifications document. +when specifying the \code{capture} \plc{atomic-clause} clause. The +\plc{atomic-clause} clause is required for some expression statements, but is +not required for \code{update} statements. The \plc{memory-order} clause can be +used to specify the degree of memory ordering enforced by an \code{atomic} +construct. From weakest to strongest, they are \code{relaxed} (the default), +acquire and/or release clauses (specified with \code{acquire}, \code{release}, +or \code{acq\_rel}), and \code{seq\_cst}. Please see the details in the +\plc{atomic Construct} subsection of the \plc{Directives} chapter in the OpenMP +Specifications document. % The following three sentences were stolen from the spec. The \code{ordered} construct either specifies a structured block in a loop, @@ -37,15 +42,22 @@ iteration vector argument (vec) to indicate the iteration that satisfies the dependence. The \code{depend} clause with a \code{source} \plc{dependence-type} specifies dependence satisfaction. -The \code{flush} directive is a stand-alone construct that forces a thread's -temporal local storage (view) of a variable to memory where a consistent view -of the variable storage can be accesses. 
When the construct is used without -a variable list, all the locally thread-visible data as defined by the -base language are flushed. A construct with a list applies the flush -operation only to the items in the list. The \code{flush} construct also -effectively insures that no memory (load or store) operation for -the variable set (list items, or default set) may be reordered across -the \code{flush} directive. +The \code{flush} directive is a stand-alone construct for enforcing consistency +between a thread's view of memory and the view of memory for other threads (see +the Memory Model chapter of this document for more details). When the construct +is used with an explicit variable list, a \plc{strong flush} that forces a +thread's temporary view of memory to be consistent with the actual memory is +applied to all listed variables. When the construct is used without an explicit +variable list and without a \plc{memory-order} clause, a strong flush is +applied to all locally thread-visible data as defined by the base language, and +additionally the construct provides both acquire and release memory ordering +semantics. When an explicit variable list is not present and a +\plc{memory-order} clause is present, the construct provides acquire and/or +release memory ordering semantics according to the \plc{memory-order} clause, +but no strong flush is performed. A resulting strong flush that applies to a +set of variables effectively ensures that no memory (load or store) +operation for the affected variables may be reordered across the \code{flush} +directive. General-purpose routines provide mutual exclusion semantics through locks, represented by lock variables. diff --git a/Examples_SIMD.tex b/Examples_SIMD.tex index a6842ba..6008fd8 100644 --- a/Examples_SIMD.tex +++ b/Examples_SIMD.tex @@ -8,6 +8,8 @@ to assure the compiler that the loop can be vectorized. \cexample{SIMD}{1} \ffreeexample{SIMD}{1} + +\clearpage When a function can be inlined within a loop the compiler has an opportunity to @@ -24,7 +26,7 @@ respectively. The \plc{add3} C function uses dereferencing. The \code{declare} \code{simd} constructs also illustrate the use of \code{uniform} and \code{linear} clauses. The \code{uniform(fact)} clause indicates that the variable \plc{fact} is invariant across the SIMD lanes. In -the \plc{add2} function \plc{a} and \plc{b} are included in the \code{unform} +the \plc{add2} function \plc{a} and \plc{b} are included in the \code{uniform} list because the C pointer and the Fortran array references are constant. The \plc{i} index used in the \plc{add2} function is included in a \code{linear} clause with a constant-linear-step of 1, to guarantee a unity increment of the @@ -42,7 +44,7 @@ variable. \ffreeexample{SIMD}{2} - +\pagebreak A thread that encounters a SIMD construct executes a vectorized code of the iterations. Similar to the concerns of a worksharing loop a loop vectorized with a SIMD construct must assure that temporary and reduction variables are @@ -55,6 +57,7 @@ construct. \ffreeexample{SIMD}{3} +\pagebreak A \code{safelen(N)} clause in a \code{simd} construct assures the compiler that there are no loop-carried dependencies for vectors of size \plc{N} or below. If the \code{safelen} clause is not specified, then the default safelen value is @@ -69,7 +72,7 @@ than 16, the behavior is undefined. 
\ffreeexample{SIMD}{4} - +\pagebreak The following SIMD construct instructs the compiler to collapse the \plc{i} and \plc{j} loops into a single SIMD loop in which SIMD chunks are executed by threads of the team. Within the workshared loop chunks of a thread, the SIMD @@ -110,6 +113,7 @@ version of the \plc{fib()} function. %%% section +\pagebreak \section{Loop-Carried Lexical Forward Dependence} \label{sec:SIMD_forward_dep} diff --git a/Examples_acquire_release.tex b/Examples_acquire_release.tex new file mode 100644 index 0000000..5205277 --- /dev/null +++ b/Examples_acquire_release.tex @@ -0,0 +1,141 @@ +\pagebreak +\section{Synchronization Based on Acquire/Release Semantics} +\label{sec:acquire_and_release_semantics} + +%OpenMP 5.0 introduced ``release/acquire'' memory ordering semantics to the +%specification. The memory ordering behavior of OpenMP constructs and routines +%that permit two threads to synchronize with each other are defined in terms of +%\textit{release flushes} and \textit{acquire flushes}, where a release flush +%must occur at the source of the synchronization and an acquire flush must occur +%at the sink of the synchronization. Flushes resulting from a \code{flush} +%directive without a list may function as a release flush, an acquire flush, or +%both a release and acquire flush. Flushes implied on entry to or exit from an +%atomic operation (specified by an \code{atomic} construct) may also function as +%a release flush or an acquire flush, depending on if a memory ordering clause +%appears on a construct. Flushes implied by other OpenMP constructs or routines +%also function as either a release flush or an acquire flush, according to the +%synchronization semantics of the construct. + +%%%%%%%%%%%%%%%%%% + +As explained in the Memory Model chapter of this document, a flush operation +may be an \emph{acquire flush} and/or a \emph{release flush}, and OpenMP 5.0 +defines acquire/release semantics in terms of these fundamental flush +operations. For any synchronization between two threads that is specified by +OpenMP, a release flush logically occurs at the source of the synchronization +and an acquire flush logically occurs at the sink of the synchronization. +OpenMP 5.0 added memory ordering clauses -- \code{acquire}, \code{release}, and +\code{acq\_rel} -- to the \code{flush} and \code{atomic} constructs for +explicitly requesting acquire/release semantics. Furthermore, implicit flushes +for all OpenMP constructs and runtime routines that synchronize OpenMP threads +in some manner were redefined in terms of synchronizing release and acquire +flushes to avoid the requirement of strong memory fences (see the \plc{Flush +Synchronization and Happens Before} and \plc{Implicit Flushes} sections of the +OpenMP Specifications document). + +The examples that follow in this section illustrate how acquire and release +flushes may be employed, implicitly or explicitly, for synchronizing threads. A +\code{flush} directive without a list and without any memory ordering clause +can also function as both an acquire and release flush for facilitating thread +synchronization. Flushes implied on entry to, or exit from, an atomic +operation (specified by an \code{atomic} construct) may function as an acquire +flush or a release flush if a memory ordering clause appears on the construct. +On entry to and exit from a \code{critical} construct there is now an implicit +acquire flush and release flush, respectively. 
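As a minimal illustration of these acquire/release building blocks (a sketch only, not one of the numbered examples that follow; it mirrors the atomic-based approach of the second example below), one thread publishes a value through a release atomic write to a flag, while the other thread spins on an acquire atomic read of that flag before reading the published value:

#include <stdio.h>
#include <omp.h>

int main(void)
{
   int x = 0, y = 0;

   #pragma omp parallel num_threads(2)
   {
      int tid = omp_get_thread_num();
      if (tid == 0) {
         x = 10;                              /* payload write               */
         #pragma omp atomic write release     /* release flush on the write  */
         y = 1;                               /* publish the flag            */
      } else {
         int tmp = 0;
         while (tmp == 0) {                   /* spin until the flag is seen */
            #pragma omp atomic read acquire   /* acquire flush on the read   */
            tmp = y;
         }
         printf("x = %d\n", x);               /* guaranteed to print x = 10  */
      }
   }
   return 0;
}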
+ +%%%%%%%%%%%%%%%%%% + +The first example illustrates how the release and acquire flushes implied by a +\code{critical} region guarantee a value written by the first thread is visible +to a read of the value on the second thread. Thread 0 writes to \plc{x} and +then executes a \code{critical} region in which it writes to \plc{y}; the write +to \plc{x} happens before the execution of the \code{critical} region, +consistent with the program order of the thread. Meanwhile, thread 1 executes a +\code{critical} region in a loop until it reads a non-zero value from +\plc{y} in the \code{critical} region, after which it prints the value of +\plc{x}; again, the execution of the \code{critical} regions happen before the +read from \plc{x} based on the program order of the thread. The \code{critical} +regions executed by the two threads execute in a serial manner, with a +pair-wise synchronization from the exit of one \code{critical} region to the +entry to the next \code{critical} region. These pair-wise synchronizations +result from the implicit release flushes that occur on exit from +\code{critical} regions and the implicit acquire flushes that occur on entry to +\code{critical} regions; hence, the execution of each \code{critical} region in +the sequence happens before the execution of the next \code{critical} region. +A ``happens before'' order is therefore established between the assignment to \plc{x} +by thread 0 and the read from \plc{x} by thread 1, and so thread 1 must see that +\plc{x} equals 10. + +\pagebreak +\cexample{acquire_release}{1} +\ffreeexample{acquire_release}{1} + +In the second example, the \code{critical} constructs are exchanged with +\code{atomic} constructs that have \textit{explicit} memory ordering specified. When the +atomic read operation on thread 1 reads a non-zero value from \plc{y}, this +results in a release/acquire synchronization that in turn implies that the +assignment to \plc{x} on thread 0 happens before the read of \plc{x} on thread +1. Therefore, thread 1 will print ``x = 10''. + +\cexample{acquire_release}{2} +\ffreeexample{acquire_release}{2} + +\pagebreak +In the third example, \code{atomic} constructs that specify relaxed atomic +operations are used with explicit \code{flush} directives to enforce memory +ordering between the two threads. The explicit \code{flush} directive on thread +0 must specify a release flush and the explicit \code{flush} directive on +thread 1 must specify an acquire flush to establish a release/acquire +synchronization between the two threads. The \code{flush} and \code{atomic} +constructs encountered by thread 0 can be replaced by the \code{atomic} construct used in +Example 2 for thread 0, and similarly the \code{flush} and \code{atomic} +constructs encountered by thread 1 can be replaced by the \code{atomic} +construct used in Example 2 for thread 1. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%3 +%{\color{violet} +%For this example, the implicit release flush of the \code{flush} directive for thread 0 creates +%a source synchronization with release memory ordering, while the implicit release flush of the +%\code{flush} directive for thread 1 creates a sink synchronization with acquire memory ordering. +%The code performs the same thread synchronization of the previous example, with only a slight +%coding change. +%The explicit \code{release} and \code{acquire} clauses of the atomic construct has been +%replaced with implicit release and aquire flushes of explicit \code{flush} constructs. 
+%(Here, the \code{atomic} constructs have \plc{relaxed} operations.) +%} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%3 + +\cexample{acquire_release}{3} +\ffreeexample{acquire_release}{3} + +Example 4 will fail to order the write to \plc{x} on thread 0 before the read +from \plc{x} on thread 1. Importantly, the implicit release flush on exit from +the \code{critical} region will not synchronize with the acquire flush that +occurs on the atomic read operation performed by thread 1. This is because +implicit release flushes that occur on a given construct may only synchronize +with implicit acquire flushes on a compatible construct (and vice-versa) that +internally makes use of the same synchronization variable. For a +\code{critical} construct, this might correspond to a \plc{lock} object that +is used by a given implementation (for the synchronization semantics of other +constructs due to implicit release and acquire flushes, refer to the \plc{Implicit +Flushes} section of the OpenMP Specifications document). Either an explicit \code{flush} +directive that provides a release flush (i.e., a flush without a list that does +not have the \code{acquire} clause) must be specified between the +\code{critical} construct and the atomic write, or an atomic operation that +modifies \plc{y} and provides release semantics must be specified. + +%{\color{violet} +%In the following example synchronization between the acquire flush of the atomic read +%of \plc{y} by thread 1 is not synchronized with the relaxed atomic construct that +%assigns a value to \plc{y} by thread 0. +%While there is a \code{critical} construct and implicit release flush +%for the \plc{x} assignment of thread 0, +%a release flush association with the \plc{y} assignment of +%thread 0 is not formed. A \code{release} or \code{acq-rel} clause on the +%\code{atomic write} construct or a \code{flush} directive after the assignment to \plc{y} +%will form a synchronization and will guarantee memory ordering of the x and y assignments +%by thread 0. +%} + +\cexample{acquire_release_broke}{4} +\ffreeexample{acquire_release_broke}{4} diff --git a/Examples_affinity_display.tex b/Examples_affinity_display.tex new file mode 100644 index 0000000..03f2f86 --- /dev/null +++ b/Examples_affinity_display.tex @@ -0,0 +1,104 @@ +\section{Affinity Display} +\label{sec:affinity_display} + +The following examples illustrate ways to display thread affinity. +Automatic display of affinity can be invoked by setting +the \code{OMP\_DISPLAY\_AFFINITY} environment variable to \code{TRUE}. +The format of the output can be customized by setting the +\code{OMP\_AFFINITY\_FORMAT} environment variable to an appropriate string. +Also, there are API calls for the user to display thread affinity +at selected locations within code. + +For the first example the environment variable \code{OMP\_DISPLAY\_AFFINITY} has been +set to \code{TRUE}, and execution occurs on an 8-core system with \code{OMP\_NUM\_THREADS} set to 8. + +The affinity for the master thread is reported through a call to the API +\code{omp\_display\_affinity()} routine. For default affinity settings +the report shows that the master thread can execute on any of the cores. +In the following parallel region the affinity for each of the team threads is reported +automatically since the \code{OMP\_DISPLAY\_AFFINITY} environment variable has been set +to \code{TRUE}. 
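A minimal sketch of this reporting pattern (not the numbered example itself) combines an explicit call to the \code{omp\_display\_affinity()} routine with the automatic display triggered by setting \code{OMP\_DISPLAY\_AFFINITY} to \code{TRUE}:

#include <omp.h>

int main(void)
{
   /* Explicit report for the initial (master) thread; a NULL format
      argument selects the current affinity format. */
   omp_display_affinity(NULL);

   /* With OMP_DISPLAY_AFFINITY=TRUE the runtime automatically reports
      the affinity of each team thread on entry to this parallel region. */
   #pragma omp parallel
   {
      /* ... work ... */
   }
   return 0;
}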
+ +These two reports are often useful (as in hybrid codes using both MPI and OpenMP) +to observe the affinity (for an MPI task) before the parallel region, +and during an OpenMP parallel region. Note: the next parallel region uses the +same number of threads as in the previous parallel region and affinities are +not changed, so affinity is NOT reported. + +In the last parallel region, the thread affinities are reported +because the thread affinity has changed. + +\cexample{affinity_display}{1} + +\ffreeexample{affinity_display}{1} + + +In the following example 2 threads are forked, and each executes on a socket. Next, +a nested parallel region runs half of the available threads on each socket. + +These OpenMP environment variables have been set: + +\begin{compactitem} +\item \code{OMP\_PROC\_BIND}="TRUE" +\item \code{OMP\_NUM\_THREADS}="2,4" +\item \code{OMP\_PLACES}="\{0,2,4,6\},\{1,3,5,7\}" +\item \code{OMP\_AFFINITY\_FORMAT}="nest\_level= \%L, parent\_thrd\_num= \%a, thrd\_num= \%n, thrd\_affinity= \%A" +\end{compactitem} + +where the numbers correspond to core ids for the system. Note: \code{OMP\_DISPLAY\_AFFINITY} is not +set and is \code{FALSE} by default. This example shows how to use API routines to +perform affinity display operations. + +For each of the two first-level threads the \code{OMP\_PLACES} variable specifies +a place with all the core-ids of the socket (\{0,2,4,6\} for one thread and \{1,3,5,7\} for the other). +(As is sometimes the case in 2-socket systems, one socket may consist +of the even id numbers, while the other may have the odd id numbers.) The affinities +are printed according to the \code{OMP\_AFFINITY\_FORMAT} format: providing +the parallel nesting level (\%L), the ancestor thread number (\%a), the thread number (\%n) +and the thread affinity (\%A). In the nested parallel region within the \plc{socket\_work} routine +the affinities for the threads on each socket are printed according to this format. + +\cexample{affinity_display}{2} + +\ffreeexample{affinity_display}{2} + +The next example illustrates more details about affinity formatting. +First, the \code{omp\_get\_affinity\_format()} API routine is used to +obtain the default format. The code checks to make sure the storage +provides enough space to hold the format. +Next, the \code{omp\_set\_affinity\_format()} API routine sets a user-defined +format: \plc{host=\%20H thrd\_num=\%0.4n binds\_to=\%A}. + +The host, thread number and affinity fields are specified by \plc{\%20H}, +\plc{\%0.4n} and \plc{\%A}: \plc{H}, \plc{n} and \plc{A} are single character "short names" +for the host, thread\_num and thread\_affinity data to be printed, +with format sizes of \plc{20}, \plc{4}, and "size as needed". +The period (.) indicates that the field is displayed right-justified (default is left-justified) +and the "0" indicates that any unused space is to be prefixed with zeros +(e.g. instead of "1", "0001" is displayed for the field size of 4). + +%The period (.) indicates that the field is displayed left-justified and the "0" indicates +%that leading zeros are to be added so that the total length for the display of this "n" (thread_num) field is 4. + +%The period (\plc{.}) indicates right justified and \plc{0} leading zeros. +%All other text in the format is just user narrative. + +Within the parallel region the affinity for each thread is captured by +\code{omp\_capture\_affinity()} into a buffer array with elements indexed +by the thread number (\plc{thrd\_num}).
+After the parallel region, the thread affinities are printed in thread-number order. + +If the storage area in buffer is inadequate for holding the affinity +data, the stored affinity data is truncated. +%The \plc{max} reduction on the required storage, returned by +%\code{omp\_capture\_affinity} in \plc{nchars}, is used to report +%possible truncation (if \plc{max\_req\_store} > \plc{buffer\_store}). +The maximum value for the number of characters (\plc{nchars}) returned by +\code{omp\_capture\_affinity} is captured by the \code{reduction(max:max\_req\_store)} +clause and the \plc{if(nchars >= max\_req\_store) max\_req\_store=nchars} statement. +It is used to report possible truncation (if \plc{max\_req\_store} > \plc{buffer\_store}). + +\cexample{affinity_display}{3} + +\ffreeexample{affinity_display}{3} + diff --git a/Examples_affinity_query.tex b/Examples_affinity_query.tex index 06f56b6..1630aaf 100644 --- a/Examples_affinity_query.tex +++ b/Examples_affinity_query.tex @@ -37,7 +37,7 @@ On some systems there are utilities, files or user guides that provide configura information. For instance, the socket number and proc\_id's for a socket can be found in the /proc/cpuinfo text file on Linux systems. -\cexample{affinity}{6} +\cexample{affinity_query}{1} -\ffreeexample{affinity}{6} +\ffreeexample{affinity_query}{1} diff --git a/Examples_allocators.tex b/Examples_allocators.tex new file mode 100644 index 0000000..a51f439 --- /dev/null +++ b/Examples_allocators.tex @@ -0,0 +1,63 @@ +\pagebreak +\section{ Memory Allocators} +\label{sec:allocators} + +OpenMP memory allocators can be used to allocate memory with +specific allocator traits. In the following example an OpenMP allocator is used to +specify an alignment for arrays \plc{x} and \plc{y}. The +general approach for attributing traits to variables allocated by +OpenMP is to create or specify a pre-defined \plc{memory space}, create an +array of \plc{traits}, and then form an \plc{allocator} from the +memory space and trait. +The allocator is then specified +in an OpenMP allocation (using an API \plc{omp\_alloc()} function +for C/C++ code and an \code{allocate} directive for Fortran code +in the allocators.1 example). + +In the example below the \plc{xy\_memspace} variable is declared +and assigned the default memory space (\plc{omp\_default\_mem\_space}). +Next, an array for \plc{traits} is created. Since only one +trait will be used, the array size is \plc{1}. +A trait is a structure in C/C++ and a derived type in Fortran, +containing 2 components: a key and a corresponding value (key-value pair). +The trait key used here is \plc{omp\_atk\_alignment} (an enum for C/C++ +and a parameter for Fortran) +and the trait value of 64 is specified in the \plc{xy\_traits} declaration. +These declarations are followed by a call to the +\plc{omp\_init\_allocator()} function to combine the memory +space (\plc{xy\_memspace}) and the traits (\plc{xy\_traits}) +to form an allocator (\plc{xy\_alloc}). + +%In the C/C++ code the API \plc{omp\_allocate()} function is used +%to allocate space, similar to \plc{malloc}, except that the allocator +%is specified as the second argument. +%In Fortran an API allocation function is not available. +%An \code{allocate} construct is used (with \plc{x} and \plc{y} +%listed as the variables to be allocated), along +%with an \code{allocator} clause (specifying the \plc{xy\_alloc} as the allocator) +%for the following Fortran \plc{allocate} statement. 
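The setup described above can be sketched in C as follows (a simplified sketch with an arbitrary array length \plc{N} chosen for illustration; the allocation calls themselves are discussed in the next paragraph):

#include <omp.h>
#define N 1000

int main(void)
{
   /* One trait: request 64-byte alignment. */
   omp_alloctrait_t xy_traits[1] = { {omp_atk_alignment, 64} };

   /* Combine the default memory space with the trait to form an allocator. */
   omp_allocator_handle_t xy_alloc =
      omp_init_allocator(omp_default_mem_space, 1, xy_traits);

   /* Allocate the aligned arrays, use them, then release all resources. */
   double *x = (double *) omp_alloc(N * sizeof(double), xy_alloc);
   double *y = (double *) omp_alloc(N * sizeof(double), xy_alloc);

   /* ... initialize and use x and y ... */

   omp_free(x, xy_alloc);
   omp_free(y, xy_alloc);
   omp_destroy_allocator(xy_alloc);
   return 0;
}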
+ +In the C/C++ code the API \plc{omp\_alloc()} function is used +to allocate space, similar to \plc{malloc}, except that the allocator +is specified as the second argument. +In Fortran an \code{allocate} directive is used to specify an allocator +for a following Fortran \plc{allocate} statement. +A variable list may be supplied if the allocator +is to be applied to a subset of variables in the Fortran allocate +statement. Specifying the complete list is optional. +Here, the \plc{xy\_alloc} allocator is specified +in the \code{allocator} clause, +and the set of all variables used in the allocate statement is specified in the list. + +%"for a following Fortran allocation statement" (no using "immediately" here) +% it looks like if you have a list, the allocation statement does not need +% to follow immediately.(?) +% spec5.0 157:19-20 The allocate directive must appear in the same scope as +% the declarations of each of its list items and must follow all such declarations. + +%\pagebreak + + \cexample{allocators}{1} +\ffreeexample{allocators}{1} + + diff --git a/Examples_array_sections.tex b/Examples_array_sections.tex index 41e4980..c970780 100644 --- a/Examples_array_sections.tex +++ b/Examples_array_sections.tex @@ -5,13 +5,14 @@ The following examples show the usage of array sections in \code{map} clauses on \code{target} and \code{target} \code{data} constructs. -This example shows the invalid usage of two seperate sections of the same array +This example shows the invalid usage of two separate sections of the same array inside of a \code{target} construct. \cexample{array_sections}{1} \ffreeexample{array_sections}{1} +\pagebreak This example shows the invalid usage of two separate sections of the same array inside of a \code{target} construct. @@ -19,6 +20,7 @@ inside of a \code{target} construct. \ffreeexample{array_sections}{2} +\pagebreak This example shows the valid usage of two separate sections of the same array inside of a \code{target} construct. @@ -26,6 +28,7 @@ of a \code{target} construct. \ffreeexample{array_sections}{3} +\pagebreak This example shows the valid usage of a wholly contained array section of an already mapped array section inside of a \code{target} construct. diff --git a/Examples_array_shaping.tex b/Examples_array_shaping.tex new file mode 100644 index 0000000..b2697b9 --- /dev/null +++ b/Examples_array_shaping.tex @@ -0,0 +1,27 @@ +\section{Array Shaping} +\label{sec:array-shaping} + +\ccppspecificstart +A pointer variable can be shaped to a multi-dimensional array to facilitate +data access. This is achieved by a \plc{shape-operator} cast in front of +a pointer (lvalue expression): +\begin{description} +\item[]\hspace*{5mm}\code{([$s_1$][$s_2$]...[$s_n$])}\plc{pointer} +\end{description} +where each $s_i$ is an integral-type expression of positive value. +The shape-operator can appear in either the \plc{motion-clause} +of the \code{target}~\code{update} directive or the \code{depend} clause. + +The following example shows the use of the shape-operator in the +\code{target}~\code{update} directive. The shape-operator \code{([nx][ny+2])} +casts pointer variable $a$ to a 2-dimensional array of size +\plc{nx}$\times$\plc{(ny+2)}. The resulting array is then accessed as +array sections (such as \code{[0:nx][1]} and \code{[0:nx][ny]}) +in the \code{from} or \code{to} clause for transferring two columns of +noncontiguous boundary data from or to the device.
+Note the use of additional parentheses +around the shape-operator and $a$ to ensure the correct precedence +over array-section operations. + +\cnexample{array_shaping}{1} +\ccppspecificend diff --git a/Examples_async_target_depend.tex b/Examples_async_target_depend.tex index 14f09b0..5410d8a 100644 --- a/Examples_async_target_depend.tex +++ b/Examples_async_target_depend.tex @@ -12,4 +12,3 @@ As of OpenMP 4.5 and beyond the \code{nowait} clause can be used on the This section also shows the use of \code{depend} clauses to order executions through dependences. - diff --git a/Examples_async_target_with_tasks.tex b/Examples_async_target_with_tasks.tex index 5a3126d..09bf78c 100644 --- a/Examples_async_target_with_tasks.tex +++ b/Examples_async_target_with_tasks.tex @@ -11,6 +11,7 @@ task or one of the previously generated explicit tasks. \cexample{async_target}{1} +\pagebreak The Fortran version has an interface block that contains the \code{declare} \code{target}. An identical statement exists in the function declaration (not shown here). diff --git a/Examples_cancellation.tex b/Examples_cancellation.tex index 88125be..05d41b6 100644 --- a/Examples_cancellation.tex +++ b/Examples_cancellation.tex @@ -22,6 +22,8 @@ the worksharing construct after the cancellation has been activated. \ffreeexample{cancellation}{1} +\clearpage + The following example shows how to cancel a parallel search on a binary tree as soon as the search value has been detected. The code creates a task to descend into the child nodes of the current tree node. If the search value has been found, diff --git a/Examples_depobj.tex b/Examples_depobj.tex new file mode 100644 index 0000000..2355138 --- /dev/null +++ b/Examples_depobj.tex @@ -0,0 +1,49 @@ +\pagebreak +\section{The \code{depobj} Construct} +\label{sec:depobj} + +The stand-alone \code{depobj} construct provides a mechanism +to create a \plc{depend object} that expresses a dependence to be +used subsequently in the \code{depend} clause of another construct. +The dependence is created from a dependence type and a storage location, +within a \code{depend} clause of an \code{depobj} construct; +%just as one would find directly on a \code{task} construct. +and it is stored in the depend object. +The depend object is represented by a variable of type \code{omp\_depend\_t} +in C/C++ (by a scalar variable of integer kind \code{omp\_depend\_kind} in Fortran). + +In the example below the stand-alone \code{depobj} construct uses the +\code{depend}, \code{update} and \code{destroy} clauses to +\plc{initialize}, \plc{update} and \plc{uninitialize} +a depend object (\code{obj}). + +The first \code{depobj} construct initializes the \code{obj} +depend object with +an \code{inout} dependence type with a storage +location defined by variable \code{a}. +This dependence is passed into the \plc{driver} +routine via the \code{obj} depend object. + +In the first \plc{driver} routine call, \emph{Task 1} uses +the dependence of the object (\code{inout}), +while \emph{Task 2} uses an \code{in} dependence specified +directly in a \code{depend} clause. +For these task dependences \emph{Task 1} must execute and +complete before \emph{Task 2} begins. + +Before the second call to \plc{driver}, \code{obj} is updated +using the \code{depobj} construct to represent an \code{in} dependence. +Hence, in the second call to \plc{driver}, \emph{Task 1} +will have an \code{in} dependence; and \emph{Task 1} and +\emph{Task 2} can execute simultaneously. 
Note: in an \code{update} +clause, only the dependence type can be (is) updated. + +The third \code{depobj} construct uses the \code{destroy} clause. +It frees resources as it puts the depend object in an uninitialized state-- +effectively destroying the depend object. +After an object has been uninitialized it can be initialized again +with a new dependence type \emph{and} a new variable. + +\cexample{depobj}{1} + +\ffreeexample{depobj}{1} diff --git a/Examples_fort_sp_common.tex b/Examples_fort_sp_common.tex index b404a8c..0d56381 100644 --- a/Examples_fort_sp_common.tex +++ b/Examples_fort_sp_common.tex @@ -16,9 +16,9 @@ The following example is also conforming: \fnexample{fort_sp_common}{2} % blue line floater at top of this page for "Fortran, cont." -\begin{figure}[t!] -\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} -\end{figure} +%\begin{figure}[t!] +%\linewitharrows{-1}{dashed}{Fortran (cont.)}{8em} +%\end{figure} The following example is conforming: diff --git a/Examples_get_nthrs.tex b/Examples_get_nthrs.tex index 971d857..c0eb26b 100644 --- a/Examples_get_nthrs.tex +++ b/Examples_get_nthrs.tex @@ -11,6 +11,7 @@ call should be inside the \code{parallel} region. \fexample{get_nthrs}{1} +\pagebreak The following example shows how to rewrite this program without including a query for the number of threads: diff --git a/Examples_host_teams.tex b/Examples_host_teams.tex new file mode 100644 index 0000000..a1c436d --- /dev/null +++ b/Examples_host_teams.tex @@ -0,0 +1,28 @@ +\pagebreak +\section{\code{teams} Construct on Host} +\label{sec:host_teams} + +%{\color{blue} ... } {\color{violet} ... } +Originally the \code{teams} construct was created for devices (such as GPUs) +for independent executions of a structured block by teams within a league (on SMs). +It was only available through offloading with the \code{target} construct, +and the execution of a \code{teams} region could only be directed to host +execution by various means such as \code{if} and \code{device} clauses, +and the \code{OMP\_TARGET\_OFFLOAD} environment variable. + +In OpenMP 5.0 the \code{teams} construct was extended to enable the host +to execute a \code{teams} region (without an associated \code{target} construct), +with anticipation of further affinity and threading controls in future OpenMP releases. +%With additional affinity controls, a team could be +%assigned to execute on a socket or use only a specified number of threads. + +In the example below the \code{teams} construct is used to create two +teams, one to execute single precision code, and the other +to execute double precision code. Two teams are required, and +the thread limit for each team is set to 1/2 of the number of +available processors. + +\cexample{host_teams}{1} + +\ffreeexample{host_teams}{1} + diff --git a/Examples_icv.tex b/Examples_icv.tex index a8f1bdb..d3c004a 100644 --- a/Examples_icv.tex +++ b/Examples_icv.tex @@ -50,6 +50,7 @@ one of the threads in the team. Since we have a total of two inner \code{paralle regions, the print statement will be executed twice -- once per inner \code{parallel} region. +\pagebreak \cexample{icv}{1} \fexample{icv}{1} diff --git a/Examples_loop.tex b/Examples_loop.tex new file mode 100644 index 0000000..ff94c7e --- /dev/null +++ b/Examples_loop.tex @@ -0,0 +1,13 @@ +\pagebreak +\section{The \code{loop} Construct} +\label{sec:loop} + +The following example illustrates the use of the OpenMP 5.0 \code{loop} +construct for the execution of a loop. 
+The \code{loop} construct asserts to the compiler that the iterations +of the loop are free of data dependencies and may be executed concurrently. +It allows the compiler to use heuristics to select the parallelization scheme +and compiler-level optimizations for the concurrency. + + \cexample{loop}{1} +\ffreeexample{loop}{1} diff --git a/Examples_mem_model.tex b/Examples_mem_model.tex index 51ab56f..7089570 100644 --- a/Examples_mem_model.tex +++ b/Examples_mem_model.tex @@ -1,3 +1,4 @@ + \pagebreak \section{The OpenMP Memory Model} \label{sec:mem_model} @@ -18,6 +19,7 @@ be printed by both Print 2 and Print 3. \ffreeexample{mem_model}{1} +\pagebreak The following example demonstrates why synchronization is difficult to perform correctly through variables. The value of flag is undefined in both prints on thread 1 and the value of data is only well-defined in the second print. @@ -26,6 +28,7 @@ correctly through variables. The value of flag is undefined in both prints on th \fexample{mem_model}{2} +\pagebreak The next example demonstrates why synchronization is difficult to perform correctly through variables. Because the \plc{write}(1)-\plc{flush}(1)-\plc{flush}(2)-\plc{read}(2) sequence cannot be guaranteed in the example, the statements on thread 0 and thread diff --git a/Examples_metadirective.tex b/Examples_metadirective.tex new file mode 100644 index 0000000..ef2fa88 --- /dev/null +++ b/Examples_metadirective.tex @@ -0,0 +1,88 @@ +\pagebreak +\section{Metadirective Directive} +\label{sec:metadirective} + +A \code{metadirective} directive provides a mechanism to select a directive in +a \code{when} clause to be used, depending upon one or more contexts: +implementation, available devices and the present enclosing construct. +The directive in a \code{default} clause is used when a directive of the +\code{when} clause is not selected. + +In the \code{when} clause the \plc{context selector} (or just \plc{selector}) defines traits that are +evaluated for selection of the directive that follows the selector. +This "selectable" directive is called a \plc{directive variant}. +Traits are grouped by \plc{construct}, \plc{implementation} and +\plc{device} \plc{sets} to be used by a selector of the same name. + +In the first example the architecture trait \plc{arch} of the +\plc{device} selector set specifies that if an \plc{nvptx} (NVIDIA) architecture is +active in the OpenMP context, then the \code{teams}~\code{loop} +\plc{directive variant} is selected as the directive; otherwise, the \code{parallel}~\code{loop} +\plc{directive variant} of the \code{default} clause is selected as the directive. +That is, if a \plc{device} of \plc{nvptx} architecture is supported by the implementation within +the enclosing \code{target} construct, its \plc{directive variant} is selected. +The architecture names, such as \plc{nvptx}, are implementation defined. +Also, note that \plc{device} as used in a \code{target} construct specifies +a device number, while \plc{device}, as used in the \code{metadirective} +directive as selector set, has traits of \plc{kind}, \plc{isa} and \plc{arch}. + + + +\cexample{metadirective}{1} + +\ffreeexample{metadirective}{1} + +%\pagebreak +In the second example, the \plc{implementation} selector set is specified +in the \code{when} clause to distinguish between AMD and NVIDIA platforms. +Additionally, specific architectures are specified with the \plc{device} +selector set. 
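A condensed C sketch of this kind of vendor- and architecture-based selection follows; the \plc{arch} strings ("nvptx", "amdgcn") and the clause values are placeholders chosen for illustration and are not necessarily those used in the actual example sources:

void do_work(void)
{
   #pragma omp target
   /* Select a teams variant from the implementation (vendor) and
      device (arch) traits of the enclosing context. */
   #pragma omp metadirective \
       when( implementation={vendor(nvidia)}, device={arch("nvptx")}:  \
             teams num_teams(512) thread_limit(32) )                   \
       when( implementation={vendor(amd)}, device={arch("amdgcn")}:    \
             teams num_teams(512) thread_limit(64) )                   \
       default( teams )
   {
      /* ... work distributed across the league of teams ... */
   }
}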
+ +In the code, different \code{teams} constructs are employed as determined +by the \code{metadirective} directive. +The number of teams is restricted by a \code{num\_teams} clause +and a thread limit is also set by a \code{thread\_limit} clause for +\plc{vendor} AMD and NVIDIA platforms and specific architecture +traits. Otherwise, just the \code{teams} construct is used without +any clauses, as prescribed by the \code{default} clause. + + +\cexample{metadirective}{2} + +\ffreeexample{metadirective}{2} + +\clearpage + +%\pagebreak +In the third example, a \plc{construct} selector set is specified in the \code{when} clause. +Here, a \code{metadirective} directive is used within a function that is also +compiled as a function for a target device as directed by the \code{declare}~\code{target} directive. +The \plc{target} directive name of the \code{construct} selector ensures that the +\code{distribute}~\code{parallel}~\code{for/do} construct is employed for the target compilation. +Otherwise, for the host-compiled version the \code{parallel}~\code{for/do}~\code{simd} construct is used. + +In the first call to the \plc{exp\_pi\_diff()} routine the context is a +\code{target}~\code{teams} construct and the \code{distribute}~\code{parallel}~\code{for/do} +construct version of the function is invoked, +while in the second call the \code{parallel}~\code{for/do}~\code{simd} construct version is used. + +%%%%%%%% +This case illustrates an important point for users that may want to hoist the +\code{target} directive out of a function that contains the usual +\code{target}~\code{teams}~\code{distribute}~\code{parallel}~\code{for/do} construct +(for providing alternate constructs through the \code{metadirective} directive as here). +While this combined construct can be decomposed into a \code{target} and +\code{teams distribute parallel for/do} constructs, the OpenMP 5.0 specification has the restriction: +``If a \code{teams} construct is nested within a \code{target} construct, that \code{target} construct must +contain no statements, declarations or directives outside of the \code{teams} construct''. +So, the \code{teams} construct must immediately follow the \code{target} construct without any intervening +code statements (which includes function calls). +Since the \code{target} construct alone cannot be hoisted out of a function, +the \code{target}~\code{teams} construct has been hoisted out of the function, and the +\code{distribute}~\code{parallel}~\code{for/do} construct is used +as the \plc{variant} directive of the \code{metadirective} directive within the function. +%%%%%%%% + +\cexample{metadirective}{3} + +\ffreeexample{metadirective}{3} diff --git a/Examples_nthrs_dynamic.tex b/Examples_nthrs_dynamic.tex index 3ca3ba4..802674a 100644 --- a/Examples_nthrs_dynamic.tex +++ b/Examples_nthrs_dynamic.tex @@ -16,6 +16,7 @@ abort the program or to supply any number of threads available. \fexample{nthrs_dynamic}{1} +\pagebreak The call to the \code{omp\_set\_dynamic} routine with a non-zero argument in C/C++, or \code{.TRUE.} in Fortran, allows the OpenMP implementation to choose any number of threads between 1 and 10. 
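+The following minimal sketch (not the \plc{nthrs\_dynamic} example source)
+shows this call sequence; with dynamic adjustment enabled the implementation
+may supply anywhere from 1 to 10 threads for the \code{parallel} region:
+
+\begin{verbatim}
+#include <stdio.h>
+#include <omp.h>
+
+int main(void)
+{
+   omp_set_dynamic(1);                /* permit dynamic adjustment       */
+   #pragma omp parallel num_threads(10)
+   {
+      #pragma omp single
+      printf("%d threads supplied\n", omp_get_num_threads());  /* 1..10 */
+   }
+   return 0;
+}
+\end{verbatim}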
diff --git a/Examples_parallel_master_taskloop.tex b/Examples_parallel_master_taskloop.tex new file mode 100644 index 0000000..c2474cb --- /dev/null +++ b/Examples_parallel_master_taskloop.tex @@ -0,0 +1,33 @@ +\pagebreak +\section{The \code{parallel master taskloop} Construct} +\label{sec:parallel_master_taskloop} + +In the OpenMP 5.0 Specification several combined constructs containing +the \code{taskloop} construct were added. + +Just as the \code{for} and \code{do} constructs have been combined +with the \code{parallel} construct for convenience, so too, the combined +\code{parallel}~\code{master}~\code{taskloop} and +\code{parallel}~\code{master}~\code{taskloop}~\code{simd} +constructs have been created for convenience. + +In the following example the first \code{taskloop} construct is enclosed +by the usual \code{parallel} and \code{master} constructs to form +a team of threads, and a single task generator (master thread) for +the \code{taskloop} construct. + +The same OpenMP operations for the first taskloop are accomplished by the second +taskloop with the \code{parallel}~\code{master}~\code{taskloop} +combined construct. +The third taskloop uses the combined \code{parallel}~\code{master}~\code{taskloop}~\code{simd} +construct to accomplish the same behavior as closely nested \code{parallel master}, +and \code{taskloop simd} constructs. + +As with any combined construct the clauses of the components may be used +with appropriate restrictions. The combination of the \code{parallel}~\code{master} construct +with the \code{taskloop} or \code{taskloop}~\code{simd} construct produces no additional +restrictions. + +\cexample{parallel_master_taskloop}{1} + +\ffreeexample{parallel_master_taskloop}{1} diff --git a/Examples_reduction.tex b/Examples_reduction.tex index 80898ee..4e326eb 100644 --- a/Examples_reduction.tex +++ b/Examples_reduction.tex @@ -1,13 +1,21 @@ \pagebreak -\section{The \code{reduction} Clause} + +\section{Reduction} \label{sec:reduction} -The following example demonstrates the \code{reduction} clause ; note that some +This section covers ways to perform reductions in parallel, task, taskloop, and SIMD regions. + +\subsection{The \code{reduction} Clause} +\label{subsec:reduction} + +The following example demonstrates the \code{reduction} clause; note that some reductions can be expressed in the loop in several ways, as shown for the \code{max} and \code{min} reductions below: \cexample{reduction}{1} +\pagebreak + \ffreeexample{reduction}{1} A common implementation of the preceding example is to treat it as if it had been @@ -41,6 +49,7 @@ to \code{MIN}. \ffreenexample{reduction}{5} \fortranspecificend +\pagebreak The following example is non-conforming because the initialization (\code{a = 0}) of the original list item \code{a} is not synchronized with the update of \code{a} as a result of the reduction computation in the \code{for} loop. Therefore, @@ -63,3 +72,166 @@ The following example demonstrates the reduction of array \plc{a}. In C/C++ thi \cexample{reduction}{7} \ffreeexample{reduction}{7} + + +\subsection{Task Reduction} +\label{subsec:task_reduction} + +The following C/C++ and Fortran examples show how to implement +a task reduction over a linked list. + +Task reductions are supported by the \code{task\_reduction} clause which can only be +applied to the \code{taskgroup} directive, and a \code{in\_reduction} clause +which can be applied to the \code{task} construct among others. 
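+As a minimal sketch of this pairing (a simple integer sum with assumed
+names is used here; the referenced example below reduces over a linked
+list), the two clauses appear together as follows, and the paragraphs
+below describe their roles in more detail:
+
+\begin{verbatim}
+int sum_1_to_100(void)
+{
+   int res = 0;
+   #pragma omp parallel
+   #pragma omp single
+   {
+      #pragma omp taskgroup task_reduction(+: res)
+      for (int i = 1; i <= 100; i++) {
+         #pragma omp task in_reduction(+: res) firstprivate(i)
+         res += i;
+      }
+      /* res holds the final reduced value (5050) after the taskgroup */
+   }
+   return res;
+}
+\end{verbatim}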
+ +The \code{task\_reduction} clause on the \code{taskgroup} construct is used to +define the scope of a new reduction, and after the \code{taskgroup} +region the original variable will contain the final value of the reduction. +In the task-generating while loop the \code{in\_reduction} clause of the \code{task} +construct is used to specify that the task participates "in" the reduction. + +Note: The \plc{res} variable is private in the \plc{linked\_list\_sum} routine +and is not required to be shared (as in the case of a \code{parallel} construct +reduction). + + +\cexample{task_reduction}{1} + +\ffreeexample{task_reduction}{1} + + +\subsection{Taskloop Reduction} +\label{subsec:taskloop_reduction} + +In the OpenMP 5.0 Specification the \code{taskloop} construct +was extended to include reductions. + +The following two examples show how to implement a reduction over an array +using taskloop reduction in two different ways. +In the first +example the \code{reduction} clause is applied to the \code{taskloop} construct. As +explained above in the task reduction examples, a reduction over tasks is +divided into two components: the scope of the reduction, which is defined by a +\code{taskgroup} region, and the tasks that participate in the reduction. In this +example, the \code{reduction} clause defines both semantics. First, it specifies that +the implicit \code{taskgroup} region associated with the \code{taskloop} construct is the scope of the +reduction, and second, it defines all tasks created by the \code{taskloop} construct as +participants of the reduction. Regarding the first property, it is important to note +that adding the \code{nogroup} clause to the \code{taskloop} construct would make the code +nonconforming, because it would create a set of tasks that participate in a +reduction whose scope has not been defined. + +\cexample{taskloop_reduction}{1} +\ffreeexample{taskloop_reduction}{1} + +%In the second example, we are computing exactly the same +%value but we do it in a very different way. The first thing that we do in the +%\plc{array\_sum} function is to create a \code{taskgroup} region that defines the scope of a +%new reduction using the \code{task\_reduction} clause. +%After that, we specify that a task and also the tasks generated +%by a taskloop will participate in that reduction using the \code{in\_reduction} clause +%on the \code{task} and \code{taskloop} constructs, respectively. Note that +%we also added the \code{nogroup} clause to the \code{taskloop} construct. This is allowed +%because what we are expressing with the \code{in\_reduction} clause is different +%from what we were expressing with the \code{reduction} clause. In one case we specify +%that the generated tasks will participate in a previously declared reduction +%(\code{in\_reduction} clause) whereas in the other case we specify that we want to +%create a new reduction and also that all tasks generated by the taskloop will +%participate on it. + +The second example computes exactly the same value as in the preceding \plc{taskloop\_reduction.1} code section, +but in a very different way. +First, in the \plc{array\_sum} function a \code{taskgroup} region is created +that defines the scope of a new reduction using the \code{task\_reduction} clause. +After that, a task and also the tasks generated by a taskloop participate in +that reduction by using the \code{in\_reduction} clause on the \code{task} +and \code{taskloop} constructs, respectively.
+Note that the \code{nogroup} clause was added to the \code{taskloop} construct. +This is allowed because what is expressed with the \code{in\_reduction} clause +is different from what is expressed with the \code{reduction} clause. +In one case the generated tasks are specified to participate in a previously +declared reduction (\code{in\_reduction} clause) whereas in the other case +creation of a new reduction is specified and also that all tasks generated +by the taskloop will participate on it. + +\cexample{taskloop_reduction}{2} +\ffreeexample{taskloop_reduction}{2} + +In the OpenMP 5.0 Specification, \code{reduction} clauses for the +\code{taskloop}~\code{ simd} construct were also added. + +The examples below compare reductions for the \code{taskloop} and the \code{taskloop}~\code{simd} constructs. +These examples illustrate the use of \code{reduction} clauses within +"stand-alone" \code{taskloop} constructs, and the use of \code{in\_reduction} clauses for tasks of taskloops to participate +with other reductions within the scope of a parallel region. + +\textbf{taskloop reductions:} + +In the \plc{taskloop reductions} section of the example below, +\plc{taskloop 1} uses the \code{reduction} clause +in a \code{taskloop} construct for a sum reduction, accumulated in \plc{asum}. +The behavior is as though a \code{taskgroup} construct encloses the +taskloop region with a \code{task\_reduction} clause, and each taskloop +task has an \code{in\_reduction} clause with the specifications +of the \code{reduction} clause. +At the end of the taskloop region \plc{asum} contains the result of the reduction. + +The next taskloop, \plc{taskloop 2}, illustrates the use of the +\code{in\_reduction} clause to participate in a previously defined +reduction scope of a \code{parallel} construct. + +The task reductions of \plc{task 2} and \plc{taskloop 2} are combined +across the \code{taskloop} construct and the single \code{task} construct, as specified +in the \code{reduction(task,}~\code{+:asum)} clause of the \code{parallel} construct. +At the end of the parallel region \plc{asum} contains the combined result of all reductions. + +\textbf{taskloop simd reductions:} + +Reductions for the \code{taskloop}~\code{simd} construct are shown in the second half of the code. +Since each component construct, \code{taskloop} and \code{simd}, +can accept a reduction-type clause, the \code{taskloop}~\code{simd} construct +is a composite construct, and the specific application of the reduction clause is defined +within the \code{taskloop}~\code{simd} construct section of the OpenMP 5.0 Specification. +The code below illustrates use cases for these reductions. + +In the \plc{taskloop simd reduction} section of the example below, +\plc{taskloop simd 3} uses the \code{reduction} clause +in a \code{taskloop}~\code{simd} construct for a sum reduction within a loop. +For this case a \code{reduction} clause is used, as one would use +for a \code{simd} construct. +The SIMD reductions of each task are combined, and the results of these tasks are further +combined just as in the \code{taskloop} construct with the \code{reduction} clause for \plc{taskloop 1}. +At the end of the taskloop region \plc{asum} contains the combined result of all reductions. 
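+A condensed sketch of the \plc{taskloop simd 3} pattern just described
+(the array, its size, and the \code{grainsize} value are assumptions, not
+items from the example source):
+
+\begin{verbatim}
+double asum_of(int n, double *a)
+{
+   double asum = 0.0;
+   #pragma omp parallel master
+   #pragma omp taskloop simd reduction(+: asum) grainsize(128)
+   for (int i = 0; i < n; i++)
+      asum += a[i];
+   return asum;              /* combined result of all task reductions */
+}
+\end{verbatim}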
+ +If a \code{taskloop}~\code{simd} construct is to participate in a previously defined +reduction scope, the reduction participation should be specified with +an \code{in\_reduction} clause, as shown in the \code{parallel} region enclosing +\plc{task 4} and \plc{taskloop simd 4} code sections. + +Here the \code{taskloop}~\code{simd} construct's +\code{in\_reduction} clause specifies participation of the construct's tasks as +a task reduction within the scope of the parallel region. +That is, the results of each task of the \code{taskloop} construct component +contribute to the reduction at a broader level, just as in the \plc{parallel reduction a} code section above. +Also, each \code{simd}-component construct +occurs as if it has a \code{reduction} clause, and the +SIMD results of each task are combined as though to form a single result for +each task (that participates in the \code{in\_reduction} clause). +At the end of the parallel region \plc{asum} contains the combined result of all reductions. + +%Just as in \plc{parallel reduction a} the +%\code{taskloop simd} construct reduction results are combined +%with the \code{task} construct reduction results +%as specified by the \code{in\_reduction} clause of the \code{task} construct +%and the \plc{task} reduction-modifier of the \code{reduction} clause of +%the \code{parallel} construct. +%At the end of the parallel region \plc{asum} contains the combined result of all reductions. + + +\cexample{taskloop_simd_reduction}{1} + +\ffreeexample{taskloop_simd_reduction}{1} + + + +% All other reductions diff --git a/Examples_requires.tex b/Examples_requires.tex new file mode 100644 index 0000000..334e481 --- /dev/null +++ b/Examples_requires.tex @@ -0,0 +1,31 @@ +\pagebreak +\section{The \code{requires} Directive} +\label{sec:requires} + +The declarative \code{requires} directive can be used to +specify features that an implementation must provide to compile and +execute correctly. + +In the following example the \code{unified\_shared\_memory} clause +of the \code{requires} directive ensures that the host and all +devices accessible through OpenMP provide a \plc{unified address} space +for memory that is shared by all devices. + +The example illustrates the use of the \code{requires} directive specifying +\plc{unified shared memory} at file scope, before any device +directives or device routines. No \code{map} clause is needed for +the \plc{p} structure on the device (and its address \plc{\&p}, for the C++ code, +is the same address on the host and device). +However, scalar variables referenced within the \code{target} +construct still have a default data-sharing attribute of firstprivate. +The \plc{q} scalar is incremented on the device, and its change is +not updated on the host. +% will defaultmap(toform:scalar) make q use shared address space? +%Or will it be ignored at this point. +% Does before device routines also mean before prototype? + +%\pagebreak + +\cppexample{requires}{1} + +\ffreeexample{requires}{1} diff --git a/Examples_standalone.tex b/Examples_standalone.tex index 5e3c54e..01013ff 100644 --- a/Examples_standalone.tex +++ b/Examples_standalone.tex @@ -9,6 +9,7 @@ and cannot be the immediate substatement of an \code{if} statement.
\cexample{standalone}{1} +\pagebreak The following example is non-conforming, because the \code{flush}, \code{barrier}, \code{taskwait}, and \code{taskyield} directives are stand-alone directives and cannot be the action statement of an \code{if} statement or a labeled branch @@ -22,6 +23,7 @@ in a compound statement. \cexample{standalone}{2} +\pagebreak The following example is conforming because the \code{flush}, \code{barrier}, \code{taskwait}, and \code{taskyield} directives are enclosed in an \code{if} construct or follow the labeled branch target. diff --git a/Examples_target.tex b/Examples_target.tex index 50ec31e..d04fbeb 100644 --- a/Examples_target.tex +++ b/Examples_target.tex @@ -110,3 +110,35 @@ to the \code{parallel} component of the combined directive. \cexample{target}{6} \ffreeexample{target}{6} + +\subsection{target Reverse Offload} +\label{subsec:target_reverse_offload} + +Beginning with OpenMP 5.0, implementations are allowed to +offload back to the host (reverse offload). + +In the example below the \plc{error\_handler} function +is executed back on the host, if an erroneous value is +detected in the \plc{A} array on the device. + +This is accomplished by specifying the \plc{device-modifier} +\code{ancestor} modifier, along with a device number of \code{1}, +to indicate that the execution is to be performed on the +immediate parent (\plc{1st ancestor})-- the host. + +The \code{requires} directive (another 5.0 feature) +uses the \code{reverse\_offload} clause to guarantee +that the reverse offload is implemented. + +Note that the \code{declare target} directive uses the +\code{device\_type} clause (another 5.0 feature) to specify that +the \plc{error\_handler} function is compiled to +execute on the \plc{host} only. This ensures that no +attempt will be made to create a device version of the +function. This feature may be necessary if the function +exists in another compile unit. + + +\cexample{target_reverse_offload}{7} + +\ffreeexample{target_reverse_offload}{7} diff --git a/Examples_target_data.tex b/Examples_target_data.tex index b0ec8ad..240b394 100644 --- a/Examples_target_data.tex +++ b/Examples_target_data.tex @@ -16,6 +16,7 @@ environment. \cexample{target_data}{1} +\pagebreak The Fortran code passes a reference and specifies the extent of the arrays in the declaration. No length information is necessary in the map clause, as is required with C/C++ pointers. @@ -49,20 +50,16 @@ once by the \code{target} \code{data} construct. \ffreeexample{target_data}{2} -In the following example, the variable tmp defaults to \code{tofrom} map-type -and is mapped at each \code{target} construct. The array \plc{Q} is mapped once at -the enclosing \code{target} \code{data} region instead of at each \code{target} -construct. +In the following example, the array \plc{Q} is mapped once at the enclosing +\code{target}~\code{data} region instead of at each \code{target} construct. +In OpenMP 4.0, a scalar variable is implicitly mapped with the \code{tofrom} map-type. +But since OpenMP 4.5, a scalar variable, such as the \plc{tmp} variable, has to be explicitly mapped with +the \code{tofrom} map-type at the first \code{target} construct in order to return +its reduced value from the parallel loop construct to the host. +The variable defaults to firstprivate at the second \code{target} construct. \cexample{target_data}{3} -In the following example the arrays \plc{v1} and \plc{v2} are mapped at each \code{target} -construct. 
Instead of mapping the array \plc{Q} twice at each \code{target} construct, -\plc{Q} is mapped once by the \code{target} \code{data} construct. Note, the \plc{tmp} -variable is implicitly remapped for each \code{target} region, mapping the value -from the device to the host at the end of the first \code{target} region, and -from the host to the device for the second \code{target} region. - \ffreeexample{target_data}{3} \subsection{\code{target} \code{data} Construct with Orphaned Call} @@ -145,6 +142,7 @@ of the \code{target} constructs. \cexample{target_data}{6} +\pagebreak The \code{if} clauses work the same way for the following Fortran code. The \code{target} constructs enclosed in the \code{target} \code{data} region should also use an \code{if} clause with the same condition, so that the \code{target} \code{data} @@ -153,6 +151,7 @@ are both ignored. \ffreeexample{target_data}{6} +\pagebreak In the following example, when the \code{if} clause conditional expression on the \code{target} construct evaluates to \plc{false}, the target region will execute on the host device. However, the \code{target} \code{data} construct @@ -164,6 +163,7 @@ the \code{target} \code{data} construct, resulting in undefined values in \plc{p \cexample{target_data}{7} +\pagebreak The \code{if} clauses work the same way for the following Fortran code. When the \code{if} clause conditional expression on the \code{target} construct evaluates to \plc{false}, the \code{target} region will execute on the host diff --git a/Examples_target_mapper.tex b/Examples_target_mapper.tex new file mode 100644 index 0000000..2e42d09 --- /dev/null +++ b/Examples_target_mapper.tex @@ -0,0 +1,86 @@ +\pagebreak +\section{ \code{declare mapper} Construct} +\label{sec:declare_mapper} + +The following examples show how to use the \code{declare mapper} +directive to prescribe a map for later use. +It is also quite useful for pre-defining partitioned and nested +structure elements. + +In the first example the \code{declare mapper} directive specifies +that any structure of type \plc{myvec\_t} for which implicit data-mapping +rules apply will be mapped according to its \code{map} clause. +The variable \plc{v} is used for referencing the structure and its +elements within the \code{map} clause. +Within the \code{map} clause the \plc{v} variable specifies that all +elements of the structure are to be mapped. Additionally, the +array section \plc{v.data[0:v.len]} specifies that the dynamic +storage for data is to be mapped. + +Within the main program the \plc{s} variable is typed as \plc{myvec\_t}. +Since the variable is found within the target region and the type has a mapping prescribed by +a \code{declare mapper} directive, it will be automatically mapped according to its prescription: +full structure, plus the dynamic storage of the \plc{data} element. + +%Note: By default the mapping is \code{tofrom}. +%The associated Fortran allocatable \plc{data} array is automatically mapped with the derived +%type, it does not require an array section as in the C/C++ example. + +\cexample{target_mapper}{1} + +\ffreeexample{target_mapper}{1} + +\pagebreak +The next example illustrates the use of the \plc{mapper-identifier} and deep copy within a structure. +The structure, \plc{dzmat\_t}, represents a complex matrix, +with separate real (\plc{r\_m}) and imaginary (\plc{i\_m}) elements. +Two map identifiers are created for partitioning the \plc{dzmat\_t} structure. 
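+The following sketch suggests how one such named mapper might be declared
+and applied (the dimensions, the structure layout, and the driver routine
+are assumptions here, not the example source):
+
+\begin{verbatim}
+#define N 64
+typedef struct dzmat { double r_m[N][N], i_m[N][N]; } dzmat_t;
+
+/* top_id maps only the upper halves of the two member arrays */
+#pragma omp declare mapper( top_id : dzmat_t v ) \
+            map( v.r_m[0:N/2][0:N], v.i_m[0:N/2][0:N] )
+
+void zero_top_half(dzmat_t *a)
+{
+   #pragma omp target map(mapper(top_id), tofrom: a[0:1])
+   for (int i = 0; i < N/2; i++)
+      for (int j = 0; j < N; j++)
+      {  a->r_m[i][j] = 0.0;  a->i_m[i][j] = 0.0;  }
+}
+\end{verbatim}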
+ +For the C/C++ code the first identifier is named \plc{top\_id} and maps the top half of +two matrices of type \plc{dzmat\_t}; while the second identifier, \plc{bottom\_id}, +maps the lower half of two matrices. +Each identifier is applied to a different \code{target} construct, +as \code{map(mapper(top\_id), tofrom: a,b)} +and \code{map(mapper(bottom\_id), tofrom: a,b)}. +Each target offload is allowed to execute concurrently on two different devices +(\plc{0} and \plc{1}) through the \code{nowait} clause. +The OpenMP 5.0 \code{parallel master} construct creates a region of two threads +for these \code{target} constructs, with a single thread (\plc{master}) generator. + +The Fortran code uses the \plc{left\_id} and \plc{right\_id} map identifiers in the +\code{map(mapper(left\_id),tofrom: a,b)} and \code{map(mapper(right\_id),tofrom: a,b)} map clauses. +The array sections for these left and right contiguous portions of the matrices +were defined previously in the \code{declare mapper} directive. + +Note, the \plc{is} and \plc{ie} scalars are firstprivate +by default for a target region, but are declared firstprivate anyway +to remind the user of important firstprivate data-sharing properties required here. + +\cexample{target_mapper}{2} + +\ffreeexample{target_mapper}{2} + +\pagebreak +In the third example \plc{myvec} structures are +nested within a \plc{mypoints} structure. The \plc{myvec\_t} type is mapped +as in the first example. Following the \plc{mypoints} structure declaration, +the \plc{mypoints\_t} type is mapped by a \code{declare mapper} directive. +For this structure the \plc{hostonly\_data} element will not be mapped; +also the array section of \plc{x} (\plc{v.x[:1]}) and \plc{x} will be mapped; and +\plc{scratch} will be allocated and used as scratch storage on the device. +The default map-type mapping, \code{tofrom}, applies to the \plc{x} array section, +but not to \plc{scratch} which is explicitly mapped with the \code{alloc} map-type. +Note: the variable \plc{v} is not included in the map list (otherwise +the \plc{hostonly\_data} would be mapped)-- just the elements +to be mapped are listed. + +The two mappers are combined when a \plc{mypoints\_t} structure type is mapped, +because the mapper \plc{myvec\_t} structure type is used within a \plc{mypoints\_t} +type structure. +%Note, in the main program \plc{P} is an array of \plc{mypoints\_t} type structures, +%and hence every element of the array is mapped with the mapper prescription. + +\cexample{target_mapper}{3} + +\ffreeexample{target_mapper}{3} + diff --git a/Examples_target_offload.tex b/Examples_target_offload.tex new file mode 100644 index 0000000..ddd3926 --- /dev/null +++ b/Examples_target_offload.tex @@ -0,0 +1,46 @@ +\pagebreak +\section{Target Offload} +\label{sec:target_offload} + +In the OpenMP 5.0 implementation the \code{OMP\_TARGET\_OFFLOAD} +environment variable was defined to change \plc{default} offload behavior. +By \plc{default} the target code (region) is executed on the host if the target device +does not exist or the implementation does not support the target device. +%Last sentence uses words of the 5.0 spec pg. 21 lines 7-8 + +In an OpenMP 5.0 compliant implementation, setting the +\code{OMP\_TARGET\_OFFLOAD} variable to \code{MANDATORY} will +force the program to terminate execution when a \code{target} +construct is encountered and the target device is not supported or is not available. 
+With a value \code{DEFAULT} the target region will execute on a device if the +device exists and is supported by the implementation, +otherwise it will execute on the host. +Support for the \code{DISABLED} +value is optional; when it is supported the behavior is as if only the +host device exists (other devices are considered non-existent to the runtime), +and target regions are executed on the host. + +The following example reports execution behavior for different +values of the \code{OMP\_TARGET\_OFFLOAD} variable. A handy routine +for extracting the \code{OMP\_TARGET\_OFFLOAD} environment variable +value is deployed here, because the OpenMP API does not have a routine +for obtaining the value. %(\texit{yet}). + +Note: +The example issues a warning when a pre-5.0 implementation is used, +indicating that the \code{OMP\_TARGET\_OFFLOAD} is ignored. +The value of the \code{OMP\_TARGET\_OFFLOAD} variable is reported +when the \code{OMP\_DISPLAY\_ENV} +environment variable is set to \code{TRUE} or \code{VERBOSE}. + +%\pagebreak +\cexample{target_offload_control}{1} + +%\pagebreak +\ffreeexample{target_offload_control}{1} + + +% OMP 4.5 target offload 15:9-11 +%If the target device does not exist or the +%implementation does not support the target device, all target regions associated with that device +%execute on the host device. diff --git a/Examples_target_pointer_mapping.tex b/Examples_target_pointer_mapping.tex new file mode 100644 index 0000000..79798e7 --- /dev/null +++ b/Examples_target_pointer_mapping.tex @@ -0,0 +1,53 @@ +\pagebreak +\section{Pointer mapping} +\label{sec:pointer_mapping} + +The following example shows the basics of mapping pointers with and without +associated storage on the host. + +Storage for pointers \plc{ptr1} and \plc{ptr2} is created on the host. +To map storage that is associated with a pointer on the host, the data can be +explicitly mapped as an array section so that the compiler knows +the amount of data to be assigned in the device (to the "corresponding" data storage area). +On the \code{target} construct array sections are mapped; however, the pointer \plc{ptr1} +is mapped, while \plc{ptr2} is not. Since \plc{ptr2} is not explicitly mapped, it is +firstprivate. This creates a subtle difference in the way these pointers can be used. + +As a firstprivate pointer, \plc{ptr2} can be manipulated on the device; +however, as an explicitly mapped pointer, +\plc{ptr1} becomes an \emph{attached} pointer and cannot be manipulated. +In both cases the host pointer is not updated with the device pointer +address---as one would expect for distributed memory. +The storage data on the host is updated from the corresponding device +data at the end of the \code{target} region. + +As a comparison, note that the \plc{aray} array is automatically mapped, +since the compiler knows the extent of the array. + +The pointer \plc{ptr3} is used in the \code{target} region and has +a data-sharing attribute of firstprivate. +The pointer is implicitly mapped to a zero-length array section. +Neither the pointer address nor any +of its locally assigned data on the device is returned +to the host. + +\cexample{target_ptr_map}{1} + +In the following example the global pointer \plc{p} appears in a +\code{declare}~\code{target} directive. Hence, the pointer \plc{p} will +persist on the device throughout executions in all target regions. + +The pointer is also used in an array section of a \code{map} clause on +a \code{target} construct. 
When storage associated with +a \code{declare}~\code{target} pointer +is mapped, as for the array section \plc{p[:N]} in the +\code{target} construct, the array section on the device is \emph{attached} +to the device pointer \plc{p} on entry to the construct, and +the value of the device pointer \plc{p} becomes undefined on exit. +(Of course, storage allocation for +the array section on the device will occur before the +pointer on the device is \emph{attached}.) +% For globals with declare target is there such a things a +% original and corresponding? + +\cexample{target_ptr_map}{2} diff --git a/Examples_target_structure_mapping.tex b/Examples_target_structure_mapping.tex new file mode 100644 index 0000000..8166caf --- /dev/null +++ b/Examples_target_structure_mapping.tex @@ -0,0 +1,54 @@ +\pagebreak +\section{Structure mapping} +\label{sec:structure_mapping} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +In the example below, only structure elements \plc{S.a}, \plc{S.b} and \plc{S.p} +of the \plc{S} structure appear in \code{map} clauses of a \code{target} construct. +Only these components have corresponding variables and storage on the device. +Hence, the large arrays, \plc{S.buffera} and \plc{S.bufferb}, and the \plc{S.x} component have no storage +on the device and cannot be accessed. + +Also, since the pointer member \plc{S.p} is used in an array section of a +\code{map} clause, the array storage of the array section on the device, +\plc{S.p[:N]}, is \emph{attached} to the pointer member \plc{S.p} on the device. +Explicitly mapping the pointer member \plc{S.p} is optional in this case. + +Note: The buffer arrays and the \plc{x} variable have been grouped together, so that +the components that will reside on the device are all together (without gaps). +This allows the runtime to optimize the transfer and the storage footprint on the device. + +\cexample{target_struct_map}{1} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +The following example is a slight modification of the above example for +a C++ class. In the member function \plc{SAXPY::driver} +the array section \plc{p[:N]} is \emph{attached} to the pointer member \plc{p} +on the device. + +\cppexample{target_struct_map}{2} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%In this example a pointer, \plc{p}, is mapped in a +%\code{target}~\code{data} construct (\code{map(p)}) and remains +%persistent throughout the \code{target}~\code{data} region. The address stored +%on the host is not assigned to the device pointer variable, and +%the device value is not copied back to the host at the end of the +%region (for a pointer, it is as though \code{map(alloc:p}) is effectively +%used). The array section, \plc{p[:N]}, is mapped on both \code{target} +%constructs, and the pointer \plc{p} on the device is attached at the +%beginning and detached at the end of the regions to the newly created +%array section on the device. +% +%Also, in the following example the global variable, \plc{a}, becomes +%allocated when it is first used on the device in a \code{target} region, +%and persists on the device for all target regions. The value on the +%device and host may be different, as shown by the print statements. +%The values may be made consistent with the \code{update} construct, +%as shown in the \plc{declare\_target.3.c} and \plc{declare\_target.3.f90} +%examples. 
+% +%\cexample{target_struct_map}{2} diff --git a/Examples_target_unstructured_data.tex b/Examples_target_unstructured_data.tex index 82605c2..21c1641 100644 --- a/Examples_target_unstructured_data.tex +++ b/Examples_target_unstructured_data.tex @@ -24,6 +24,7 @@ construct occurs before the host data is deleted. \cppexample{target_unstructured_data}{1} +\pagebreak The following C code allocates and frees the data member of a Matrix structure. The \code{init\_matrix} function allocates the memory used in the structure and uses the \code{target}~\code{enter}~\code{data} directive to map it to the target device. The @@ -34,6 +35,7 @@ and then frees the memory on the host. Note, the stand-alone \cexample{target_unstructured_data}{1} +\pagebreak The following Fortran code allocates and deallocates a module array. The \code{initialize} subroutine allocates the module array and uses the \code{target}~\code{enter}~\code{data} directive to map it to the target device. The diff --git a/Examples_task_affinity.tex b/Examples_task_affinity.tex new file mode 100644 index 0000000..0542cbe --- /dev/null +++ b/Examples_task_affinity.tex @@ -0,0 +1,32 @@ +\section{Task Affinity} +\label{sec: task_affinity} + +The next example illustrates the use of the \code{affinity} +clause with a \code{task} construct. +The variables in the \code{affinity} clause provide a +hint to the runtime that the task should execute +"close" to the physical storage location of the variables. For example, +on a two-socket platform with a local memory component +close to each processor socket, the runtime will attempt to +schedule the task execution on the socket where the storage is located. + +Because the C/C++ code employs a pointer, an array section is used in +the \code{affinity} clause. +Fortran code can use an array reference to specify the storage, as +shown here. + +Note, in the second task of the C/C++ code the \plc{B} pointer is declared +shared. Otherwise, by default, it would be firstprivate since it is a local +variable, and would probably be saved for the second task before being assigned +a storage address by the first task. Also, one might think it reasonable to use +the \code{affinity} clause \plc{affinity(B[:N])} on the second \code{task} construct. +However, the storage behind \plc{B} is created in the first task, and the +array section reference may not be valid when the second task is generated. +The use of the \plc{A} array is sufficient for this case, because one +would expect the storage for \plc{A} and \plc{B} would be physically "close" +(as provided by the hint in the first task). + +\cexample{affinity}{6} + +\ffreeexample{affinity}{6} + diff --git a/Examples_task_dep.tex b/Examples_task_dep.tex index ea3f542..c26616b 100644 --- a/Examples_task_dep.tex +++ b/Examples_task_dep.tex @@ -5,7 +5,7 @@ \subsection{Flow Dependence} \label{subsec:task_flow_depend} -In this example we show a simple flow dependence expressed using the \code{depend} +This example shows a simple flow dependence using a \code{depend} clause on the \code{task} construct. \cexample{task_dep}{1} @@ -20,7 +20,7 @@ would have a race condition. \subsection{Anti-dependence} \label{subsec:task_anti_depend} -In this example we show an anti-dependence expressed using the \code{depend} +This example shows an anti-dependence using the \code{depend} clause on the \code{task} construct. \cexample{task_dep}{2} @@ -35,7 +35,7 @@ race condition. 
\subsection{Output Dependence} \label{subsec:task_out_depend} -In this example we show an output dependence expressed using the \code{depend} +This example shows an output dependence using the \code{depend} clause on the \code{task} construct. \cexample{task_dep}{3} @@ -47,6 +47,7 @@ clauses enforce the ordering of the tasks. If the \code{depend} clauses had been omitted, then the tasks could execute in any order and the program would have a race condition. +\pagebreak \subsection{Concurrent Execution with Dependences} \label{subsec:task_concurrent_depend} @@ -75,3 +76,141 @@ NxN elements, and the multiplication is implemented using blocks of BSxBS elemen \ffreeexample{task_dep}{5} +\subsection{\code{taskwait} with Dependences} +\label{subsec:taskwait_depend} + +In this subsection three examples illustrate how the +\code{depend} clause can be applied to a \code{taskwait} construct to make the +generating task wait for specific child tasks to complete. This is an OpenMP 5.0 feature. + In the same manner that +dependences can order executions among child tasks with \code{depend} clauses on +\code{task} constructs, the generating task can be scheduled to wait on child tasks +at a \code{taskwait} before it can proceed. + +Note: Since the \code{depend} clause on a \code{taskwait} construct relaxes the +default synchronization behavior (waiting for all children to finish), it is important to +realize that child tasks that are not predecessor tasks, as determined by the \code{depend} +clause of the \code{taskwait} construct, may be running concurrently while the +generating task is executing after the taskwait. + +In the first example the generating task waits at the \code{taskwait} construct +for the completion of the first child task because a dependence on the first task +is produced by \plc{x} with an \code{in} dependence type within the \code{depend} +clause of the \code{taskwait} construct. +Immediately after the first \code{taskwait} construct it is safe to access the +\plc{x} variable by the generating task, as shown in the print statement. +There is no completion restraint on the second child task. +Hence, immediately after the first \code{taskwait} it is unsafe to access the +\plc{y} variable since the second child task may still be executing. +The second \code{taskwait} ensures that the second child task has completed; hence +it is safe to access the \plc{y} variable in the following print statement. + +\cexample{task_dep}{6} + +\ffreeexample{task_dep}{6} + +In this example the first two tasks are serialized, because a dependence on +the first child is produced by \plc{x} with the \code{in} dependence type +in the \code{depend} clause of the second task. +However, the generating task at the first \code{taskwait} waits only on the +first child task to complete, because a dependence on only the first child task +is produced by \plc{x} with an \code{in} dependence type within the +\code{depend} clause of the \code{taskwait} construct. +The second \code{taskwait} (without a \code{depend} clause) is included +to guarantee completion of the second task before \plc{y} is accessed. +(While unnecessary, the \code{depend(inout:} \code{y)} clause on the 2nd child task is +included to illustrate how the child task dependences can be completely annotated +in a data-flow model.) + + +\cexample{task_dep}{7} + +\ffreeexample{task_dep}{7} + + +This example is similar to the previous one, except the generating task is +directed to also wait for completion of the second task. 
+ +The \code{depend} clause of the \code{taskwait} construct now includes an +\code{in} dependence type for \plc{y}. Hence the generating task must now +wait on completion of any child task having \plc{y} with an \code{out} +(here \code{inout}) dependence type in its \code{depend} clause. +So, the \code{depend} clause of the \code{taskwait} construct now constrains +the second task to complete at the \code{taskwait}, too. +%--both tasks must now complete execution at the \code{taskwait}. +(This change makes the second \code{taskwait} of the previous example unnecessary-- +it has been removed in this example.) + +Note: While a \code{taskwait} construct without a \code{depend} clause ensures that all child tasks have completed, a \code{depend} clause on a \code{taskwait} +construct waits only for specific child tasks (prescribed by the dependence type and list +items in the \code{taskwait}'s \code{depend} clause). +This and the previous example illustrate the need to carefully determine +the dependence type of variables in the \code{taskwait} \code{depend} clause +when selecting child tasks that the generating task must wait on, so that its execution after the +taskwait does not produce race conditions on variables accessed by non-completed child tasks. + +\cexample{task_dep}{8} + +\ffreeexample{task_dep}{8} + +\pagebreak +\subsection{Mutually Exclusive Execution with Dependences} +\label{subsec:task_dep_mutexinoutset} + +This example shows a series of tasks, including mutually exclusive +tasks, with dependences expressed using the \code{depend} clause on the +\code{task} construct. + +The program will always print~6. Tasks T1, T2 and T3 will be scheduled first, +in any order. Task T4 will be scheduled after tasks T1 and T2 are +completed. T5 will be scheduled after tasks T1 and T3 are completed. Due +to the \code{mutexinoutset} dependence type on \code{c}, T4 and T5 may be +scheduled in any order with respect to each other, but not at the same +time. Task T6 will be scheduled after both T4 and T5 are completed. + +\cexample{task_dep}{9} + +\ffreeexample{task_dep}{9} + +The following example demonstrates a situation where the \code{mutexinoutset} +dependence type is advantageous. If \code{shortTaskB} completes +before \code{longTaskA}, the runtime can take advantage of this by +scheduling \code{longTaskBC} before \code{shortTaskAC}. + +\cexample{task_dep}{10} + +\ffreeexample{task_dep}{10} + +\subsection{Multidependences Using Iterators} +\label{subsec:depend_iterator} + +The following example uses an iterator to define a dynamic number of +dependences. + +In the \code{single} construct of a parallel region a loop generates \plc{n} tasks +and each task has an \code{out} dependence specified through an element of +the \plc{v} array. This is followed by a single task that defines an \code{in} +dependence on each element of the array. This is accomplished by +using the \code{iterator} modifier in the \code{depend} clause, supporting a dynamic number +of dependences (\plc{n} here). + +The task for the \plc{print\_all\_elements} function is not executed until all dependences +prescribed (or registered) by the iterator are fulfilled; that is, +after all the tasks generated by the loop have completed. + +Note, one cannot simply use an array section in the \code{depend} clause +of the second task construct because this would violate the \code{depend} clause restriction: + +"List items used in \code{depend} clauses of the same task or sibling tasks +must indicate identical storage locations or disjoint storage locations".
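+A sketch of this pattern follows (the helper routines are hypothetical;
+the referenced example sources give the complete version). The commented
+array-section form corresponds to the nonconforming alternative discussed
+next:
+
+\begin{verbatim}
+void post_and_print(int *v, int n)
+{
+   #pragma omp parallel
+   #pragma omp single
+   {
+      for (int i = 0; i < n; i++) {
+         #pragma omp task depend(out: v[i]) firstprivate(i)
+         v[i] = compute_element(i);            /* hypothetical helper */
+      }
+
+      /* #pragma omp task depend(in: v[0:n])    nonconforming (see text) */
+      #pragma omp task depend(iterator(it = 0:n), in: v[it])
+      print_all_elements(v, n);                 /* hypothetical helper */
+   }
+}
+\end{verbatim}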
+ +In this case each of the loop tasks uses a single disjoint (different storage) +element in its \code{depend} clause; however, +the array-section storage area prescribed in the commented directive is neither +identical to nor disjoint from the storage prescribed by the elements of the +loop tasks. The iterator overcomes this restriction by effectively +creating \plc{n} disjoint storage areas. + +\cexample{task_dep}{11} + +\ffreeexample{task_dep}{11} diff --git a/Examples_taskgroup.tex b/Examples_taskgroup.tex index b098af7..3e06831 100644 --- a/Examples_taskgroup.tex +++ b/Examples_taskgroup.tex @@ -11,7 +11,7 @@ is started (the task executing the root of the recursive \code{compute\_tree()} calls). While synchronizing tasks at the end of each tree traversal, using the \code{taskgroup} construct ensures that the formerly started background task does not participate in the synchronization, and is left free to execute in parallel. -This is opposed to the behaviour of the \code{taskwait} construct, which would +This is opposed to the behavior of the \code{taskwait} construct, which would include the background tasks in the synchronization. \cexample{taskgroup}{1} diff --git a/Examples_tasking.tex b/Examples_tasking.tex index 891be2f..e926c42 100644 --- a/Examples_tasking.tex +++ b/Examples_tasking.tex @@ -53,7 +53,7 @@ and start executing unassigned tasks. Once the number of unassigned tasks is su low, the thread may resume execution of the task generating loop. \cexample{tasking}{5} -\pagebreak + \fexample{tasking}{5} The following example is the same as the previous one, except that the tasks are diff --git a/Examples_taskloop.tex b/Examples_taskloop.tex index 04d7aeb..9a0ed7c 100644 --- a/Examples_taskloop.tex +++ b/Examples_taskloop.tex @@ -12,3 +12,28 @@ The \code{nogroup} clause removes the implicit taskgroup of the \code{taskloop} \cexample{taskloop}{1} \ffreeexample{taskloop}{1} + +%\clearpage + +Because a \code{taskloop} construct encloses a loop, it is often incorrectly +perceived as a worksharing construct (when it is directly nested in +a \code{parallel} region). + +While a worksharing construct distributes the loop iterations across all threads in a team, +the entire loop of a \code{taskloop} construct is executed by every thread of the team. + +In the example below the first taskloop occurs closely nested within +a \code{parallel} region and the entire loop is executed by each of the \plc{T} threads; +hence the reduction sum is executed \plc{T}*\plc{N} times. + +The loop of the second taskloop is within a \code{single} region and is executed +by a single thread so that only \plc{N} reduction sums occur. (The other +\plc{N}-1 threads of the \code{parallel} region will participate in executing the +tasks. This is the common use case for the \code{taskloop} construct.) + +In the example, the code thus prints \code{x1 = 16384} (\plc{T}*\plc{N}) and +\code{x2 = 1024} (\plc{N}). + +\cexample{taskloop}{2} + +\ffreeexample{taskloop}{2} diff --git a/Examples_udr.tex b/Examples_udr.tex new file mode 100644 index 0000000..a0c57ae --- /dev/null +++ b/Examples_udr.tex @@ -0,0 +1,89 @@ +\subsection{User-Defined Reduction} +\label{subsec:UDR} + +The \code{declare}~\code{reduction} directive can be used to specify +user-defined reductions (UDR) for user data types. + +%The following examples show how user-defined reductions can be used to support user data types in the \code{reduction} clause. + +%The following example computes the enclosing rectangle of a set of points.
The point data structure (\code{struct}~\code{point}) is not supported by the \code{reduction} clause. Using two \code{declare}~\code{reduction} directives we define how a reduction for the point data structure is done for the \plc{min} and \plc{max} operations. Each \code{declare}~\code{reduction} directive calls the appropriate function that passes the two special variables that can be used in the user-defined reduction expression: \code{omp\_in}, which holds one of the two values to reduce, and \code{omp\_out}, which holds the other value and should hold also the result of the reduction once the expression has been executed. Note, also, that when defining the user-defined reduction for \plc{min} we specify how the private variables of each thread are to be initialized (that is, the neutral value). This is not the case for \plc{max} as the default values (that is, zero filling) are already adequate. + + +In the following example, \code{declare}~\code{reduction} directives are used to define +\plc{min} and \plc{max} operations for the \plc{point} data structure for computing +the rectangle that encloses a set of 2-D points. + +Each \code{declare}~\code{reduction} directive defines new reduction identifiers, +\plc{min} and \plc{max}, to be used in a \code{reduction} clause. The next item in the +declaration list is the data type (\plc{struct} \plc{point}) used in the reduction, +followed by the combiner, here the functions \plc{minproc} and \plc{maxproc} perform +the min and max operations, respectively, on the user data (of type \plc{struct} \plc{point}). +In the function argument list are two special OpenMP variable identifiers, \code{omp\_in} and \code{omp\_out}, +that denote the two values to be combined in the "real" function; +the \code{omp\_out} identifier indicates which one is to hold the result. + +The initializer of the \code{declare}~\code{reduction} directive specifies +the initial value for the private variable of each implicit task. +The \code{omp\_priv} identifier is used to denote the private variable. + +\cexample{udr}{1} + +The following example shows the corresponding code in Fortran. +The \code{declare}~\code{reduction} directives are specified as part of +the declaration in subroutine \plc{find\_enclosing\_rectangle} and +the procedures that perform the min and max operations are specified as subprograms. + +\ffreeexample{udr}{1} + + +The following example shows the same computation as \plc{udr.1} but it illustrates that you can craft complex expressions in the user-defined reduction declaration. In this case, instead of calling the \plc{minproc} and \plc{maxproc} functions we inline the code in a single expression. + +\cexample{udr}{2} + +The corresponding code of the same example in Fortran is very similar +except that the assignment expression in the \code{declare}~\code{reduction} +directive can only be used for a single variable, in this case through +a type structure constructor \plc{point($\ldots$)}. + +\ffreeexample{udr}{2} + + +The following example shows the use of special variables in arguments for combiner (\code{omp\_in} and \code{omp\_out}) and initializer (\code{omp\_priv} and \code{omp\_orig}) routines. This example returns the maximum value of an array and the corresponding index value. The \code{declare}~\code{reduction} directive specifies a user-defined reduction operation \plc{maxloc} for data type \plc{struct} \plc{mx\_s}. The function \plc{mx\_combine} is the combiner and the function \plc{mx\_init} is the initializer. 
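+A rough sketch of such a declaration follows (member names and function
+bodies are assumed here; the referenced \plc{udr.3} source provides the
+complete version):
+
+\begin{verbatim}
+struct mx_s { double value; int index; };
+
+void mx_combine(struct mx_s *out, struct mx_s *in)
+{
+   if (in->value > out->value)
+   {  out->value = in->value;  out->index = in->index;  }
+}
+
+void mx_init(struct mx_s *priv, struct mx_s *orig)
+{
+   priv->value = orig->value;  priv->index = orig->index;
+}
+
+#pragma omp declare reduction( maxloc : struct mx_s :          \
+                               mx_combine(&omp_out, &omp_in) ) \
+                    initializer( mx_init(&omp_priv, &omp_orig) )
+\end{verbatim}
+Such a reduction identifier could then be used in a clause of the form
+\code{reduction(maxloc:mx)} on a \code{parallel}~\code{for} construct.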
+ +\cexample{udr}{3} + +Below is the corresponding Fortran version of the above example. The \code{declare}~\code{reduction} directive specifies the user-defined operation \plc{maxloc} for user-derived type \plc{mx\_s}. The combiner \plc{mx\_combine} and the initializer \plc{mx\_init} are specified as subprograms. + +\ffreeexample{udr}{3} + + +The following example explains a few details of the user-defined reduction +in Fortran through modules. The \code{declare}~\code{reduction} directive is declared in a module (\plc{data\_red}). +The reduction identifier \plc{.add.} is a user-defined operator, defined so that it is +accessible in the scope that performs the reduction +operation. +The user-defined operator \plc{.add.} and the subroutine \plc{dt\_init} specified in the \code{initializer} clause are defined in the same subprogram. + +The reduction operation (that is, the \code{reduction} clause) is in the main program. +The reduction identifier \plc{.add.} is accessible by use association. +Since \plc{.add.} is a user-defined operator, the explicit interface +should also be accessible by use association in the current +program unit. +Since the \code{declare}~\code{reduction} directive associated with this \code{reduction} clause +has the \code{initializer} clause, the subroutine specified on the clause +must be accessible in the current scoping unit. In this case, +the subroutine \plc{dt\_init} is accessible by use association. + +\ffreeexample{udr}{4} + + +The following example uses user-defined reductions to declare a plus (+) reduction for a C++ class. As the \code{declare}~\code{reduction} directive is inside the context of the \plc{V} class, the expressions in the \code{declare}~\code{reduction} directive are resolved in the context of the class. Also, note that the \code{initializer} clause uses a copy constructor to initialize the private variables of the reduction, and it passes the original variable as its parameter by using the special variable \code{omp\_orig}. + +\cppexample{udr}{5} + +The following example shows how user-defined reductions can be defined for some STL containers. The first \code{declare}~\code{reduction} defines the plus (+) operation for \plc{std::vector} by making use of the \plc{std::transform} algorithm. The second and third define the merge (or concatenation) operation for \plc{std::vector} and \plc{std::list}. +%It shows how the same user-defined reduction operation can be defined to be done differently depending on the specified data type. +It shows how the user-defined reduction operation can be applied to specific data types of the STL. + +\cppexample{udr}{6} + diff --git a/Examples_variant.tex b/Examples_variant.tex new file mode 100644 index 0000000..24e2249 --- /dev/null +++ b/Examples_variant.tex @@ -0,0 +1,77 @@ +\pagebreak +\section{\code{declare}~\code{variant} Directive} +\label{sec:declare_variant} + +%A \code{declare variant} directive specifies that the following function is an alternate function, +%a \plc{function variant}, to be used in place of the specified \plc{base function} +%when the trait within the \code{match} clause has a valid context. + +A \code{declare}~\code{variant} directive specifies an alternate function, +a \plc{function variant}, to be used in place of the \plc{base function} +%when the trait within the \code{match} clause has a valid context. +when the trait within the \code{match} clause matches the OpenMP context at a given call site. +The base function follows the directive in the C and C++ languages.
+In Fortran, either a subroutine or function may be used as the \plc{base function}, +and the \code{declare}~\code{variant} directive must be in the specification +part of a subroutine or function (unless a \plc{base-proc-name} +modifier is used, as in the case of a procedure declaration statement). See +the OpenMP 5.0 Specification for details on the modifier. + +When multiple \code{declare}~\code{variant} directives are used +a function variant becomes a candidate for replacing the base function if the +%base function call context matches the traits of all selectors in the \code{match} clause. +context at the base function call matches the traits of all selectors in the \code{match} clause. +If there are multiple candidates, a score is assigned with rules for each +of the selector traits. The scoring algorithm can be found in the OpenMP 5.0 Specification. + +In the first example the \plc{vxv()} function is called within a \code{parallel} region, +a \code{target} region, and in a sequential part of the program. Two function variants, \plc{p\_vxv()} and \plc{t\_vxv()}, +are defined for the first two regions by using \plc{parallel} and \plc{target} selectors (within +the \plc{construct} trait set) in a \code{match} clause. The \plc{p\_vxv()} function variant includes +a \code{for} construct (\code{do} construct for Fortran) for the \code{parallel} region, +while \plc{t\_vxv()} includes a \code{distribute}~\code{simd} construct for the \code{target} region. +The \plc{t\_vxv()} function is explicitly compiled for the device using a \code{declare}~\code{target} directive. + +Since the two \code{declare}~\code{variant} directives have no selectors that match traits for the context +of the base function call in the sequential part of the program, the base \plc{vxv()} function is used there, +as expected. +(The vectors in the \plc{p\_vxv} and \plc{t\_vxv} functions have been multiplied +by 3 and 2, respectively, for checking the validity of the replacement. Normally +the purpose of a function variant is to produce the same results by a different method.) + +%Note: a \code{target teams} construct is used to direct execution onto a device, with a +%\code{distribute simd} construct in the function variant. As of the OpenMP 5.0 implementation +%no intervening code is allowed between a \code{target} and \code{teams} construct. So +%using a \code{target} construct to direct execution onto a device, and including +%\code{teams distribute simd} in the variant function would produce non conforming code. + +%\pagebreak +\cexample{declare_variant}{1} + +\ffreeexample{declare_variant}{1} + + +%\pagebreak + +In this example, traits from the \plc{device} set are used to select a function variant. +In the \code{declare}~\code{variant} directive, an \plc{isa} selector +specifies that if the implementation of the ``\plc{core-avx512}'' +instruction set is detected at compile time the \plc{avx512\_saxpy()} +variant function is used for the call to \plc{base\_saxpy()}. + +A compilation of \plc{avx512\_saxpy()} is aware of +the AVX-512 instruction set that supports 512-bit vector extensions (for Xeon or Xeon Phi architectures). +Within \plc{avx512\_saxpy()}, the \code{parallel}~\code{for}~\code{simd} construct performs parallel execution, and +takes advantage of 64-byte data alignment. +When the \plc{avx512\_saxpy()} function variant is not selected, the base \plc{base\_saxpy()} function variant +containing only a basic \code{parallel}~\code{for} construct is used for the call to \plc{base\_saxpy()}. 
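+A condensed sketch of this arrangement (function signatures and the
+\code{simdlen} and \code{aligned} values are assumptions, not copied from
+the example sources):
+
+\begin{verbatim}
+void avx512_saxpy(int n, float s, float *x, float *y)
+{
+   #pragma omp parallel for simd simdlen(16) aligned(x, y: 64)
+   for (int i = 0; i < n; i++) y[i] = s*x[i] + y[i];
+}
+
+#pragma omp declare variant( avx512_saxpy ) \
+            match( device={isa("core-avx512")} )
+void base_saxpy(int n, float s, float *x, float *y)
+{
+   #pragma omp parallel for
+   for (int i = 0; i < n; i++) y[i] = s*x[i] + y[i];
+}
+\end{verbatim}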
+ +%Note: +%An allocator is used to set the alignment to 64 bytes when an OpenMP compilation is performed. +%Details about allocator variable declarations and functions +%can be found in the allocator example of the Memory Management Chapter. + +%\pagebreak +\cexample{declare_variant}{2} + +\ffreeexample{declare_variant}{2} diff --git a/Foreword_Chapt.tex b/Foreword_Chapt.tex new file mode 100644 index 0000000..14e1d46 --- /dev/null +++ b/Foreword_Chapt.tex @@ -0,0 +1,23 @@ +\pagebreak +\chapter*{Foreword} +\label{chap:foreword} +\addcontentsline{toc}{chapter}{\protect\numberline{}Foreword} + +The OpenMP Examples document has been updated with new features +found in the OpenMP 5.0 Specification. The additional examples and updates +are referenced in the Document Revision History of the Appendix, \specref{sec:history_45_to_50}. + +Text describing an example with a 5.0 feature specifically states +that support for the feature begins in the OpenMP 5.0 Specification. Also, +an \plc{omp\_5.0} keyword has been added to the metadata in the source code. +These distinctions are presented to remind readers that a 5.0-compliant +OpenMP implementation is necessary to use these features in code. + +Examples for most of the 5.0 features are included in this document, +and incremental releases will become available as more feature examples +and updates are submitted and approved by the OpenMP Examples Subcommittee. + +\bigskip +Examples Subcommittee Co-chairs: \smallskip\linebreak +Henry Jin (\textsc{NASA} Ames Research Center) \linebreak +Kent Milfeld (\textsc{TACC}, Texas Advanced Computing Center) diff --git a/History.tex b/History.tex index 477a551..665a42e 100644 --- a/History.tex +++ b/History.tex @@ -1,39 +1,84 @@ \chapter{Document Revision History} \label{chap:history} +\section{Changes from 4.5.0 to 5.0.0} +\label{sec:history_45_to_50} + +\begin{itemize} +\item Added the following examples for the 5.0 features: + +\begin{itemize} +\item Extended \code{teams} construct for host execution (\specref{sec:host_teams}) +\item \code{loop} and \code{teams}~\code{loop} constructs specify loop iterations that can execute concurrently + (\specref{sec:loop}) +\item Task data affinity is indicated by \code{affinity} clause of \code{task} construct + (\specref{sec: task_affinity}) +\item Display thread affinity with \code{OMP\_DISPLAY\_AFFINITY} environment variable or \code{omp\_display\_affinity()} API routine + (\specref{sec:affinity_display}) +\item \code{taskwait} with dependences (\specref{subsec:taskwait_depend}) +\item \code{mutexinoutset} task dependences (\specref{subsec:task_dep_mutexinoutset}) +\item Multidependence Iterators (in \code{depend} clauses) (\specref{subsec:depend_iterator}) +\item Combined constructs: \code{parallel}~\code{master}~\code{taskloop} and \code{parallel}~\code{master}~\code{taskloop}~\code{simd} + (\specref{sec:parallel_master_taskloop}) +\item Reverse Offload through \plc{ancestor} modifier of \code{device} clause
(\specref{subsec:target_reverse_offload}) +\item Array Shaping with the \plc{shape-operator} (\specref{sec:array-shaping}) +\item The \code{declare}~\code{mapper} construct (\specref{sec:declare_mapper}) +\item Acquire and Release Semantics Synchronization: Memory ordering + clauses \code{acquire}, \code{release}, and \code{acq\_rel} were added + to flush and atomic constructs + (\specref{sec:acquire_and_release_semantics}) +\item \code{depobj} construct provides dependence objects for subsequent use in \code{depend} clauses + (\specref{sec:depobj}) +\item \code{reduction} clause for \code{task} construct (\specref{subsec:task_reduction}) +\item \code{reduction} clause for \code{taskloop} construct (\specref{subsec:taskloop_reduction}) +\item \code{reduction} clause for \code{taskloop}~\code{simd} construct (\specref{subsec:taskloop_reduction}) +\item Memory Allocators for making OpenMP memory requests with traits (\specref{sec:allocators}) +\item \code{requires} directive specifies required features of implementation (\specref{sec:requires}) +\item \code{declare}~\code{variant} directive - for function variants (\specref{sec:declare_variant}) +\item \code{metadirective} directive - for directive variants (\specref{sec:metadirective}) +\end{itemize} + +\item Included the following additional examples for the 4.x features: +\begin{itemize} +\item more taskloop examples (\specref{sec:taskloop}) +\item user-defined reduction (UDR) (\specref{subsec:UDR}) +\end{itemize} +\end{itemize} + \section{Changes from 4.0.2 to 4.5.0} \begin{itemize} \item Reorganized into chapters of major topics \item Included file extensions in example labels to indicate source type \item Applied the explicit \code{map(tofrom)} for scalar variables -in a number of examples to comply with -the change of the default behavior for scalar variables from -\code{map(tofrom)} to \code{firstprivate} in the 4.5 specification + in a number of examples to comply with + the change of the default behavior for scalar variables from + \code{map(tofrom)} to \code{firstprivate} in the 4.5 specification \item Added the following new examples: + \begin{itemize} -\item \code{linear} clause in loop constructs (\specref{sec:linear_in_loop}) -\item task priority (\specref{sec:task_priority}) -\item \code{taskloop} construct (\specref{sec:taskloop}) +\item \code{linear} clause in loop constructs (\specref{sec:linear_in_loop}) +\item \code{priority} clause for \code{task} construct (\specref{sec:task_priority}) +\item \code{taskloop} construct (\specref{sec:taskloop}) \item \plc{directive-name} modifier in multiple \code{if} clauses on -a combined construct (\specref{subsec:target_if}) -\item unstructured data mapping (\specref{sec:target_enter_exit_data}) +a combined construct (\specref{subsec:target_if}) +\item unstructured data mapping (\specref{sec:target_enter_exit_data}) \item \code{link} clause for \code{declare}~\code{target} directive -(\specref{subsec:declare_target_link}) + (\specref{subsec:declare_target_link}) \item asynchronous target execution with \code{nowait} clause (\specref{sec:async_target_exec_depend}) -\item device memory routines and device pointers -(\specref{subsec:target_mem_and_device_ptrs}) -\item doacross loop nest (\specref{sec:doacross}) -\item locks with hints (\specref{sec:locks}) -\item C/C++ array reduction (\specref{sec:reduction}) +\item device memory routines and device pointers (\specref{subsec:target_mem_and_device_ptrs}) +\item doacross loop nest (\specref{sec:doacross}) +\item locks with hints 
(\specref{sec:locks}) +\item C/C++ array reduction (\specref{subsec:reduction}) \item C++ reference types in data sharing clauses (\specref{sec:cpp_reference}) \end{itemize} + \end{itemize} \section{Changes from 4.0.1 to 4.0.2} \begin{itemize} \item Names of examples were changed from numbers to mnemonics -\item Added SIMD examples (\specref{sec:SIMD}) +\item Added SIMD examples (\specref{sec:SIMD}) \item Applied miscellaneous fixes in several source codes \item Added the revision history \end{itemize} @@ -42,27 +87,29 @@ a combined construct (\specref{subsec:target_if}) Added the following new examples: \begin{itemize} -\item the \code{proc\_bind} clause (\specref{sec:affinity}) +\item the \code{proc\_bind} clause (\specref{sec:affinity}) \item the \code{taskgroup} construct (\specref{sec:taskgroup}) \end{itemize} \section{Changes from 3.1 to 4.0} -Beginning with OpenMP 4.0, examples were placed in a separate document -from the specification document. - -Version 4.0 added the following new examples: \begin{itemize} -\item task dependences (\specref{sec:task_depend}) -\item \code{target} construct (\specref{sec:target}) -\item \code{target} \code{data} construct (\specref{sec:target_data}) -\item \code{target} \code{update} construct (\specref{sec:target_update}) -\item \code{declare} \code{target} construct (\specref{sec:declare_target}) -\item \code{teams} constructs (\specref{sec:teams}) -\item asynchronous execution of a \code{target} region using tasks - (\specref{subsec:async_target_with_tasks}) -\item array sections in device constructs (\specref{sec:array_sections}) -\item device runtime routines (\specref{sec:device}) -\item Fortran ASSOCIATE construct (\specref{sec:associate}) -\item cancellation constructs (\specref{sec:cancellation}) +\item Beginning with OpenMP 4.0, examples were placed in a separate document + from the specification document. +\item Version 4.0 added the following new examples: + +\begin{itemize} +\item task dependences (\specref{sec:task_depend}) +\item \code{target} construct (\specref{sec:target}) +\item \code{target}~\code{data} construct (\specref{sec:target_data}) +\item \code{target}~\code{update} construct (\specref{sec:target_update}) +\item \code{declare}~\code{target} construct (\specref{sec:declare_target}) +\item \code{teams} constructs (\specref{sec:teams}) +\item asynchronous execution of a \code{target} region using tasks (\specref{subsec:async_target_with_tasks}) +\item array sections in device constructs (\specref{sec:array_sections}) +\item device runtime routines (\specref{sec:device}) +\item Fortran ASSOCIATE construct (\specref{sec:associate}) +\item cancellation constructs (\specref{sec:cancellation}) +\end{itemize} + \end{itemize} diff --git a/Introduction_Chapt.tex b/Introduction_Chapt.tex index 2b493df..d042c4a 100644 --- a/Introduction_Chapt.tex +++ b/Introduction_Chapt.tex @@ -40,12 +40,6 @@ Memory Parallelization specifications, and is not part of the formal specificati assumes familiarity with the OpenMP specifications, and shares the typographical conventions used in that document. -\notestart -\noteheader ā€“ This first release of the OpenMP Examples reflects the OpenMP Version 4.5 -specifications. Additional examples are being developed and will be published in future -releases of this document. -\noteend - The OpenMP API specification provides a model for parallel programming that is portable across shared memory architectures from different vendors. Compilers from numerous vendors support the OpenMP API. 
diff --git a/Makefile b/Makefile index 3257577..12530b3 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,12 @@ # Makefile for the OpenMP Examples document in LaTex format. # For more information, see the master document, openmp-examples.tex. -version=4.5.0 +version=5.0.0 default: openmp-examples.pdf CHAPTERS=Title_Page.tex \ + Foreword_Chapt.tex \ Introduction_Chapt.tex \ Examples_*.tex \ History.tex diff --git a/README b/README index 1779c77..15c589d 100644 --- a/README +++ b/README @@ -27,7 +27,7 @@ For copyright information, please see omp_copyright.txt. 2) Tags (meta data) for example sources - @@name: .[c|f] + @@name: .[c|cpp|f|f90] @@type: C|C++|F-fixed|F-free @@compilable: yes|no|maybe @@linkable: yes|no|maybe @@ -47,12 +47,16 @@ For copyright information, please see omp_copyright.txt. 3) LaTeX macros for examples - Source code with language h-rules - \cexample{}{c} - \fexample{}{f} + \cexample{}{} % for C/C++ examples + \cppexample{}{} % for C++ examples + \fexample{}{} % for fixed-form Fortran examples + \ffreeexample{}{} % for free-form Fortran examples - Source code without language h-rules - \cnexample{}{c} - \fnexample{}{f} + \cnexample{}{} + \cppnexample{}{} + \fnexample{}{} + \ffreenexample{}{} - Language h-rules \cspecificstart, \cspecificend diff --git a/Title_Page.tex b/Title_Page.tex index 4040e22..13e69aa 100644 --- a/Title_Page.tex +++ b/Title_Page.tex @@ -17,17 +17,17 @@ \vspace{1.0in} - \textbf{Version \VER{} -- \VERDATE} + \textbf{Version \PVER{} -- \VERDATE} \end{center} \end{adjustwidth} \vspace{2.3in} %was 3.0 -Source codes for OpenMP \VER{} Examples can be downloaded from +Source codes for OpenMP \PVER{} Examples can be downloaded from \href{https://github.com/OpenMP/Examples/tree/v\VER}{github}.\\ \begin{adjustwidth}{0pt}{1em}\setlength{\parskip}{0.25\baselineskip}% -Copyright Ā© 1997-2016 OpenMP Architecture Review Board.\\ +Copyright Ā© 1997-2019 OpenMP Architecture Review Board.\\ Permission to copy without fee all or part of this material is granted, provided the OpenMP Architecture Review Board copyright notice and the title of this document appear. Notice is given that copying is by @@ -42,7 +42,9 @@ permission of OpenMP Architecture Review Board.\end{adjustwidth} \phantom{a} \emph{This page intentionally left blank} -%This working version enacted the following tickets: 180, 295, 299, 342, 381, +%For final version, uncomment the line above, comment out the lines below +%This working version enacted the following tickets: 287, 519, 550, 593, +%674, 688, 689, %and a few other editorial changes. \vfill diff --git a/omp_copyright.txt b/omp_copyright.txt index c81aa8f..226040e 100644 --- a/omp_copyright.txt +++ b/omp_copyright.txt @@ -1,4 +1,4 @@ -Copyright (c) 1997-2016 OpenMP Architecture Review Board. +Copyright (c) 1997-2019 OpenMP Architecture Review Board. All rights reserved. Permission to redistribute and use without fee all or part of the source diff --git a/openmp-example.tex b/openmp-example.tex new file mode 100644 index 0000000..acd49a7 --- /dev/null +++ b/openmp-example.tex @@ -0,0 +1,83 @@ +% Welcome to openmp-examples.tex. +% This is the master LaTex file for the OpenMP Examples document. 
+% +% The files in this set include: +% +% openmp-examples.tex - this file, the master file +% Makefile - makes the document +% openmp.sty - the main style file +% Title_Page.tex - the title page +% openmplogo.png - the logo +% Introduction_Chapt.tex - unnumbered introductory chapter +% Examples_Chapt.tex - unnumbered chapter +% Examples_Sects.tex - examples +% sources/*.c, *.f - C/C++/Fortran example source files +% +% When editing this file: +% +% 1. To change formatting, appearance, or style, please edit openmp.sty. +% +% 2. Custom commands and macros are defined in openmp.sty. +% +% 3. Be kind to other editors -- keep a consistent style by copying-and-pasting to +% create new content. +% +% 4. We use semantic markup, e.g. (see openmp.sty for a full list): +% \code{} % for bold monospace keywords, code, operators, etc. +% \plc{} % for italic placeholder names, grammar, etc. +% +% 5. Other recommendations: +% Use the convenience macros defined in openmp.sty for the minor headers +% such as Comments, Syntax, etc. +% +% To keep items together on the same page, prefer the use of +% \begin{samepage}.... Avoid \parbox for text blocks as it interrupts line numbering. +% When possible, avoid \filbreak, \pagebreak, \newpage, \clearpage unless that's +% what you mean. Use \needspace{} cautiously for troublesome paragraphs. +% +% Avoid absolute lengths and measures in this file; use relative units when possible. +% Vertical space can be relative to \baselineskip or ex units. Horizontal space +% can be relative to \linewidth or em units. +% +% Prefer \emph{} to italicize terminology, e.g.: +% This is a \emph{definition}, not a placeholder. +% This is a \plc{var-name}. +% + +% The following says letter size, but the style sheet may change the size +\documentclass[10pt,letterpaper,twoside,makeidx,hidelinks]{scrreprt} + +% Text to appear in the footer on even-numbered pages: +\newcommand{\VER}{5.0.0} +\newcommand{\PVER}{\VER{}p1} +\newcommand{\VERDATE}{February 2018} +\newcommand{\footerText}{OpenMP Examples Version \PVER{} - \VERDATE} + +% Unified style sheet for OpenMP documents: +\input{openmp.sty} + + +\begin{document} + \pagenumbering{roman} + + \setcounter{page}{0} + \setcounter{tocdepth}{2} + + + % Uncomment the next line to enable line numbering on the main body text: + \linenumbers\pagewiselinenumbers + + \newpage\pagenumbering{arabic} + + \setcounter{chapter}{0} % start chapter numbering here + + % \input{Chap_Single} + \input{Example} + + %\setcounter{chapter}{0} % restart chapter numbering with "letter A" + %\renewcommand{\thechapter}{\Alph{chapter}}% + %\appendix + %\input{History} + +\end{document} + diff --git a/openmp-examples.tex b/openmp-examples.tex index 7cf79cf..dc7f8ef 100644 --- a/openmp-examples.tex +++ b/openmp-examples.tex @@ -8,6 +8,7 @@ % openmp.sty - the main style file % Title_Page.tex - the title page % openmplogo.png - the logo +% Forward_Chapt.tex - unnumbered introductory chapter % Introduction_Chapt.tex - unnumbered introductory chapter % Examples_Chapt.tex - unnumbered chapter % Examples_Sects.tex - examples @@ -48,9 +49,10 @@ \documentclass[10pt,letterpaper,twoside,makeidx,hidelinks]{scrreprt} % Text to appear in the footer on even-numbered pages: -\newcommand{\VER}{4.5.0} -\newcommand{\VERDATE}{November 2016} -\newcommand{\footerText}{OpenMP Examples Version \VER{} - \VERDATE} +\newcommand{\VER}{5.0.0} +\newcommand{\PVER}{\VER{}} +\newcommand{\VERDATE}{November 2019} +\newcommand{\footerText}{OpenMP Examples Version \PVER{} - \VERDATE} % Unified style sheet for 
OpenMP documents: \input{openmp.sty} @@ -70,6 +72,8 @@ % Uncomment the next line to enable line numbering on the main body text: \linenumbers\pagewiselinenumbers + \input{Foreword_Chapt} + \newpage\pagenumbering{arabic} \input{Introduction_Chapt} @@ -80,6 +84,7 @@ \input{Chap_parallel_execution} \input{Examples_ploop} \input{Examples_parallel} + \input{Examples_host_teams} \input{Examples_nthrs_nesting} \input{Examples_nthrs_dynamic} \input{Examples_fort_do} @@ -92,12 +97,15 @@ \input{Examples_single} \input{Examples_workshare} \input{Examples_master} + \input{Examples_loop} \input{Examples_pra_iterator} \input{Examples_set_dynamic_nthrs} \input{Examples_get_nthrs} \input{Chap_affinity} \input{Examples_affinity} + \input{Examples_task_affinity} + \input{Examples_affinity_display} \input{Examples_affinity_query} \input{Chap_tasking} @@ -107,9 +115,15 @@ \input{Examples_taskgroup} \input{Examples_taskyield} \input{Examples_taskloop} + \input{Examples_parallel_master_taskloop} \input{Chap_devices} \input{Examples_target} + \input{Examples_target_pointer_mapping} + \input{Examples_target_structure_mapping} + \input{Examples_array_sections} + \input{Examples_array_shaping} + \input{Examples_target_mapper} \input{Examples_target_data} \input{Examples_target_unstructured_data} \input{Examples_target_update} @@ -122,8 +136,8 @@ %New subsection \input{Examples_async_target_nowait} \input{Examples_async_target_nowait_depend} - \input{Examples_array_sections} - % Structure Element in map 487 + % \input{Examples_array_sections} moved after struct_ptr_map + % Structure Element in map 487 no 579 \input{Examples_device} % MemoryRoutine and Device ptr 473 @@ -140,7 +154,9 @@ \input{Examples_atomic} \input{Examples_atomic_restrict} \input{Examples_flush_nolist} + \input{Examples_acquire_release} \input{Examples_ordered} + \input{Examples_depobj} % Doacross loop 405 \input{Examples_doacross} \input{Examples_locks} @@ -165,7 +181,7 @@ \input{Examples_lastprivate} \input{Examples_reduction} % User UDR 287 - % C array reduction 377 + \input{Examples_udr} \input{Examples_copyin} \input{Examples_copyprivate} \input{Examples_cpp_reference} @@ -174,6 +190,7 @@ \input{Chap_memory_model} \input{Examples_mem_model} + \input{Examples_allocators} \input{Examples_fort_race} \input{Chap_program_control} @@ -182,9 +199,13 @@ % If multi-ifs 471 \input{Examples_standalone} \input{Examples_cancellation} + \input{Examples_requires} + \input{Examples_variant} + \input{Examples_metadirective} % New Section Nested Regions - \input{Examples_nested_loop} - \input{Examples_nesting_restrict} + \input{Examples_nested_loop} + \input{Examples_nesting_restrict} + \input{Examples_target_offload} \setcounter{chapter}{0} % restart chapter numbering with "letter A" diff --git a/sources/Example_acquire_release.1.c b/sources/Example_acquire_release.1.c new file mode 100644 index 0000000..dd8e6af --- /dev/null +++ b/sources/Example_acquire_release.1.c @@ -0,0 +1,32 @@ +/* +* @@name: acquire_release.1.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ + +#include +#include + +int main() +{ + int x = 0, y = 0; + #pragma omp parallel num_threads(2) + { + int thrd = omp_get_thread_num(); + if (thrd == 0) { + x = 10; + #pragma omp critical + { y = 1; } + } else { + int tmp = 0; + while (tmp == 0) { + #pragma omp critical + { tmp = y; } + } + printf("x = %d\n", x); // always "x = 10" + } + } + return 0; +} diff --git a/sources/Example_acquire_release.1.f90 b/sources/Example_acquire_release.1.f90 new file mode 
100644 index 0000000..568fc24 --- /dev/null +++ b/sources/Example_acquire_release.1.f90 @@ -0,0 +1,29 @@ +! @@name: acquire_release.1.f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! @@expect: success + +program rel_acq_ex1 + use omp_lib + integer :: x, y, thrd, tmp + x = 0 + y = 0 + !$omp parallel num_threads(2) private(thrd, tmp) + thrd = omp_get_thread_num() + if (thrd == 0) then + x = 10 + !$omp critical + y = 1 + !$omp end critical + else + tmp = 0 + do while (tmp == 0) + !$omp critical + tmp = y + !$omp end critical + end do + print *, "x = ", x !! always "x = 10" + end if + !$omp end parallel +end program diff --git a/sources/Example_acquire_release.2.c b/sources/Example_acquire_release.2.c new file mode 100644 index 0000000..2dae839 --- /dev/null +++ b/sources/Example_acquire_release.2.c @@ -0,0 +1,32 @@ +/* +* @@name: acquire_release.2.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ + +#include +#include + +int main() +{ + int x = 0, y = 0; + #pragma omp parallel num_threads(2) + { + int thrd = omp_get_thread_num(); + if (thrd == 0) { + x = 10; + #pragma omp atomic write release // or seq_cst + y = 1; + } else { + int tmp = 0; + while (tmp == 0) { + #pragma omp atomic read acquire // or seq_cst + tmp = y; + } + printf("x = %d\n", x); // always "x = 10" + } + } + return 0; +} diff --git a/sources/Example_acquire_release.2.f90 b/sources/Example_acquire_release.2.f90 new file mode 100644 index 0000000..38a2f9d --- /dev/null +++ b/sources/Example_acquire_release.2.f90 @@ -0,0 +1,29 @@ +! @@name: acquire_release.2.f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! @@expect: success + +program rel_acq_ex2 + use omp_lib + integer :: x, y, thrd, tmp + x = 0 + y = 0 + !$omp parallel num_threads(2) private(thrd, tmp) + thrd = omp_get_thread_num() + if (thrd == 0) then + x = 10 + !$omp atomic write release ! or seq_cst + y = 1 + !$omp end atomic + else + tmp = 0 + do while (tmp == 0) + !$omp atomic read acquire ! or seq_cst + tmp = y + !$omp end atomic + end do + print *, "x = ", x !! always "x = 10" + end if + !$omp end parallel +end program diff --git a/sources/Example_acquire_release.3.c b/sources/Example_acquire_release.3.c new file mode 100644 index 0000000..b3f0793 --- /dev/null +++ b/sources/Example_acquire_release.3.c @@ -0,0 +1,34 @@ +/* +* @@name: acquire_release.3.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ + +#include +#include + +int main() +{ + int x = 0, y = 0; + #pragma omp parallel num_threads(2) + { + int thrd = omp_get_thread_num(); + if (thrd == 0) { + x = 10; + #pragma omp flush // or with acq_rel or release clause + #pragma omp atomic write // or with relaxed clause + y = 1; + } else { + int tmp = 0; + while (tmp == 0) { + #pragma omp atomic read // or with relaxed clause + tmp = y; + } + #pragma omp flush // or with acq_rel or acquire clause + printf("x = %d\n", x); // always "x = 10" + } + } + return 0; +} diff --git a/sources/Example_acquire_release.3.f90 b/sources/Example_acquire_release.3.f90 new file mode 100644 index 0000000..9caa292 --- /dev/null +++ b/sources/Example_acquire_release.3.f90 @@ -0,0 +1,31 @@ +! @@name: acquire_release.3.f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! @@expect: success + +program rel_acq_ex3 + use omp_lib + integer :: x, y, thrd, tmp + x = 0 + y = 0 + !$omp parallel num_threads(2) private(thrd, tmp) + thrd = omp_get_thread_num() + if (thrd == 0) then + x = 10 + !$omp flush ! 
or with acq_rel or release clause + !$omp atomic write + y = 1 + !$omp end atomic + else + tmp = 0 + do while (tmp == 0) + !$omp atomic read + tmp = y + !$omp end atomic + end do + !$omp flush ! or with acq_rel or acquire clause + print *, "x = ", x !! always "x = 10" + end if + !$omp end parallel +end program diff --git a/sources/Example_acquire_release_broke.4.c b/sources/Example_acquire_release_broke.4.c new file mode 100644 index 0000000..0ecc2b6 --- /dev/null +++ b/sources/Example_acquire_release_broke.4.c @@ -0,0 +1,41 @@ +/* +* @@name: acquire_release.4.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ + +#include <stdio.h> +#include <omp.h> + +int main() +{ + +// !!! THIS CODE WILL FAIL TO PRODUCE CONSISTENT RESULTS !!!!!!! +// !!! DO NOT PROGRAM SYNCHRONIZATION THIS WAY !!!!!!! + + int x = 0, y; + #pragma omp parallel num_threads(2) + { + int thrd = omp_get_thread_num(); + if (thrd == 0) { + #pragma omp critical + { x = 10; } + // an explicit flush directive that provides + // release semantics is needed here + // to complete the synchronization. + #pragma omp atomic write + y = 1; + } else { + int tmp = 0; + while (tmp == 0) { + #pragma omp atomic read acquire // or seq_cst + tmp = y; + } + #pragma omp critical + { printf("x = %d\n", x); } // !! NOT ALWAYS 10 + } + } + return 0; +} diff --git a/sources/Example_acquire_release_broke.4.f90 b/sources/Example_acquire_release_broke.4.f90 new file mode 100644 index 0000000..b1c677f --- /dev/null +++ b/sources/Example_acquire_release_broke.4.f90 @@ -0,0 +1,40 @@ +! @@name: acquire_release.4.f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! @@expect: success + +program rel_acq_ex4 + use omp_lib + integer :: x, y, thrd + integer :: tmp + x = 0 + +!! !!! THIS CODE WILL FAIL TO PRODUCE CONSISTENT RESULTS !!!!!!! +!! !!! DO NOT PROGRAM SYNCHRONIZATION THIS WAY !!!!!!! + + !$omp parallel num_threads(2) private(thrd) private(tmp) + thrd = omp_get_thread_num() + if (thrd == 0) then + !$omp critical + x = 10 + !$omp end critical + ! an explicit flush directive that provides + ! release semantics is needed here to + ! complete the synchronization. + !$omp atomic write + y = 1 + !$omp end atomic + else + tmp = 0 + do while(tmp == 0) + !$omp atomic read acquire ! or seq_cst + tmp = y + !$omp end atomic + end do + !$omp critical + print *, "x = ", x !! !!
NOT ALWAYS 10 + !$omp end critical + end if + !$omp end parallel +end program diff --git a/sources/Example_affinity.1.c b/sources/Example_affinity.1.c index 82222cf..5e85909 100644 --- a/sources/Example_affinity.1.c +++ b/sources/Example_affinity.1.c @@ -5,12 +5,17 @@ * @@linkable: yes * @@expect: success */ + void work(); -int main() + +int main() { + #pragma omp parallel proc_bind(spread) num_threads(4) { work(); } + return 0; + } diff --git a/sources/Example_affinity.6.c b/sources/Example_affinity.6.c index 4ea9c95..e599505 100644 --- a/sources/Example_affinity.6.c +++ b/sources/Example_affinity.6.c @@ -1,38 +1,27 @@ /* -* @@name: affinity.6c -* @@type: C -* @@compilable: yes -* @@linkable: no -* @@expect: success +* @@name: affinity.1.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: no +* @@expect: success */ -#include -#include +double * alloc_init_B(double *A, int N); +void compute_on_B(double *B, int N); -void socket_init(int socket_num) +void task_affinity(double *A, int N) { - int n_procs; - - n_procs = omp_get_place_num_procs(socket_num); - #pragma omp parallel num_threads(n_procs) proc_bind(close) + double * B; + #pragma omp task depend(out:B) shared(B) affinity(A[0:N]) { - printf("Reporting in from socket num, thread num: %d %d\n", - socket_num,omp_get_thread_num() ); + B = alloc_init_B(A,N); } + + #pragma omp task depend( in:B) shared(B) affinity(A[0:N]) + { + compute_on_B(B,N); + } + + #pragma omp taskwait } -int main() -{ - int n_sockets, socket_num; - - omp_set_nested(1); // or export OMP_NESTED=true - omp_set_max_active_levels(2); // or export OMP_MAX_ACTIVE_LEVELS=2 - - n_sockets = omp_get_num_places(); - #pragma omp parallel num_threads(n_sockets) private(socket_num) \ - proc_bind(spread) - { - socket_num = omp_get_place_num(); - socket_init(socket_num); - } -} diff --git a/sources/Example_affinity.6.f90 b/sources/Example_affinity.6.f90 index b4cab83..364543c 100644 --- a/sources/Example_affinity.6.f90 +++ b/sources/Example_affinity.6.f90 @@ -1,34 +1,24 @@ -! @@name: affinity.6f -! @@type: F-free -! @@compilable: yes -! @@linkable: no -! @@expect: success +! @@name: affinity.6f +! @@type: F-free +! @@compilable: yes +! @@linkable: no +! @@expect: success -subroutine socket_init(socket_num) - use omp_lib - integer :: socket_num, n_procs +subroutine task_affinity(A, N) - n_procs = omp_get_place_num_procs(socket_num) - !$omp parallel num_threads(n_procs) proc_bind(close) + external alloc_init_B + external compute_on_B + double precision, allocatable :: B(:) + + !$omp task depend(out:B) shared(B) affinity(A) + call alloc_init_B(B,A) + !$omp end task + + !$omp task depend(in:B) shared(B) affinity(A) + call compute_on_B(B) + !$omp end task + + !$omp taskwait - print*,"Reporting in from socket num, thread num: ", & - socket_num,omp_get_thread_num() - !$omp end parallel end subroutine -program numa_teams - use omp_lib - integer :: n_sockets, socket_num - - call omp_set_nested(.true.) ! or export OMP_NESTED=true - call omp_set_max_active_levels(2) ! 
or export OMP_MAX_ACTIVE_LEVELS=2 - - n_sockets = omp_get_num_places() - !$omp parallel num_threads(n_sockets) private(socket_num) & - !$omp& proc_bind(spread) - - socket_num = omp_get_place_num() - call socket_init(socket_num) - - !$omp end parallel -end program diff --git a/sources/Example_affinity_display.1.c b/sources/Example_affinity_display.1.c new file mode 100644 index 0000000..337c092 --- /dev/null +++ b/sources/Example_affinity_display.1.c @@ -0,0 +1,62 @@ +/* +* @@name: affinity_display.1.c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include +#include + +int main(void){ //MAX threads = 8, single socket system + + omp_display_affinity(NULL); //API call-- Displays Affinity of Master Thread + +// API CALL OUTPUT (default format): +//team_num= 0, nesting_level= 0, thread_num= 0, thread_affinity= 0,1,2,3,4,5,6,7 + + + // OMP_DISPLAY_AFFINITY=TRUE, OMP_NUM_THREADS=8 + #pragma omp parallel num_threads(omp_get_num_procs()) + { + if(omp_get_thread_num()==0) + printf("1st Parallel Region -- Affinity Reported \n"); + + // DISPLAY OUTPUT (default format) has been sorted: + // team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0 + // team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 1 + // ... + // team_num= 0, nesting_level= 1, thread_num= 7, thread_affinity= 7 + + // doing work here + } + + #pragma omp parallel num_threads( omp_get_num_procs() ) + { + if(omp_get_thread_num()==0) + printf("%s%s\n","Same Affinity as in Previous Parallel Region", + " -- no Affinity Reported\n"); + + // NO AFFINITY OUTPUT: + //(output in 1st parallel region only for OMP_DISPLAY_AFFINITY=TRUE) + + // doing more work here + } + + // Report Affinity for 1/2 number of threads + #pragma omp parallel num_threads( omp_get_num_procs()/2 ) + { + if(omp_get_thread_num()==0) + printf("Report Affinity for using 1/2 of max threads.\n"); + + // DISPLAY OUTPUT (default format) has been sorted: + // team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0,1 + // team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 2,3 + // team_num= 0, nesting_level= 1, thread_num= 2, thread_affinity= 4,5 + // team_num= 0, nesting_level= 1, thread_num= 3, thread_affinity= 6,7 + + // do work + } + + return 0; +} diff --git a/sources/Example_affinity_display.1.f90 b/sources/Example_affinity_display.1.f90 new file mode 100644 index 0000000..60519dd --- /dev/null +++ b/sources/Example_affinity_display.1.f90 @@ -0,0 +1,66 @@ +! @@name: affinity_display.1.f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! @@expect: success +program affinity_display ! MAX threads = 8, single socket system + + use omp_lib + implicit none + character(len=0) :: null + + call omp_display_affinity(null) !API call- Displays Affinity of Master Thread + +! API CALL OUTPUT (default format): +! team_num= 0, nesting_level= 0, thread_num= 0, thread_affinity= 0,1,2,3,4,5,6,7 + + + ! OMP_DISPLAY_AFFINITY=TRUE, OMP_NUM_THREADS=8 + + !$omp parallel num_threads(omp_get_num_procs()) + + if(omp_get_thread_num()==0) then + print*, "1st Parallel Region -- Affinity Reported" + endif + + ! DISPLAY OUTPUT (default format) has been sorted: + ! team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0 + ! team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 1 + ! ... + ! team_num= 0, nesting_level= 1, thread_num= 7, thread_affinity= 7 + + ! 
doing work here + + !$omp end parallel + + !$omp parallel num_threads( omp_get_num_procs() ) + + if(omp_get_thread_num()==0) then + print*, "Same Affinity in Parallel Region -- no Affinity Reported" + endif + + ! NO AFFINITY OUTPUT: + !(output in 1st parallel region only for OMP_DISPLAY_AFFINITY=TRUE) + + ! doing more work here + + !$omp end parallel + + ! Report Affinity for 1/2 number of threads + !$omp parallel num_threads( omp_get_num_procs()/2 ) + + if(omp_get_thread_num()==0) then + print*, "Different Affinity in Parallel Region -- Affinity Reported" + endif + + ! DISPLAY OUTPUT (default format) has been sorted: + ! team_num= 0, nesting_level= 1, thread_num= 0, thread_affinity= 0,1 + ! team_num= 0, nesting_level= 1, thread_num= 1, thread_affinity= 2,3 + ! team_num= 0, nesting_level= 1, thread_num= 2, thread_affinity= 4,5 + ! team_num= 0, nesting_level= 1, thread_num= 3, thread_affinity= 6,7 + + ! do work + + !$omp end parallel + +end program diff --git a/sources/Example_affinity_display.2.c b/sources/Example_affinity_display.2.c new file mode 100644 index 0000000..5ae2dbe --- /dev/null +++ b/sources/Example_affinity_display.2.c @@ -0,0 +1,74 @@ +/* +* @@name: affinity_display.2c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include +#include +#include + +void socket_work(int socket_num, int n_thrds); + +int main(void) +{ + int n_sockets, socket_num, n_thrds_on_socket; + + omp_set_nested(1); // or env var= OMP_NESTED=true + omp_set_max_active_levels(2); // or env var= OMP_MAX_ACTIVE_LEVELS=2 + + n_sockets = omp_get_num_places(); + n_thrds_on_socket = omp_get_place_num_procs(0); + + // OMP_NUM_THREADS=2,4 + // OMP_PLACES="{0,2,4,6},{1,3,5,7}" #2 sockets; even/odd proc-ids + // OMP_AFFINITY_FORMAT=\ + // "nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A" + + #pragma omp parallel num_threads(n_sockets) private(socket_num) + { + socket_num = omp_get_place_num(); + + if(socket_num==0) + printf(" LEVEL 1 AFFINITIES 1 thread/socket, %d sockets:\n\n", n_sockets); + + omp_display_affinity(NULL); // not needed if OMP_DISPLAY_AFFINITY=TRUE + + // OUTPUT: + // LEVEL 1 AFFINITIES 1 thread/socket, 2 sockets: + // nest_level= 1, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0,2,4,6 + // nest_level= 1, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 1,3,5,7 + + socket_work(socket_num, n_thrds_on_socket); + } + + return 0; +} + +void socket_work(int socket_num, int n_thrds) +{ + #pragma omp parallel num_threads(n_thrds) + { + if(omp_get_thread_num()==0) + printf(" LEVEL 2 AFFINITIES, %d threads on socket %d\n",n_thrds, socket_num); + + omp_display_affinity(NULL); // not needed if OMP_DISPLAY_AFFINITY=TRUE + + // OUTPUT: + // LEVEL 2 AFFINITIES, 4 threads on socket 0 + // nest_level= 2, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0 + // nest_level= 2, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 2 + // nest_level= 2, parent_thrd_num= 0, thrd_num= 2, thrd_affinity= 4 + // nest_level= 2, parent_thrd_num= 0, thrd_num= 3, thrd_affinity= 6 + + // LEVEL 2 AFFINITIES, 4 threads on socket 1 + // nest_level= 2, parent_thrd_num= 1, thrd_num= 0, thrd_affinity= 1 + // nest_level= 2, parent_thrd_num= 1, thrd_num= 1, thrd_affinity= 3 + // nest_level= 2, parent_thrd_num= 1, thrd_num= 2, thrd_affinity= 5 + // nest_level= 2, parent_thrd_num= 1, thrd_num= 3, thrd_affinity= 7 + + // ... 
Do Some work on Socket + + } +} diff --git a/sources/Example_affinity_display.2.f90 b/sources/Example_affinity_display.2.f90 new file mode 100644 index 0000000..c8e0634 --- /dev/null +++ b/sources/Example_affinity_display.2.f90 @@ -0,0 +1,76 @@ +! @@name: affinity_display.2.f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! @@expect: success +program affinity_display + + use omp_lib + implicit none + character(len=0) :: null + integer :: n_sockets, socket_num, n_thrds_on_socket; + + call omp_set_nested(.true.) ! or env var= OMP_NESTED=true + call omp_set_max_active_levels(2) ! or env var= OMP_MAX_ACTIVE_LEVELS=2 + + n_sockets = omp_get_num_places() + n_thrds_on_socket = omp_get_place_num_procs(0) + + ! OMP_NUM_THREADS=2,4 + ! OMP_PLACES="{0,2,4,6},{1,3,5,7}" #2 sockets; even/odd proc-ids + ! OMP_AFFINITY_FORMAT=\ + ! "nest_level= %L, parent_thrd_num= %a, thrd_num= %n, thrd_affinity= %A" + + !$omp parallel num_threads(n_sockets) private(socket_num) + + socket_num = omp_get_place_num() + + if(socket_num==0) then + write(*,'("LEVEL 1 AFFINITIES 1 thread/socket ",i0," sockets")')n_sockets + endif + + call omp_display_affinity(null) !not needed if OMP_DISPLAY_AFFINITY=TRUE + + ! OUTPUT: + ! LEVEL 1 AFFINITIES 1 thread/socket, 2 sockets: + ! nest_level= 1, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0,2,4,6 + ! nest_level= 1, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 1,3,5,7 + + call socket_work(socket_num, n_thrds_on_socket) + + !$omp end parallel + +end program + +subroutine socket_work(socket_num, n_thrds) + implicit none + integer :: socket_num, n_thrds + character(len=0) :: null + + !$omp parallel num_threads(n_thrds) + + if(omp_get_thread_num()==0) then + write(*,'("LEVEL 2 AFFINITIES, ",i0," threads on socket ",i0)') & + n_thrds,socket_num + endif + + call omp_display_affinity(null); !not needed if OMP_DISPLAY_AFFINITY=TRUE + + ! OUTPUT: + ! LEVEL 2 AFFINITIES, 4 threads on socket 0 + ! nest_level= 2, parent_thrd_num= 0, thrd_num= 0, thrd_affinity= 0 + ! nest_level= 2, parent_thrd_num= 0, thrd_num= 1, thrd_affinity= 2 + ! nest_level= 2, parent_thrd_num= 0, thrd_num= 2, thrd_affinity= 4 + ! nest_level= 2, parent_thrd_num= 0, thrd_num= 3, thrd_affinity= 6 + + ! LEVEL 2 AFFINITIES, 4 thrds on socket 1 + ! nest_level= 2, parent_thrd_num= 1, thrd_num= 0, thrd_affinity= 1 + ! nest_level= 2, parent_thrd_num= 1, thrd_num= 1, thrd_affinity= 3 + ! nest_level= 2, parent_thrd_num= 1, thrd_num= 2, thrd_affinity= 5 + ! nest_level= 2, parent_thrd_num= 1, thrd_num= 3, thrd_affinity= 7 + + ! ... 
Do Some work on Socket + + !$omp end parallel + +end subroutine diff --git a/sources/Example_affinity_display.3.c b/sources/Example_affinity_display.3.c new file mode 100644 index 0000000..f4d5765 --- /dev/null +++ b/sources/Example_affinity_display.3.c @@ -0,0 +1,88 @@ +/* +* @@name: affinity_display.3.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ +#include +#include // also null is in +#include +#include +#include + +#define FORMAT_STORE 80 +#define BUFFER_STORE 80 + +int main(void){ + + int i, n, thrd_num, max_req_store; + size_t nchars; + + char default_format[FORMAT_STORE]; + char my_format[] = "host=%20H thrd_num=%0.4n binds_to=%A"; + char **buffer; + + +// CODE SEGMENT 1 AFFINITY FORMAT + +// Get and Display Default Affinity Format + + nchars = omp_get_affinity_format(default_format,(size_t)FORMAT_STORE); + printf("Default Affinity Format is: %s\n",default_format); + + if(nchars >= FORMAT_STORE){ + printf("Caution: Reported Format is truncated. Increase\n"); + printf(" FORMAT_STORE to %d.\n", nchars+1); + } + +// Set Affinity Format + + omp_set_affinity_format(my_format); + printf("Affinity Format set to: %s\n",my_format); + + +// CODE SEGMENT 2 CAPTURE AFFINITY + +// Set up buffer for affinity of n threads + + n = omp_get_num_procs(); + buffer = (char **)malloc( sizeof(char *) * n ); + for(i=0;in) exit(1); //safety: don't exceed # of buffers + + thrd_num=omp_get_thread_num(); + nchars=omp_capture_affinity(buffer[thrd_num],(size_t)BUFFER_STORE,NULL); + if(nchars > max_req_store) max_req_store=nchars; + + // ... + } + + for(i=0;i=BUFFER_STORE){ + printf("Caution: Affinity string truncated. Increase\n"); + printf(" BUFFER_STORE to %d\n",max_req_store+1); + } + + for(i=0;i FORMAT_STORE) then + print*,"Caution: Reported Format is truncated. Increase" + print*," FORMAT_STORE to ", nchars + endif + +! Set Affinity Format + + call omp_set_affinity_format(my_format) + print*,"Affinity Format set to: ", my_format + + +! CODE SEGMENT 2 CAPTURE AFFINITY + +! Set up buffer for affinity of n threads + + n = omp_get_num_procs() + allocate( character(len=BUFFER_STORE)::buffer(0:n-1) ) + +! Capture Affinity using Affinity Format set above. +! Use max reduction to check size of buffer areas + max_req_store = 0 + !$omp parallel private(thrd_num,nchars) reduction(max:max_req_store) + + if(omp_get_num_threads()>n) stop "ERROR: increase buffer lines" + + thrd_num=omp_get_thread_num() + nchars=omp_capture_affinity(buffer(thrd_num),null) + if(nchars>max_req_store) max_req_store=nchars + ! ... + + !$omp end parallel + + do i = 0, n-1 + print*, "thrd_num= ",i," affinity:", trim(buffer(i)) + end do + ! For 4 threads with OMP_PLACES='{0,1},{2,3},{4,5},{6,7}' + ! Format: host=%20H thrd_num=%0.4n binds_to=%A + + ! affinity: host=hpc.cn567 thrd_num=0000 binds_to=0,1 + ! affinity: host=hpc.cn567 thrd_num=0001 binds_to=2,3 + ! affinity: host=hpc.cn567 thrd_num=0002 binds_to=4,5 + ! affinity: host=hpc.cn567 thrd_num=0003 binds_to=6,7 + + if(max_req_store > BUFFER_STORE) then + print*, "Caution: Affinity string truncated. 
Increase" + print*, " BUFFER_STORE to ",max_req_store + endif + + deallocate(buffer) +end program diff --git a/sources/Example_affinity_query.1.c b/sources/Example_affinity_query.1.c new file mode 100644 index 0000000..c89af2b --- /dev/null +++ b/sources/Example_affinity_query.1.c @@ -0,0 +1,39 @@ +/* +* @@name: affinity_query.1c +* @@type: C +* @@compilable: yes +* @@linkable: no +* @@expect: success +*/ +#include +#include + +void socket_init(int socket_num) +{ + int n_procs; + + n_procs = omp_get_place_num_procs(socket_num); + #pragma omp parallel num_threads(n_procs) proc_bind(close) + { + printf("Reporting in from socket num, thread num: %d %d\n", + socket_num,omp_get_thread_num() ); + } +} + +int main() +{ + int n_sockets, socket_num; + + omp_set_nested(1); // or export OMP_NESTED=true + omp_set_max_active_levels(2); // or export OMP_MAX_ACTIVE_LEVELS=2 + + n_sockets = omp_get_num_places(); + #pragma omp parallel num_threads(n_sockets) private(socket_num) \ + proc_bind(spread) + { + socket_num = omp_get_place_num(); + socket_init(socket_num); + } + + return 0; +} diff --git a/sources/Example_affinity_query.1.f90 b/sources/Example_affinity_query.1.f90 new file mode 100644 index 0000000..cdd08ef --- /dev/null +++ b/sources/Example_affinity_query.1.f90 @@ -0,0 +1,33 @@ +! @@name: affinity_query.1f +! @@type: F-free +! @@compilable: yes +! @@linkable: no +! @@expect: success +subroutine socket_init(socket_num) + use omp_lib + integer :: socket_num, n_procs + + n_procs = omp_get_place_num_procs(socket_num) + !$omp parallel num_threads(n_procs) proc_bind(close) + + print*,"Reporting in from socket num, thread num: ", & + socket_num,omp_get_thread_num() + !$omp end parallel +end subroutine + +program numa_teams + use omp_lib + integer :: n_sockets, socket_num + + call omp_set_nested(.true.) ! or export OMP_NESTED=true + call omp_set_max_active_levels(2) ! or export OMP_MAX_ACTIVE_LEVELS=2 + + n_sockets = omp_get_num_places() + !$omp parallel num_threads(n_sockets) private(socket_num) & + !$omp& proc_bind(spread) + + socket_num = omp_get_place_num() + call socket_init(socket_num) + + !$omp end parallel +end program diff --git a/sources/Example_allocators.1.c b/sources/Example_allocators.1.c new file mode 100644 index 0000000..86e3128 --- /dev/null +++ b/sources/Example_allocators.1.c @@ -0,0 +1,47 @@ +/* +* @@name: allocators.1c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ +#include +#include +#include +#include +#define N 1000 + +int main() +{ + float *x, *y; + float s=2.0; + +omp_memspace_handle_t xy_memspace = omp_default_mem_space; +omp_alloctrait_t xy_traits[1]={omp_atk_alignment, 64}; +omp_allocator_handle_t xy_alloc = omp_init_allocator(xy_memspace,1,xy_traits); + + + x=(float *)omp_alloc(N*sizeof(float), xy_alloc); + y=(float *)omp_alloc(N*sizeof(float), xy_alloc); + + if( ((intptr_t)(y))%64 != 0 || ((intptr_t)(x))%64 != 0 ) + { printf("ERROR: x|y not 64-Byte aligned\n"); exit(1); } + + #pragma omp parallel + { + #pragma omp for simd simdlen(16) aligned(x,y:64) + for(int i=0; i void test() { diff --git a/sources/Example_collapse.2.f b/sources/Example_collapse.2.f index 6c7d12d..189f565 100644 --- a/sources/Example_collapse.2.f +++ b/sources/Example_collapse.2.f @@ -3,6 +3,7 @@ ! @@compilable: yes ! @@linkable: yes ! 
@@expect: success + program test !$omp parallel !$omp do private(j,k) collapse(2) lastprivate(jlast, klast) diff --git a/sources/Example_declare_target.6.c b/sources/Example_declare_target.6.c index df832ef..99b3332 100644 --- a/sources/Example_declare_target.6.c +++ b/sources/Example_declare_target.6.c @@ -7,10 +7,10 @@ */ #define N 100000000 -#pragma omp declare target link(sp,sv1,sv2) \ - link(dp,dv1,dv2) float sp[N], sv1[N], sv2[N]; double dp[N], dv1[N], dv2[N]; +#pragma omp declare target link(sp,sv1,sv2) \ + link(dp,dv1,dv2) void s_init(float *, float *, int); void d_init(double *, double *, int); diff --git a/sources/Example_declare_variant.1.c b/sources/Example_declare_variant.1.c new file mode 100644 index 0000000..40d2f08 --- /dev/null +++ b/sources/Example_declare_variant.1.c @@ -0,0 +1,57 @@ +/* +* @@name: declare_variant.1c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ + +#define N 100 +#include +#include + +void p_vxv(int *v1,int *v2,int *v3,int n); +void t_vxv(int *v1,int *v2,int *v3,int n); + +#pragma omp declare variant( p_vxv ) match( construct={parallel} ) +#pragma omp declare variant( t_vxv ) match( construct={target} ) +void vxv(int *v1,int *v2,int *v3,int n) // base function +{ + for (int i= 0; i< n; i++) v3[i] = v1[i] * v2[i]; +} + +void p_vxv(int *v1,int *v2,int *v3,int n) // function variant +{ + #pragma omp for + for (int i= 0; i< n; i++) v3[i] = v1[i] * v2[i]*3; +} + +#pragma omp declare target +void t_vxv(int *v1,int *v2,int *v3,int n) // function variant +{ + #pragma omp distribute simd + for (int i= 0; i< n; i++) v3[i] = v1[i] * v2[i]*2; +} + +int main() +{ + int v1[N], v2[N], v3[N]; + for(int i=0; i + +void base_saxpy(int, float, float *, float *); +void avx512_saxpy(int, float, float *, float *); + +#pragma omp declare variant( avx512_saxpy ) \ + match( device={isa("core-avx512")} ) +void base_saxpy(int n, float s, float *x, float *y) // base function +{ + #pragma omp parallel for + for(int i=0; i +#include +#include +#define N 1000 + +int main() +{ + static float x[N],y[N] __attribute__ ((aligned(64))); + float s=2.0; + // Check for 64-byte aligned + if( ((intptr_t)y)%64 != 0 || ((intptr_t)x)%64 != 0 ) + { printf("ERROR: x|y not 64-Byte aligned\n"); exit(1); } + + for(int i=0; i +#include + +#define N 100 +#define TRUE 1 +#define FALSE 0 + +void driver(int update, float a[], float b[], int n, omp_depend_t *obj); + +void update_copy(int update, float a[], float b[], int n); +void checkpoint(float a[],int n); +void init(float a[], int n); + + +int main(){ + + float a[N],b[N]; + omp_depend_t obj; + + init(a, N); + + #pragma omp depobj(obj) depend(inout: a) + + driver(TRUE, a,b,N, &obj); // updating a occurs + + #pragma omp depobj(obj) update(in) + + driver(FALSE, a,b,N, &obj); // no updating of a + + #pragma omp depobj(obj) destroy // obj is set to uninitilized state, + // resources are freed + return 0; + +} + +void driver(int update, float a[], float b[], int n, omp_depend_t *obj) +{ + #pragma omp parallel num_threads(2) + #pragma omp single + { + + #pragma omp task depend(depobj: *obj) // Task 1, uses depend object + update_copy(update, a,b,n); // update a or not, always copy a to b + + #pragma omp task depend(in: a[:n]) // Task 2, only read a + checkpoint(a,n); + + } +} + +void update_copy(int update, float a[], float b[], int n) +{ + if(update) for(int i=0;i void work(int i); -void incorrect() -{ +void incorrect() { int np, i; np = omp_get_num_threads(); /* misplaced */ diff --git a/sources/Example_host_teams.1.c 
b/sources/Example_host_teams.1.c new file mode 100644 index 0000000..d26df9d --- /dev/null +++ b/sources/Example_host_teams.1.c @@ -0,0 +1,61 @@ +/* +* @@name: host_teams.2.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ +#include +#include +#include +#include +#define N 1000 + +int main(){ + int nteams_required=2, max_thrds, tm_id; + float sp_x[N], sp_y[N], sp_a=0.0001e0; + double dp_x[N], dp_y[N], dp_a=0.0001e0; + + // Create 2 teams, each team works in a different precision + #pragma omp teams num_teams(nteams_required) \ + thread_limit(max_thrds) private(tm_id) + { + tm_id = omp_get_team_num(); + + if( omp_get_num_teams() != 2 ) //if only getting 1, quit + { printf("error: Insufficient teams on host, 2 required\n"); + exit(0); + } + + if(tm_id == 0) // Do Single Precision Work (SAXPY) with this team + { + #pragma omp parallel + { + #pragma omp for //init + for(int i=0; i -omp_lock_t *new_locks() -{ +omp_lock_t *new_locks() { int i; omp_lock_t *lock = new omp_lock_t[1000]; #pragma omp parallel for private(i) for (i=0; i<1000; i++) - { - omp_init_lock(&lock[i]); - } + { omp_init_lock(&lock[i]); } + return lock; } diff --git a/sources/Example_init_lock.1.f b/sources/Example_init_lock.1.f index 7142e65..62e5957 100644 --- a/sources/Example_init_lock.1.f +++ b/sources/Example_init_lock.1.f @@ -6,7 +6,6 @@ FUNCTION NEW_LOCKS() USE OMP_LIB ! or INCLUDE "omp_lib.h" INTEGER(OMP_LOCK_KIND), DIMENSION(1000) :: NEW_LOCKS - INTEGER I !$OMP PARALLEL DO PRIVATE(I) diff --git a/sources/Example_init_lock_with_hint.1.cpp b/sources/Example_init_lock_with_hint.1.cpp index 27ae7e4..d50b62d 100644 --- a/sources/Example_init_lock_with_hint.1.cpp +++ b/sources/Example_init_lock_with_hint.1.cpp @@ -16,7 +16,8 @@ omp_lock_t *new_locks() for (i=0; i<1000; i++) { omp_init_lock_with_hint(&lock[i], - omp_lock_hint_contended | omp_lock_hint_speculative); + static_cast(omp_lock_hint_contended | + omp_lock_hint_speculative)); } return lock; } diff --git a/sources/Example_loop.1.c b/sources/Example_loop.1.c new file mode 100644 index 0000000..4b6dc85 --- /dev/null +++ b/sources/Example_loop.1.c @@ -0,0 +1,22 @@ +/* +* @@name: loop.2c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ +#include +#define N 100 +int main() +{ + float x[N], y[N]; + float a = 2.0; + for(int i=0;i +#include + +void work_on_chunk(int idev, int i); + +int main() //Driver +{ + int i,idev; + + for (idev=0; idev +#include +#define N 1000 + +#pragma omp declare target +void exp_pi_diff(double *d, double my_pi){ + #pragma omp metadirective \ + when( construct={target}: distribute parallel for ) \ + default( parallel for simd) + for(int i = 0; i +using namespace std; + +#pragma omp requires unified_shared_memory + +typedef struct mypoints +{ + double res; + double data[500]; +} mypoints_t; + +void do_something_with_p(mypoints_t *p, int q); + +int main() +{ + mypoints_t p; + int q=0; + + #pragma omp target // no map clauses needed + { // q is firstprivate + q++; + do_something_with_p(&p,q); + } + cout<< p.res << " " << q << endl; // output 1 0 + return 0; +} +void do_something_with_p(mypoints_t *p, int q) +{ + p->res = q; + for(int i=0;idata)/sizeof(double);i++) + p->data[i]=q*i; +} diff --git a/sources/Example_requires.1.f90 b/sources/Example_requires.1.f90 new file mode 100644 index 0000000..f8044f0 --- /dev/null +++ b/sources/Example_requires.1.f90 @@ -0,0 +1,39 @@ +! @@name: requires.1f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! 
@@expect: success + +module data +!$omp requires unified_shared_memory + type,public :: mypoints + double precision :: res + double precision :: data(500) + end type +end module + +program main + use data + type(mypoints) :: p + integer :: q=0 + + !$omp target !! no map clauses needed + q = q + 1 !! q is firstprivate + call do_something_with_p(p,q) + !$omp end target + + write(*,'(f5.0,i5)') p%res, q !! output 1. 0 + +end program + +subroutine do_something_with_p(p,q) + use data + type(mypoints) :: p + integer :: q + + p%res = q; + do i=1,size(p%data) + p%data(i)=q*i + enddo + +end subroutine diff --git a/sources/Example_standalone.1.f90 b/sources/Example_standalone.1.f90 index de8c3bb..ed2d55b 100644 --- a/sources/Example_standalone.1.f90 +++ b/sources/Example_standalone.1.f90 @@ -3,9 +3,14 @@ ! @@compilable: no ! @@linkable: no ! @@expect: failure + + SUBROUTINE STANDALONE_WRONG() + INTEGER A + A = 1 + ! the FLUSH directive must not be the action statement ! in an IF statement IF (A .NE. 0) !$OMP FLUSH(A) diff --git a/sources/Example_target_data.3.c b/sources/Example_target_data.3.c index 06fc4a0..05cb61d 100644 --- a/sources/Example_target_data.3.c +++ b/sources/Example_target_data.3.c @@ -5,8 +5,10 @@ * @@linkable: no * @@expect: success */ + #include #define COLS 100 + void gramSchmidt(float Q[][COLS], const int rows) { int cols = COLS; @@ -14,6 +16,7 @@ void gramSchmidt(float Q[][COLS], const int rows) for(int k=0; k < cols; k++) { double tmp = 0.0; + #pragma omp target map(tofrom: tmp) #pragma omp parallel for reduction(+:tmp) for(int i=0; i < rows; i++) diff --git a/sources/Example_target_data.4.c b/sources/Example_target_data.4.c index 11574ce..8f66094 100644 --- a/sources/Example_target_data.4.c +++ b/sources/Example_target_data.4.c @@ -5,23 +5,36 @@ * @@linkable: no * @@expect: success */ + + void vec_mult(float*, float*, float*, int); + extern void init(float*, float*, int); extern void output(float*, int); + + void foo(float *p0, float *v1, float *v2, int N) { init(v1, v2, N); + #pragma omp target data map(to: v1[0:N], v2[:N]) map(from: p0[0:N]) { vec_mult(p0, v1, v2, N); } + output(p0, N); } + + + void vec_mult(float *p1, float *v3, float *v4, int N) { int i; #pragma omp target map(to: v3[0:N], v4[:N]) map(from: p1[0:N]) #pragma omp parallel for for (i=0; i +#include +#define N 100 + +typedef struct myvec{ + size_t len; + double *data; +} myvec_t; + +#pragma omp declare mapper(myvec_t v) \ + map(v, v.data[0:v.len]) +void init(myvec_t *s); + +int main(){ + + myvec_t s; + + s.data = (double *)calloc(N,sizeof(double)); + s.len = N; + + #pragma omp target + init(&s); + + printf("s.data[%d]=%lf\n",N-1,s.data[N-1]); //s.data[99]=99.000000 +} + +void init(myvec_t *s) +{ for(int i=0; ilen; i++) s->data[i]=i; } + diff --git a/sources/Example_target_mapper.1.f90 b/sources/Example_target_mapper.1.f90 new file mode 100644 index 0000000..17af08f --- /dev/null +++ b/sources/Example_target_mapper.1.f90 @@ -0,0 +1,38 @@ +! @@name: target_mapper.1.f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: no +! @@expect: success +module my_structures + type myvec_t + integer :: len + double precision, pointer :: data(:) + end type +end module + +program main + use my_structures + integer, parameter :: N=100 + + !$omp declare mapper(myvec_t :: v) & + !$omp& map(v, v%data(1:v%len)) + + type(myvec_t) :: s + + allocate(s%data(N)) + s%data(1:N) = 0.0d0 + s%len = N + + !$omp target + call init(s) + !$omp end target + + print*,"s%data(",N,")=",s%data(N) !! 
s%data( 100 )=100.000000000000 +end program + +subroutine init(s) + use my_structures + type(myvec_t) :: s + + s%data = [ (i, i=1,s%len) ] +end subroutine diff --git a/sources/Example_target_mapper.2.c b/sources/Example_target_mapper.2.c new file mode 100644 index 0000000..f1c86d1 --- /dev/null +++ b/sources/Example_target_mapper.2.c @@ -0,0 +1,54 @@ +/* +* @@name: target_mapper_map.2.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: no +* @@expect: success +*/ +#include +// N MUST BE EVEN +#define N 100 + + typedef struct dzmat + { + double r_m[N][N]; + double i_m[N][N]; + } dzmat_t; + + #pragma omp declare mapper( top_id: dzmat_t v) \ + map(v.r_m[0:N/2][0:N], \ + v.i_m[0:N/2][0:N] ) + + #pragma omp declare mapper(bottom_id: dzmat_t v) \ + map(v.r_m[N/2:N/2][0:N], \ + v.i_m[N/2:N/2][0:N] ) + +void dzmat_init(dzmat_t *z, int is, int ie, int n); //initialization +void host_add( dzmat_t *a, dzmat_t *b, dzmat_t *c, int n); //matrix add: c=a+b + + +int main() +{ + dzmat_t a,b,c; + int is,ie; + + is=0; ie=N/2-1; //top N/2 rows on device 0 + #pragma omp target map(mapper(top_id), tofrom: a,b) device(0) \ + firstprivate(is,ie) nowait + { + dzmat_init(&a,is,ie,N); + dzmat_init(&b,is,ie,N); + } + + is=N/2; ie=N-1; //bottom N/2 rows on device 1 + #pragma omp target map(mapper(bottom_id), tofrom: a,b) device(1) \ + firstprivate(is,ie) nowait + { + dzmat_init(&a,is,ie,N); + dzmat_init(&b,is,ie,N); + } + + #pragma omp taskwait + + host_add(&a,&b,&c,N); +} diff --git a/sources/Example_target_mapper.2.f90 b/sources/Example_target_mapper.2.f90 new file mode 100644 index 0000000..663d76d --- /dev/null +++ b/sources/Example_target_mapper.2.f90 @@ -0,0 +1,48 @@ +! @@name: target_mapper.2.f90 +! @@type: F-free +! @@compilable: yes +! @@linkable: no +! @@expect: success +module complex_mats + + integer, parameter :: N=100 !N must be even + type dzmat_t + double precision :: r_m(N,N), i_m(N,N) + end type + + !$omp declare mapper( left_id: dzmat_t :: v) map( v%r_m(N, 1:N/2), & + !$omp& v%i_m(N, 1:N/2)) + + !$omp declare mapper(right_id: dzmat_t :: v) map( v%r_m(N,N/2+1:N), & + !$omp& v%i_m(N,N/2+1:N)) + +end module + + +program main + use complex_mats + type(dzmat_t) :: a,b,c + external dzmat_init, host_add !initialization and matrix add: a=b+c + + integer :: is,ie + + + is=1; ie=N/2 !left N/2 columns on device 0 + !$omp target map(mapper( left_id), tofrom: a,b) device(0) & + !$omp& firstprivate(is,ie) nowait + call dzmat_init(a,is,ie) + call dzmat_init(b,is,ie) + !$omp end target + + is=N/2+1; ie=N !right N/2 columns on device 1 + !$omp target map(mapper(right_id), tofrom: a,b) device(1) & + !$omp& firstprivate(is,ie) nowait + call dzmat_init(a,is,ie) + call dzmat_init(b,is,ie) + !$omp end target + + !omp taskwait + + call host_add(a,b,c) + +end program main diff --git a/sources/Example_target_mapper.3.c b/sources/Example_target_mapper.3.c new file mode 100644 index 0000000..dc680ed --- /dev/null +++ b/sources/Example_target_mapper.3.c @@ -0,0 +1,43 @@ +/* +* @@name: target_mapper_map.3.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: no +* @@expect: success +*/ + +#include +#include + +#define N 100 + +typedef struct myvec { + size_t len; + double *data; +} myvec_t; + +#pragma omp declare mapper(myvec_t v) \ + map(v, v.data[0:v.len]) + +typedef struct mypoints { + struct myvec scratch; + struct myvec *x; + double hostonly_data[500000]; +} mypoints_t; + +#pragma omp declare mapper(mypoints_t v) \ + map(v.x, v.x[0] ) map(alloc:v.scratch) + +void init_mypts_array(mypoints_t *P, int n); +void 
eval_mypts_array(mypoints_t *P, int n); + +int main(){ + + mypoints_t P; + + init_mypts_array(&P, N); + + #pragma omp target map(P) + eval_mypts_array(&P, N); + +} diff --git a/sources/Example_target_mapper.3.f90 b/sources/Example_target_mapper.3.f90 new file mode 100644 index 0000000..1e3d326 --- /dev/null +++ b/sources/Example_target_mapper.3.f90 @@ -0,0 +1,38 @@ +! @@name: target_mapper.3.f90 +! @@type: F-free +! @@compilable: yes +! @@linkable: no +! @@expect: success + +module my_structures + type myvec_t + integer :: len + double precision, pointer :: data(:) + end type + !$omp declare mapper(myvec_t :: v) & + !$omp& map(v) + + type mypoints_t + type(myvec_t) :: scratch + type(myvec_t), pointer :: x(:) + double precision :: hostonly_data(500000) + end type + !$omp declare mapper(mypoints_t :: v) & + !$omp& map(v%x, v%x(1)) map(alloc:v%scratch) + +end module + + +program main + use my_structures + external init_mypts_array, eval_mypts_array + + type(mypoints_t) :: P + + call init_mypts_array(P) + + !$omp target map(P) + call eval_mypts_array(P) + +end program + diff --git a/sources/Example_target_offload_control.1.c b/sources/Example_target_offload_control.1.c new file mode 100644 index 0000000..201127c --- /dev/null +++ b/sources/Example_target_offload_control.1.c @@ -0,0 +1,78 @@ +/* +* @@name: target_offload_control.1c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ + +#include +#include +#include +#include +#include + +typedef enum offload_policy +{MANDATORY, DISABLED, DEFAULT, UNKNOWN, NOTSET} offload_policy_t; + + +offload_policy_t get_offload_policy() +{ + char *env, *end; + size_t n; + + env = getenv("OMP_TARGET_OFFLOAD"); + if(env == NULL) return NOTSET; + + end = env + strlen(env); //Find trimmed beginning/end + while ( *env && isspace(*(env )) ) env++; + while (end != env && isspace(*(end-1)) ) end--; + n = (int)(end - env); + + //Find ONLY string -nothing more, case insensitive + if (n == 9 && !strncasecmp(env, "MANDATORY",n)) return MANDATORY; + else if (n == 8 && !strncasecmp(env, "DISABLED" ,n)) return DISABLED ; + else if (n == 7 && !strncasecmp(env, "DEFAULT" ,n)) return DEFAULT ; + else return UNKNOWN ; +} + + +int main() +{ + int i; + int device_num, on_init_dev; + + // get policy from OMP_TARGET_OFFLOAD variable + offload_policy_t policy = get_offload_policy(); + + if(_OPENMP< 201811) + { + printf("Warning: OMP_TARGET_OFFLOAD NOT supported by VER. %d\n",_OPENMP ); + printf(" If OMP_TARGET_OFFLOAD is set, it will be ignored.\n"); + } + + // Set target device number to an unavailable + // device to test offload policy. + device_num = omp_get_num_devices() + 1; + + // Policy: + printf("OMP_TARGET_OFFLOAD Policy: "); + if (policy==MANDATORY) printf("MANDATORY-Terminate if dev. not avail\n"); + else if(policy==DISABLED ) printf("DISABLED -(if supported) Only on Host\n"); + else if(policy==DEFAULT ) printf("DEFAULT -On host if device not avail\n"); + else if(policy==UNKNOWN ) printf("OMP_TARGET_OFFLOAD has unknown value\n" ); + else if(policy==NOTSET ) printf("OMP_TARGET_OFFLOAD not set\n" ); + + + on_init_dev = 1; + // device# out of range--not supported + #pragma omp target device(device_num) map(tofrom: on_init_dev) + on_init_dev=omp_is_initial_device(); + + if (policy == MANDATORY && _OPENMP >= 201811) + printf("ERROR: OpenMP 5.0 implementation ignored MANDATORY policy.\n"); + + printf("Target region executed on init dev %s\n", on_init_dev ? 
"TRUE":"FALSE"); + + return 0; +} diff --git a/sources/Example_target_offload_control.1.f90 b/sources/Example_target_offload_control.1.f90 new file mode 100644 index 0000000..6561606 --- /dev/null +++ b/sources/Example_target_offload_control.1.f90 @@ -0,0 +1,81 @@ +! @@name: target_offload_control.1f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! @@expect: success + + +module offload_policy + implicit none + integer, parameter :: LEN_POLICY=10 +contains + character(LEN_POLICY) function get_offload_policy() + character(64) :: env + integer :: length, i + env=repeat(' ',len(env)) + !policy is blank if not found * + call get_environment_variable("OMP_TARGET_OFFLOAD",env,length) + + do i = 1,len(env) !Makes a-z upper case + if(iachar(env(i:i))>96) env(i:i)=achar(iachar(env(i:i))-32) + end do + + get_offload_policy = trim(adjustl(env)) !remove peripheral spaces + + if(length==0) get_offload_policy="NOTSET" + + return + + end function + +end module + +program policy_test + + use omp_lib + use offload_policy + + integer :: i, device_num + logical :: on_init_dev + character(LEN_POLICY) :: policy + + policy = get_offload_policy() !!Get OMP_TARGET_OFFLOAD value + + if (OPENMP_VERSION < 201811) then + print*,"Warning: OMP_TARGET_OFFLOAD NOT supported by VER.",OPENMP_VERSION + print*," If OMP_TARGET_OFFLOAD is set, it will be ignored." + endif + + !Set target device number to an unavailable device to test offload policy. + device_num = omp_get_num_devices() + 1 + + !!Report OMP_TARGET_OFFOAD value + select CASE (policy) + case("MANDATORY") + print*,"Policy: MANDATORY-Terminate if dev. not avail." + case("DISABLED") + print*,"Policy: DISABLED-(if supported) Only on Host." + case("DEFAULT") + print*,"Policy: DEFAULT On host if device not avail." + case("NOTSET") + print*," OMP_TARGET_OFFLOAD is not set." + case DEFAULT + print*," OMP_TARGET_OFFLOAD has unknown value." + print*," UPPER CASE VALUE=",policy + end select + + + on_init_dev = .FALSE. + !! device# out of range--not supported + !$omp target device(device_num) map(tofrom: on_init_dev) + on_init_dev=omp_is_initial_device() + !$omp end target + + if (policy=="MANDATORY" .and. OPENMP_VERSION>=201811) then + print*,"OMP ERROR: OpenMP 5.0 implementation ignored MANDATORY policy." + print*," Termination should have occurred at target directive." 
+ endif + + print*, "Target executed on init dev (T|F): ", on_init_dev + +end program policy_test diff --git a/sources/Example_target_ptr_map.1.c b/sources/Example_target_ptr_map.1.c new file mode 100644 index 0000000..404cfee --- /dev/null +++ b/sources/Example_target_ptr_map.1.c @@ -0,0 +1,49 @@ +/* +* @@name: target_pointer_map.1 +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include +#include +#define N 100 + +int main() +{ + int *ptr1; + int *ptr2; + int *ptr3; + int aray[N]; + + ptr1 = (int *)malloc(sizeof(int)*N); + ptr2 = (int *)malloc(sizeof(int)*N); + + #pragma omp target map(ptr1, ptr1[:N]) map(ptr2[:N] ) + { + for (int i=0; i +#include +#define N 100 + +#pragma omp declare target +int *p; +extern void use_arg_p(int *p, int n); +extern void use_global_p( int n); +#pragma omp end declare target + +int main() +{ + int i; + p = (int *)malloc(sizeof(int)*N); + + #pragma omp target map(p[:N]) // device p attached to array section + { + for (i=0; i +#include + +#define N 100 + +#pragma omp requires reverse_offload + +void error_handler(int wrong_value, int index) +{ + printf(" Error in offload: A[%d]=%d\n", index,wrong_value); + printf(" Expecting: A[i ]=i\n"); + exit(1); +// output: Error in offload: A[99]=-1 +// Expecting: A[i ]=i + +} +#pragma omp declare target device_type(host) to(error_handler) + +int main() +{ + int A[N]; + + for (int i=0; i +#include +#define N 100 +#define BAZILLION 2000000 + +struct foo { + char buffera[BAZILLION]; + char bufferb[BAZILLION]; + float x; + float a, b; + float *p; +}; + +#pragma omp declare target +void saxpyfun(struct foo *S) +{ + int i; + for(i=0; ip[i] = S->p[i]*S->a + S->b; +} +#pragma omp end declare target + +int main() +{ + struct foo S; + int i; + + S.a = 2.0; + S.b = 4.0; + S.p = (float *)malloc(sizeof(float)*N); + for(i=0; i +#include +#define N 100 + +#pragma omp declare target +int a; +#pragma omp end declare target + +int main(){ + int i; + int *p; + + #pragma omp target data map(p) + { + p = (int *)malloc(sizeof(int)*N); + for (i=0; i +#include +#define N 100 + +class SAXPY { + private: + float a, b, *p; + public: + float buffer[N]; + + SAXPY(float arg_a, float arg_b){ a=arg_a; b=arg_b; } + void driver(); + void saxpyfun(float *p); +}; + +#pragma omp declare target +void SAXPY::saxpyfun(float *q) +{ + for(int i=0; i -int main() -{ +int main() { int x = 1; #pragma omp parallel #pragma omp single diff --git a/sources/Example_task_dep.10.c b/sources/Example_task_dep.10.c new file mode 100644 index 0000000..32b9ce0 --- /dev/null +++ b/sources/Example_task_dep.10.c @@ -0,0 +1,24 @@ +/* +* @@name: task_dep.7c +* @@type: C +* @@compilable: no +* @@linkable: no +* @@expect: failure +*/ +void foo (void) +{ + int a, b, c; + c = 0; + #pragma omp parallel + #pragma omp single + { + #pragma omp task depend(out: a) + a = longTaskA(); + #pragma omp task depend(out: b) + b = shortTaskB(); + #pragma omp task depend(in: a) depend(mutexinoutset: c) + c = shortTaskAC(a,c); + #pragma omp task depend(in: b) depend(mutexinoutset: c) + c = longTaskBC(b,c); + } +} diff --git a/sources/Example_task_dep.10.f90 b/sources/Example_task_dep.10.f90 new file mode 100644 index 0000000..bf9bec6 --- /dev/null +++ b/sources/Example_task_dep.10.f90 @@ -0,0 +1,25 @@ +! @@name: task_dep.7f +! @@type: F-free +! @@compilable: no +! @@linkable: no +! 
@@expect: failure
+subroutine foo
+  integer :: a,b,c
+  c = 0
+  !$omp parallel
+  !$omp single
+  !$omp task depend(out: a)
+     a = longTaskA()
+  !$omp end task
+  !$omp task depend(out: b)
+     b = shortTaskB()
+  !$omp end task
+  !$omp task depend(in: a) depend(mutexinoutset: c)
+     c = shortTaskAC(a,c)
+  !$omp end task
+  !$omp task depend(in: b) depend(mutexinoutset: c)
+     c = longTaskBC(b,c)
+  !$omp end task
+  !$omp end single
+  !$omp end parallel
+end subroutine foo
diff --git a/sources/Example_task_dep.11.c b/sources/Example_task_dep.11.c
new file mode 100644
index 0000000..06f3c71
--- /dev/null
+++ b/sources/Example_task_dep.11.c
@@ -0,0 +1,39 @@
+/*
+* @@name: task_dep.11c
+* @@type: C
+* @@compilable: yes, omp_5.0
+* @@linkable: no
+* @@expect: success
+*/
+
+#include <stdio.h>
+
+void set_an_element(int *p, int val) {
+  *p = val;
+}
+
+void print_all_elements(int *v, int n) {
+  int i;
+  for (i = 0; i < n; ++i) {
+    printf("%d, ", v[i]);
+  }
+  printf("\n");
+}
+
+void parallel_computation(int n) {
+  int v[n];
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int i;
+    for (i = 0; i < n; ++i)
+      #pragma omp task depend(out: v[i])
+      set_an_element(&v[i], i);
+
+    #pragma omp task depend(iterator(it = 0:n), in: v[it])
+    // #pragma omp task depend(in: v[0:n]) Violates Array section restriction.
+      print_all_elements(v, n);
+
+  }
+}
+
diff --git a/sources/Example_task_dep.11.f90 b/sources/Example_task_dep.11.f90
new file mode 100644
index 0000000..6f733d9
--- /dev/null
+++ b/sources/Example_task_dep.11.f90
@@ -0,0 +1,43 @@
+! @@name: task_dep.11f90
+! @@type: F-free
+! @@compilable: yes, omp_5.0
+! @@linkable: no
+! @@expect: success
+
+subroutine set_an_element(e, val)
+  implicit none
+  integer :: e, val
+
+  e = val
+
+end subroutine
+
+subroutine print_all_elements(v, n)
+  implicit none
+  integer :: n, v(n)
+
+  print *, v
+
+end subroutine
+
+subroutine parallel_computation(n)
+  implicit none
+  integer :: n
+  integer :: i, v(n)
+
+  !$omp parallel
+  !$omp single
+  do i=1, n
+    !$omp task depend(out: v(i))
+      call set_an_element(v(i), i)
+    !$omp end task
+  enddo
+
+  !$omp task depend(iterator(it = 1:n), in: v(it))
+  !!$omp task depend(in: v(1:n)) Violates Array section restriction.
+    call print_all_elements(v, n)
+  !$omp end task
+
+  !$omp end single
+  !$omp end parallel
+end subroutine
diff --git a/sources/Example_task_dep.3.c b/sources/Example_task_dep.3.c
index e3147d4..eb79f00 100644
--- a/sources/Example_task_dep.3.c
+++ b/sources/Example_task_dep.3.c
@@ -6,8 +6,7 @@
 * @@expect: success
 */
 #include <stdio.h>
-int main()
-{
+int main() {
    int x;
    #pragma omp parallel
    #pragma omp single
diff --git a/sources/Example_task_dep.4.c b/sources/Example_task_dep.4.c
index f476e9e..32de6e5 100644
--- a/sources/Example_task_dep.4.c
+++ b/sources/Example_task_dep.4.c
@@ -6,8 +6,7 @@
 * @@expect: success
 */
 #include <stdio.h>
-int main()
-{
+int main() {
    int x = 1;
    #pragma omp parallel
    #pragma omp single
diff --git a/sources/Example_task_dep.4.f90 b/sources/Example_task_dep.4.f90
index 9106625..69512f3 100644
--- a/sources/Example_task_dep.4.f90
+++ b/sources/Example_task_dep.4.f90
@@ -3,20 +3,27 @@
 ! @@compilable: yes
 ! @@linkable: yes
 ! @@expect: success
+
 program example
    integer :: x
+   x = 1
+
    !$omp parallel
    !$omp single
+
    !$omp task shared(x) depend(out: x)
       x = 2
    !$omp end task
+
    !$omp task shared(x) depend(in: x)
       print*, "x + 1 = ", x+1, "."
    !$omp end task
+
    !$omp task shared(x) depend(in: x)
       print*, "x + 2 = ", x+2, "."
!$omp end task + !$omp end single !$omp end parallel end program diff --git a/sources/Example_task_dep.6.c b/sources/Example_task_dep.6.c new file mode 100644 index 0000000..af2874f --- /dev/null +++ b/sources/Example_task_dep.6.c @@ -0,0 +1,40 @@ +/* +* @@name: task_depend.6.c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ + +#include + +void foo() +{ + int x = 0, y = 2; + + #pragma omp task depend(inout: x) shared(x) + x++; // 1st child task + + #pragma omp task shared(y) + y--; // 2nd child task + + #pragma omp taskwait depend(in: x) // 1st taskwait + + printf("x=%d\n",x); + + // Second task may not be finished. + // Accessing y here will create a race condition. + + #pragma omp taskwait // 2nd taskwait + + printf("y=%d\n",y); +} + +int main() +{ + #pragma omp parallel + #pragma omp single + foo(); + + return 0; +} diff --git a/sources/Example_task_dep.6.f90 b/sources/Example_task_dep.6.f90 new file mode 100644 index 0000000..1bb8c71 --- /dev/null +++ b/sources/Example_task_dep.6.f90 @@ -0,0 +1,43 @@ +! @@name: task_depend.6.f90 +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! @@expect: success + + +subroutine foo() + implicit none + integer :: x, y + + x = 0 + y = 2 + + !$omp task depend(inout: x) shared(x) + x = x + 1 !! 1st child task + !$omp end task + + !$omp task shared(y) + y = y - 1 !! 2nd child task + !$omp end task + + !$omp taskwait depend(in: x) !! 1st taskwait + + print*, "x=", x + + !! Second task may not be finished. + !! Accessing y here will create a race condition. + + !$omp taskwait !! 2nd taskwait + + print*, "y=", y + +end subroutine foo + +program p + implicit none + !$omp parallel + !$omp single + call foo() + !$omp end single + !$omp end parallel +end program p diff --git a/sources/Example_task_dep.7.c b/sources/Example_task_dep.7.c new file mode 100644 index 0000000..1089785 --- /dev/null +++ b/sources/Example_task_dep.7.c @@ -0,0 +1,41 @@ +/* +* @@name: task_depend.7.c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ + +#include + +void foo() +{ + int x = 0, y = 2; + + #pragma omp task depend(inout: x) shared(x) + x++; // 1st child task + + #pragma omp task depend(in: x) depend(inout: y) shared(x, y) + y -= x; // 2nd child task + + #pragma omp taskwait depend(in: x) // 1st taskwait + + printf("x=%d\n",x); + + // Second task may not be finished. + // Accessing y here would create a race condition. + + #pragma omp taskwait // 2nd taskwait + + printf("y=%d\n",y); + +} + +int main() +{ + #pragma omp parallel + #pragma omp single + foo(); + + return 0; +} diff --git a/sources/Example_task_dep.7.f90 b/sources/Example_task_dep.7.f90 new file mode 100644 index 0000000..3980b46 --- /dev/null +++ b/sources/Example_task_dep.7.f90 @@ -0,0 +1,43 @@ +! @@name: task_depend.7.f90 +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! @@expect: success + + +subroutine foo() +implicit none +integer :: x, y + + x = 0 + y = 2 + + !$omp task depend(inout: x) shared(x) + x = x + 1 !! 1st child task + !$omp end task + + !$omp task depend(in: x) depend(inout: y) shared(x, y) + y = y - x !! 2nd child task + !$omp end task + + !$omp taskwait depend(in: x) !! 1st taskwait + + print*, "x=", x + + !! Second task may not be finished. + !! Accessing y here would create a race condition. + + !$omp taskwait !! 
2nd taskwait + + print*, "y=", y + +end subroutine foo + +program p +implicit none + !$omp parallel + !$omp single + call foo() + !$omp end single + !$omp end parallel +end program p diff --git a/sources/Example_task_dep.8.c b/sources/Example_task_dep.8.c new file mode 100644 index 0000000..70b1527 --- /dev/null +++ b/sources/Example_task_dep.8.c @@ -0,0 +1,35 @@ +/* +* @@name: task_depend.8.c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ + +#include + +void foo() +{ + int x = 0, y = 2; + + #pragma omp task depend(inout: x) shared(x) + x++; // 1st child task + + #pragma omp task depend(in: x) depend(inout: y) shared(x, y) + y -= x; // 2st child task + + #pragma omp taskwait depend(in: x,y) + + printf("x=%d\n",x); + printf("y=%d\n",y); + +} + +int main() +{ + #pragma omp parallel + #pragma omp single + foo(); + + return 0; +} diff --git a/sources/Example_task_dep.8.f90 b/sources/Example_task_dep.8.f90 new file mode 100644 index 0000000..2987a05 --- /dev/null +++ b/sources/Example_task_dep.8.f90 @@ -0,0 +1,37 @@ +! @@name: task_depend.8.f90 +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! @@expect: success + + +subroutine foo() +implicit nonE +integer :: x, y + + x = 0 + y = 2 + + !$omp task depend(inout: x) shared(x) + x = x + 1 !! 1st child task + !$omp end task + + !$omp task depend(in: x) depend(inout: y) shared(x, y) + y = y - x !! 2nd child task + !$omp end task + + !$omp taskwait depend(in: x,y) + + print*, "x=", x + print*, "y=", y + +end subroutine foo + +program p +implicit none + !$omp parallel + !$omp single + call foo() + !$omp end single + !$omp end parallel +end program p diff --git a/sources/Example_task_dep.9.c b/sources/Example_task_dep.9.c new file mode 100644 index 0000000..2ffde0c --- /dev/null +++ b/sources/Example_task_dep.9.c @@ -0,0 +1,30 @@ +/* +* @@name: task_dep.6c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ +#include +int main() +{ + int a, b, c, d; + #pragma omp parallel + #pragma omp single + { + #pragma omp task depend(out: c) + c = 1; /* Task T1 */ + #pragma omp task depend(out: a) + a = 2; /* Task T2 */ + #pragma omp task depend(out: b) + b = 3; /* Task T3 */ + #pragma omp task depend(in: a) depend(mutexinoutset: c) + c += a; /* Task T4 */ + #pragma omp task depend(in: b) depend(mutexinoutset: c) + c += b; /* Task T5 */ + #pragma omp task depend(in: c) + d = c; /* Task T6 */ + } + printf("%d\n", d); + return 0; +} diff --git a/sources/Example_task_dep.9.f90 b/sources/Example_task_dep.9.f90 new file mode 100644 index 0000000..cdf4ce2 --- /dev/null +++ b/sources/Example_task_dep.9.f90 @@ -0,0 +1,31 @@ +! @@name: task_dep.6f +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! @@expect: success +program example + integer :: a, b, c, d + !$omp parallel + !$omp single + !$omp task depend(out: c) + c = 1 ! Task T1 + !$omp end task + !$omp task depend(out: a) + a = 2 ! Task T2 + !$omp end task + !$omp task depend(out: b) + b = 3 ! Task T3 + !$omp end task + !$omp task depend(in: a) depend(mutexinoutset: c) + c = c + a ! Task T4 + !$omp end task + !$omp task depend(in: b) depend(mutexinoutset: c) + c = c + b ! Task T5 + !$omp end task + !$omp task depend(in: c) + d = c ! 
Task T6 + !$omp end task + !$omp end single + !$omp end parallel + print *, d +end program diff --git a/sources/Example_task_reduction.1.c b/sources/Example_task_reduction.1.c new file mode 100644 index 0000000..e46aa4c --- /dev/null +++ b/sources/Example_task_reduction.1.c @@ -0,0 +1,65 @@ +/* +* @@name: task_reduction.1c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ + +#include +#include +#define N 10 + +typedef struct node_tag { + int val; + struct node_tag *next; +} node_t; + +int linked_list_sum(node_t *p) +{ + int res = 0; + + #pragma omp taskgroup task_reduction(+: res) + { + node_t* aux = p; + while(aux != 0) + { + #pragma omp task in_reduction(+: res) + res += aux->val; + + aux = aux->next; + } + } + return res; +} + + +int main(int argc, char *argv[]) +{ + int i; +// Create the root node. + node_t* root = (node_t*) malloc(sizeof(node_t)); + root->val = 1; + + node_t* aux = root; + +// Create N-1 more nodes. + for(i=2;i<=N;++i) + { + aux->next = (node_t*) malloc(sizeof(node_t)); + aux = aux->next; + aux->val = i; + } + + aux->next = 0; + + #pragma omp parallel + #pragma omp single + { + int result = linked_list_sum(root); + printf( "Calculated: %d Analytic:%d\n", result, (N*(N+1)/2) ); + } + + return 0; +} + diff --git a/sources/Example_task_reduction.1.f90 b/sources/Example_task_reduction.1.f90 new file mode 100644 index 0000000..2103212 --- /dev/null +++ b/sources/Example_task_reduction.1.f90 @@ -0,0 +1,72 @@ +! @@name: task_reduction.1f90 +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! @@expect: success + +module m + type node_t + integer :: val + type(node_t), pointer :: next + end type +end module m + +function linked_list_sum(p) result(res) + use m + implicit none + type(node_t), pointer :: p + type(node_t), pointer :: aux + integer :: res + + res = 0 + + !$omp taskgroup task_reduction(+: res) + aux => p + do while (associated(aux)) + !$omp task in_reduction(+: res) + res = res + aux%val + !$omp end task + aux => aux%next + end do + !$omp end taskgroup +end function linked_list_sum + + +program main + use m + implicit none + type(node_t), pointer :: root, aux + integer :: res, i + integer, parameter :: N=10 + + interface + function linked_list_sum(p) result(res) + use m + implicit none + type(node_t), pointer :: p + integer :: res + end function + end interface +! Create the root node. + allocate(root) + root%val = 1 + aux => root + +! Create N-1 more nodes. + do i = 2,N + allocate(aux%next) + aux => aux%next + aux%val = i + end do + + aux%next => null() + + !$omp parallel + !$omp single + res = linked_list_sum(root) + print *, "Calculated:", res, " Analytic:", (N*(N+1))/2 + !$omp end single + !$omp end parallel + +end program main + diff --git a/sources/Example_tasking.1.c b/sources/Example_tasking.1.c index b8b855c..c4ab84b 100644 --- a/sources/Example_tasking.1.c +++ b/sources/Example_tasking.1.c @@ -5,12 +5,16 @@ * @@linkable: no * @@expect: success */ + struct node { struct node *left; struct node *right; }; + extern void process(struct node *); -void traverse( struct node *p ) { + +void traverse( struct node *p ) +{ if (p->left) #pragma omp task // p is firstprivate by default traverse(p->left); diff --git a/sources/Example_tasking.1.f90 b/sources/Example_tasking.1.f90 index bbc3a04..fa9aee3 100644 --- a/sources/Example_tasking.1.f90 +++ b/sources/Example_tasking.1.f90 @@ -3,11 +3,13 @@ ! @@compilable: yes ! @@linkable: no ! 
@@expect: success + RECURSIVE SUBROUTINE traverse ( P ) TYPE Node TYPE(Node), POINTER :: left, right END TYPE Node TYPE(Node) :: P + IF (associated(P%left)) THEN !$OMP TASK ! P is firstprivate by default CALL traverse(P%left) @@ -19,4 +21,5 @@ !$OMP END TASK ENDIF CALL process ( P ) + END SUBROUTINE diff --git a/sources/Example_tasking.3.c b/sources/Example_tasking.3.c index c83d36c..fe30eab 100644 --- a/sources/Example_tasking.3.c +++ b/sources/Example_tasking.3.c @@ -5,6 +5,7 @@ * @@linkable: no * @@expect: success */ + typedef struct node node; struct node { int data; @@ -15,6 +16,7 @@ void process(node * p) { /* do work here */ } + void increment_list_items(node * head) { #pragma omp parallel diff --git a/sources/Example_tasking.3.f90 b/sources/Example_tasking.3.f90 index ccf1741..98d6615 100644 --- a/sources/Example_tasking.3.f90 +++ b/sources/Example_tasking.3.f90 @@ -3,17 +3,21 @@ ! @@compilable: yes ! @@linkable: no ! @@expect: success + MODULE LIST TYPE NODE INTEGER :: PAYLOAD TYPE (NODE), POINTER :: NEXT END TYPE NODE CONTAINS + SUBROUTINE PROCESS(p) TYPE (NODE), POINTER :: P ! do work here END SUBROUTINE + SUBROUTINE INCREMENT_LIST_ITEMS (HEAD) + TYPE (NODE), POINTER :: HEAD TYPE (NODE), POINTER :: P !$OMP PARALLEL PRIVATE(P) @@ -29,5 +33,7 @@ END DO !$OMP END SINGLE !$OMP END PARALLEL + END SUBROUTINE + END MODULE diff --git a/sources/Example_taskloop.2.c b/sources/Example_taskloop.2.c new file mode 100644 index 0000000..e135ef7 --- /dev/null +++ b/sources/Example_taskloop.2.c @@ -0,0 +1,33 @@ +/* +* @@name: taskloop.2c +* @@type: C +* @@compilable: yes +* @@linkable: no +* @@expect: success +*/ +#include + +#define T 16 +#define N 1024 + +void parallel_work() { + int x1 = 0, x2 = 0; + + #pragma omp parallel shared(x1,x2) num_threads(T) + { + #pragma omp taskloop + for (int i = 0; i < N; ++i) { + #pragma omp atomic + x1++; // executed T*N times + } + + #pragma omp single + #pragma omp taskloop + for (int i = 0; i < N; ++i) { + #pragma omp atomic + x2++; // executed N times + } + } + + printf("x1 = %d, x2 = %d\n", x1, x2); +} diff --git a/sources/Example_taskloop.2.f90 b/sources/Example_taskloop.2.f90 new file mode 100644 index 0000000..7ab507b --- /dev/null +++ b/sources/Example_taskloop.2.f90 @@ -0,0 +1,36 @@ +! @@name: taskloop.2f +! @@type: F-free +! @@compilable: yes +! @@linkable: no +! @@expect: success +subroutine parallel_work + implicit none + integer :: x1, x2 + integer :: i + integer, parameter :: T = 16 + integer, parameter :: N = 1024 + + x1 = 0 + x2 = 0 + !$omp parallel shared(x1,x2) num_threads(T) + !$omp taskloop + do i = 1,N + !$omp atomic + x1 = x1 + 1 ! executed T*N times + !$omp end atomic + end do + !$omp end taskloop + + !$omp single + !$omp taskloop + do i = 1,N + !$omp atomic + x2 = x2 + 1 ! 
executed N times + !$omp end atomic + end do + !$omp end taskloop + !$omp end single + !$omp end parallel + + write (*,'(A,I0,A,I0)') 'x1 = ', x1, ', x2 = ',x2 +end subroutine diff --git a/sources/Example_taskloop_reduction.1.c b/sources/Example_taskloop_reduction.1.c new file mode 100644 index 0000000..c592b27 --- /dev/null +++ b/sources/Example_taskloop_reduction.1.c @@ -0,0 +1,32 @@ +/* +* @@name: taskloop_reduction.1.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ +#include + +int array_sum(int n, int *v) { + int i; + int res = 0; + + #pragma omp taskloop reduction(+: res) + for(i = 0; i < n; ++i) + res += v[i]; + + return res; +} + +int main(int argc, char *argv[]) { + int n = 10; + int v[10] = {1,2,3,4,5,6,7,8,9,10}; + + #pragma omp parallel + #pragma omp single + { + int res = array_sum(n, v); + printf("The result is %d\n", res); + } + return 0; +} diff --git a/sources/Example_taskloop_reduction.1.f90 b/sources/Example_taskloop_reduction.1.f90 new file mode 100644 index 0000000..5346330 --- /dev/null +++ b/sources/Example_taskloop_reduction.1.f90 @@ -0,0 +1,38 @@ +! @@name: taskloop_reduction.1.f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! @@expect: success +function array_sum(n, v) result(res) + implicit none + integer :: n, v(n), res + integer :: i + + res = 0 + !$omp taskloop reduction(+: res) + do i=1, n + res = res + v(i) + end do + !$omp end taskoop + +end function array_sum + +program main + implicit none + integer :: n, v(10), res + integer :: i + + integer, external :: array_sum + + n = 10 + do i=1, n + v(i) = i + end do + + !$omp parallel + !$omp single + res = array_sum(n, v) + print *, "The result is", res + !$omp end single + !$omp end parallel +end program main diff --git a/sources/Example_taskloop_reduction.2.c b/sources/Example_taskloop_reduction.2.c new file mode 100644 index 0000000..19f8bb8 --- /dev/null +++ b/sources/Example_taskloop_reduction.2.c @@ -0,0 +1,40 @@ +/* +* @@name: taskloop_reduction.2.c +* @@type: C +* @@compilable: yes, omp_5.0 +* @@linkable: yes +* @@expect: success +*/ +#include + +int array_sum(int n, int *v) { + int i; + int res = 0; + + #pragma omp taskgroup task_reduction(+: res) + { + if (n > 0) { + #pragma omp task in_reduction(+: res) + res = res + v[0]; + + #pragma omp taskloop in_reduction(+: res) nogroup + for(i = 1; i < n; ++i) + res += v[i]; + } + } + + return res; +} + +int main(int argc, char *argv[]) { + int n = 10; + int v[10] = {1,2,3,4,5,6,7,8,9,10}; + + #pragma omp parallel + #pragma omp single + { + int res = array_sum(n, v); + printf("The result is %d\n", res); + } + return 0; +} diff --git a/sources/Example_taskloop_reduction.2.f90 b/sources/Example_taskloop_reduction.2.f90 new file mode 100644 index 0000000..b2ce6b2 --- /dev/null +++ b/sources/Example_taskloop_reduction.2.f90 @@ -0,0 +1,46 @@ +! @@name: taskloop_reduction.2.f90 +! @@type: F-free +! @@compilable: yes, omp_5.0 +! @@linkable: yes +! 
@@expect: success +function array_sum(n, v) result(res) + implicit none + integer :: n, v(n), res + integer :: i + + res = 0 + !$omp taskgroup task_reduction(+: res) + if (n > 0) then + !$omp task in_reduction(+: res) + res = res + v(1) + !$omp end task + + !$omp taskloop in_reduction(+: res) nogroup + do i=2, n + res = res + v(i) + end do + !$omp end taskoop + endif + !$omp end taskgroup + +end function array_sum + +program main + implicit none + integer :: n, v(10), res + integer :: i + + integer, external :: array_sum + + n = 10 + do i=1, n + v(i) = i + end do + + !$omp parallel + !$omp single + res = array_sum(n, v) + print *, "The result is", res + !$omp end single + !$omp end parallel +end program main diff --git a/sources/Example_taskloop_simd_reduction.1.c b/sources/Example_taskloop_simd_reduction.1.c new file mode 100644 index 0000000..1708239 --- /dev/null +++ b/sources/Example_taskloop_simd_reduction.1.c @@ -0,0 +1,53 @@ +/* +* @@name: taskloop_simd_reduction.1c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ + +#include +#define N 100 + +int main(){ + int i, a[N], asum=0; + + for(i=0;i +#include + +struct point { + int x; + int y; +}; + +#pragma omp declare reduction(min : struct point : \ + omp_out.x = omp_in.x > omp_out.x ? omp_out.x : omp_in.x, \ + omp_out.y = omp_in.y > omp_out.y ? omp_out.y : omp_in.y ) \ + initializer( omp_priv = { INT_MAX, INT_MAX } ) + +#pragma omp declare reduction(max : struct point : \ + omp_out.x = omp_in.x < omp_out.x ? omp_out.x : omp_in.x, \ + omp_out.y = omp_in.y < omp_out.y ? omp_out.y : omp_in.y ) \ + initializer( omp_priv = { 0, 0 } ) + +void find_enclosing_rectangle ( int n, struct point points[] ) +{ + struct point minp = { INT_MAX, INT_MAX }, maxp = {0,0}; + int i; + +#pragma omp parallel for reduction(min:minp) reduction(max:maxp) + for ( i = 0; i < n; i++ ) { + if ( points[i].x < minp.x ) minp.x = points[i].x; + if ( points[i].y < minp.y ) minp.y = points[i].y; + if ( points[i].x > maxp.x ) maxp.x = points[i].x; + if ( points[i].y > maxp.y ) maxp.y = points[i].y; + } + printf("min = (%d, %d)\n", minp.x, minp.y); + printf("max = (%d, %d)\n", maxp.x, maxp.y); +} diff --git a/sources/Example_udr.2.f90 b/sources/Example_udr.2.f90 new file mode 100644 index 0000000..461ba8d --- /dev/null +++ b/sources/Example_udr.2.f90 @@ -0,0 +1,44 @@ +! @@name: udr.2.f90 +! @@type: F-free +! @@compilable: yes +! @@linkable: no +! 
@@expect: success +module data_type + + type :: point + integer :: x + integer :: y + end type + +end module data_type + +subroutine find_enclosing_rectangle ( n, points ) + use data_type + implicit none + integer :: n + type(point) :: points(*) + + !$omp declare reduction( min : point : & + !$omp& omp_out = point(min( omp_out%x, omp_in%x ), & + !$omp& min( omp_out%y, omp_in%y )) ) & + !$omp& initializer( omp_priv = point( HUGE(0), HUGE(0) ) ) + + !$omp declare reduction( max : point : & + !$omp& omp_out = point(max( omp_out%x, omp_in%x ), & + !$omp& max( omp_out%y, omp_in%y )) ) & + !$omp& initializer( omp_priv = point( 0, 0 ) ) + + type(point) :: minp = point( HUGE(0), HUGE(0) ), maxp = point( 0, 0 ) + integer :: i + + !$omp parallel do reduction(min: minp) reduction(max: maxp) + do i = 1, n + minp%x = min(minp%x, points(i)%x) + minp%y = min(minp%y, points(i)%y) + maxp%x = max(maxp%x, points(i)%x) + maxp%y = max(maxp%y, points(i)%y) + end do + print *, "min = (", minp%x, minp%y, ")" + print *, "max = (", maxp%x, maxp%y, ")" + +end subroutine diff --git a/sources/Example_udr.3.c b/sources/Example_udr.3.c new file mode 100644 index 0000000..07af689 --- /dev/null +++ b/sources/Example_udr.3.c @@ -0,0 +1,71 @@ +/* +* @@name: udr.3.c +* @@type: C +* @@compilable: yes +* @@linkable: yes +* @@expect: success +*/ + +#include +#define N 100 + +struct mx_s { + float value; + int index; +}; + + +/* prototype functions for combiner and initializer in + the declare reduction */ + +void mx_combine(struct mx_s *out, struct mx_s *in); + +void mx_init(struct mx_s *priv, struct mx_s *orig); + +#pragma omp declare reduction(maxloc: struct mx_s: \ + mx_combine(&omp_out, &omp_in)) \ + initializer(mx_init(&omp_priv, &omp_orig)) + +void mx_combine(struct mx_s *out, struct mx_s *in) +{ + if ( out->value < in->value ) { + out->value = in->value; + out->index = in->index; + } +} + + +void mx_init(struct mx_s *priv, struct mx_s *orig) +{ + priv->value = orig->value; + priv->index = orig->index; +} + + +int main(void) +{ + struct mx_s mx; + float val[N], d; + int i, count = N; + + for (i = 0; i < count; i++) { + d = (N*0.8f - i); + val[i] = N * N - d * d; + } + + mx.value = val[0]; + mx.index = 0; + #pragma omp parallel for reduction(maxloc: mx) + for (i = 1; i < count; i++) { + if (mx.value < val[i]) + { + mx.value = val[i]; + mx.index = i; + } + } + + printf("max value = %g, index = %d\n", mx.value, mx.index); + /* prints 10000, 80 */ + + return 0; +} diff --git a/sources/Example_udr.3.f90 b/sources/Example_udr.3.f90 new file mode 100644 index 0000000..581444a --- /dev/null +++ b/sources/Example_udr.3.f90 @@ -0,0 +1,64 @@ +! @@name: udr.3.f90 +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! @@expect: success +program max_loc + implicit none + + type :: mx_s + real value + integer index + end type + + !$omp declare reduction(maxloc: mx_s: & + !$omp& mx_combine(omp_out, omp_in)) & + !$omp& initializer(mx_init(omp_priv, omp_orig)) + + integer, parameter :: N = 100 + type(mx_s) :: mx + real :: val(N), d + integer :: i, count + + count = N + do i = 1, count + d = N*0.8 - i + 1 + val(i) = N * N - d * d + enddo + + mx%value = val(1) + mx%index = 1 + !$omp parallel do reduction(maxloc: mx) + do i = 2, count + if (mx%value < val(i)) then + mx%value = val(i) + mx%index = i + endif + enddo + + print *, 'max value = ', mx%value, ' index = ', mx%index + ! 
prints 10000, 81 + + contains + + subroutine mx_combine(out, in) + implicit none + type(mx_s), intent(inout) :: out + type(mx_s), intent(in) :: in + + if ( out%value < in%value ) then + out%value = in%value + out%index = in%index + endif + end subroutine mx_combine + + subroutine mx_init(priv, orig) + implicit none + type(mx_s), intent(out) :: priv + type(mx_s), intent(in) :: orig + + priv%value = orig%value + priv%index = orig%index + end subroutine mx_init + +end program diff --git a/sources/Example_udr.4.f90 b/sources/Example_udr.4.f90 new file mode 100644 index 0000000..2444ae6 --- /dev/null +++ b/sources/Example_udr.4.f90 @@ -0,0 +1,58 @@ +! @@name: udr.4.f90 +! @@type: F-free +! @@compilable: yes +! @@linkable: yes +! @@expect: success +module data_red +! Declare data type. + type dt + real :: r1 + real :: r2 + end type + +! Declare the user-defined operator .add. + interface operator(.add.) + module procedure addc + end interface + +! Declare the user-defined reduction operator .add. +!$omp declare reduction(.add.:dt:omp_out=omp_out.add.omp_in) & +!$omp& initializer(dt_init(omp_priv)) + + contains +! Declare the initialization routine. + subroutine dt_init(u) + type(dt) :: u + u%r1 = 0.0 + u%r2 = 0.0 + end subroutine + +! Declare the specific procedure for the .add. operator. + function addc(x1, x2) result(xresult) + type(dt), intent(in) :: x1, x2 + type(dt) :: xresult + xresult%r1 = x1%r1 + x2%r2 + xresult%r2 = x1%r2 + x2%r1 + end function + +end module data_red + +program main + use data_red, only : dt, dt_init, operator(.add.) + + type(dt) :: xdt1, xdt2 + integer :: i + + xdt1 = dt(1.0,2.0) + xdt2 = dt(2.0,3.0) + +! The reduction operation +!$omp parallel do reduction(.add.: xdt1) + do i = 1, 10 + xdt1 = xdt1 .add. xdt2 + end do +!$omp end parallel do + + print *, xdt1 + +end program diff --git a/sources/Example_udr.5.cpp b/sources/Example_udr.5.cpp new file mode 100644 index 0000000..9695f22 --- /dev/null +++ b/sources/Example_udr.5.cpp @@ -0,0 +1,21 @@ +/* +* @@name: udr.5.cpp +* @@type: C++ +* @@compilable: no +* @@linkable: no +* @@expect: success +*/ +class V { + float *p; + int n; + +public: + V( int _n ) : n(_n) { p = new float[n]; } + V( const V& m ) : n(m.n) { p = new float[n]; } + ~V() { delete[] p; } + + V& operator+= ( const V& ); + + #pragma omp declare reduction( + : V : omp_out += omp_in ) \ + initializer(omp_priv(omp_orig)) +}; diff --git a/sources/Example_udr.6.cpp b/sources/Example_udr.6.cpp new file mode 100644 index 0000000..cba1a03 --- /dev/null +++ b/sources/Example_udr.6.cpp @@ -0,0 +1,20 @@ +/* +* @@name: udr.6.cpp +* @@type: C++ +* @@compilable: no +* @@linkable: no +* @@expect: success +*/ +#include +#include +#include + +#pragma omp declare reduction( + : std::vector : \ + std::transform (omp_out.begin(), omp_out.end(), \ + omp_in.begin(), omp_in.end(),std::plus())) + +#pragma omp declare reduction( merge : std::vector : \ + omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) + +#pragma omp declare reduction( merge : std::list : \ + omp_out.merge(omp_in))
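// A minimal usage sketch of the 'merge' reduction declared above for
// std::vector; the loop bound and the 'evens' variable are illustrative
// only and not taken from the examples repo. Each thread's private copy
// is default-constructed (empty), elements are appended to it, and the
// private copies are concatenated into 'evens' on exit; the order of the
// concatenated pieces is unspecified.
#include <cstdio>
#include <vector>

#pragma omp declare reduction( merge : std::vector<int> : \
        omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end()))

int main()
{
   std::vector<int> evens;
   #pragma omp parallel for reduction(merge: evens)
   for (int i = 0; i < 100; i++)
      if (i % 2 == 0) evens.push_back(i);   // appends to the thread-private copy
   std::printf("collected %zu even values\n", evens.size());
   return 0;
}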