From 910c4b4dcd740f231915739f1b685164eeb6f378 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Krup=C4=8D=C3=ADk?=
Date: Thu, 15 Oct 2020 11:36:18 +0200
Subject: [PATCH] modified: g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED-2.3.8.eb
 modified: g/GROMACS/GROMACS-2020.2-intel-2020a-PLUMED-2.6.1.eb
 deleted: g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED-2.4.1.eb
 deleted: g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED.eb
 deleted: g/GROMACS/GROMACS-2016.5-intel-2017b-serial.eb
 deleted: g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED.eb
 deleted: g/GROMACS/GROMACS-2018-intel-2017b-serial.eb
 deleted: g/GROMACS/GROMACS-2018.1-intel-2017c-hybrid-single-PLUMED.eb
 deleted: g/GROMACS/GROMACS-2018.3-intel-2017c-hybrid-single-PLUMED.eb
 deleted: g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.06-mpi.eb
 deleted: g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.11-mpi.eb
 deleted: g/GROMACS/GROMACS-4.6.7-CrayIntel-2015.11-mpi.eb
 deleted: g/GROMACS/GROMACS-5.0.4-gompi-2015e-hybrid-single-PLUMED.eb
 deleted: g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single-PLUMED.eb
 deleted: g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single.eb
 deleted: g/GROMACS/GROMACS-5.1.2-goolf-1.7.20-mt.eb
 deleted: g/GROMACS/GROMACS-5.1.4-foss-2017a-hybrid-single-PLUMED.eb
 deleted: g/GROMACS/gromacs-4.6.7-plumed-2.1.3-mpi.patch
 deleted: g/GROMACS/gromacs-5.0.4-plumed-2.1.3-mpi.patch
 deleted: g/GROMACS/gromacs-5.0.4-plumed-2.1.3.patch
 deleted: g/GROMACS/gromacs-5.1.4-plumed-2.3.0-mpi.patch
---
 ...-intel-2017c-hybrid-single-PLUMED-2.4.1.eb | 36 -
 ...2016.4-intel-2017c-hybrid-single-PLUMED.eb | 36 -
 .../GROMACS-2016.5-intel-2017b-serial.eb | 33 -
 ...-intel-2017c-hybrid-single-PLUMED-2.3.8.eb | 5 -
 ...2016.5-intel-2017c-hybrid-single-PLUMED.eb | 36 -
 g/GROMACS/GROMACS-2018-intel-2017b-serial.eb | 32 -
 ...2018.1-intel-2017c-hybrid-single-PLUMED.eb | 36 -
 ...2018.3-intel-2017c-hybrid-single-PLUMED.eb | 36 -
 ...GROMACS-2020.2-intel-2020a-PLUMED-2.6.1.eb | 4 -
 .../GROMACS-4.6.7-CrayGNU-2015.06-mpi.eb | 45 -
 .../GROMACS-4.6.7-CrayGNU-2015.11-mpi.eb | 45 -
 .../GROMACS-4.6.7-CrayIntel-2015.11-mpi.eb | 45 -
 ...-5.0.4-gompi-2015e-hybrid-single-PLUMED.eb | 26 -
 ...-5.0.4-ictce-7.3.5-hybrid-single-PLUMED.eb | 26 -
 ...GROMACS-5.0.4-ictce-7.3.5-hybrid-single.eb | 24 -
 g/GROMACS/GROMACS-5.1.2-goolf-1.7.20-mt.eb | 42 -
 ...S-5.1.4-foss-2017a-hybrid-single-PLUMED.eb | 38 -
 .../gromacs-4.6.7-plumed-2.1.3-mpi.patch | 9676 -----------------
 .../gromacs-5.0.4-plumed-2.1.3-mpi.patch | 9575 ----------------
 g/GROMACS/gromacs-5.0.4-plumed-2.1.3.patch | 9575 ----------------
 .../gromacs-5.1.4-plumed-2.3.0-mpi.patch | 9575 ----------------
 21 files changed, 38946 deletions(-)
 delete mode 100644 g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED-2.4.1.eb
 delete mode 100644 g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED.eb
 delete mode 100644 g/GROMACS/GROMACS-2016.5-intel-2017b-serial.eb
 delete mode 100644 g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED.eb
 delete mode 100644 g/GROMACS/GROMACS-2018-intel-2017b-serial.eb
 delete mode 100644 g/GROMACS/GROMACS-2018.1-intel-2017c-hybrid-single-PLUMED.eb
 delete mode 100644 g/GROMACS/GROMACS-2018.3-intel-2017c-hybrid-single-PLUMED.eb
 delete mode 100644 g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.06-mpi.eb
 delete mode 100644 g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.11-mpi.eb
 delete mode 100644 g/GROMACS/GROMACS-4.6.7-CrayIntel-2015.11-mpi.eb
 delete mode 100644 g/GROMACS/GROMACS-5.0.4-gompi-2015e-hybrid-single-PLUMED.eb
 delete mode 100644
g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single-PLUMED.eb delete mode 100644 g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single.eb delete mode 100644 g/GROMACS/GROMACS-5.1.2-goolf-1.7.20-mt.eb delete mode 100644 g/GROMACS/GROMACS-5.1.4-foss-2017a-hybrid-single-PLUMED.eb delete mode 100644 g/GROMACS/gromacs-4.6.7-plumed-2.1.3-mpi.patch delete mode 100644 g/GROMACS/gromacs-5.0.4-plumed-2.1.3-mpi.patch delete mode 100644 g/GROMACS/gromacs-5.0.4-plumed-2.1.3.patch delete mode 100644 g/GROMACS/gromacs-5.1.4-plumed-2.3.0-mpi.patch diff --git a/g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED-2.4.1.eb b/g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED-2.4.1.eb deleted file mode 100644 index 52cb2eef..00000000 --- a/g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED-2.4.1.eb +++ /dev/null @@ -1,36 +0,0 @@ -# IT4Innovations 2018 - -name = 'GROMACS' -version = '2016.4' -versionsuffix = '-hybrid-single-PLUMED-2.4.1' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'intel', 'version': '2017c'} -toolchainopts = {'openmp': True, 'usempi': True} - -source_urls = ['ftp://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -#preconfigopts = 'plumed patch -p -e gromacs-2016.5 &&' -#preconfigopts = 'plumed patch -p --runtime &&' - -configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2_256 -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' - -builddependencies = [ - ('CMake', '3.5.2', '', True), -] - -dependencies = [ - ('Boost', '1.67.0', '-serial'), - ('PLUMED', '2.4.1') -] - -sanity_check_paths = { - 'files': ['bin/gmx_mpi'], - 'dirs': [''], -} - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED.eb b/g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED.eb deleted file mode 100644 index c9497bdf..00000000 --- a/g/GROMACS/GROMACS-2016.4-intel-2017c-hybrid-single-PLUMED.eb +++ /dev/null @@ -1,36 +0,0 @@ -# IT4Innovations 2018 - -name = 'GROMACS' -version = '2016.4' -versionsuffix = '-hybrid-single-PLUMED' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. 
simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'intel', 'version': '2017c'} -toolchainopts = {'openmp': True, 'usempi': True} - -source_urls = ['http://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -#preconfigopts = 'plumed patch -p -e gromacs-2016.5 &&' -#preconfigopts = 'plumed patch -p --runtime &&' - -configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2_256 -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' - -builddependencies = [ - ('CMake', '3.5.2', '', True), -] - -dependencies = [ - ('Boost', '1.67.0', '-serial'), - ('PLUMED', '2.3.5') -] - -sanity_check_paths = { - 'files': ['bin/gmx_mpi'], - 'dirs': [''], -} - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-2016.5-intel-2017b-serial.eb b/g/GROMACS/GROMACS-2016.5-intel-2017b-serial.eb deleted file mode 100644 index 5b7e47d2..00000000 --- a/g/GROMACS/GROMACS-2016.5-intel-2017b-serial.eb +++ /dev/null @@ -1,33 +0,0 @@ -name = 'GROMACS' -version = '2016.5' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'intel', 'version': '2017b'} -toolchainopts = {'openmp': True, 'usempi': True} - -source_urls = ['http://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -checksums = ['f41807e5b2911ccb547a3fd11f105d47'] - -#configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2_256 -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' -# anselm -configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=SSE4.1 -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' - -builddependencies = [ - ('CMake', '3.9.1', '', True), -] - -dependencies = [ - ('Boost', '1.66.0', '-serial'), -] - -sanity_check_paths = { - 'files': ['bin/gmx_mpi'], - 'dirs': [''], -} - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED-2.3.8.eb b/g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED-2.3.8.eb index da56a60b..f32b1be9 100644 --- a/g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED-2.3.8.eb +++ b/g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED-2.3.8.eb @@ -15,11 +15,6 @@ toolchainopts = {'openmp': True, 'usempi': True} source_urls = ['http://ftp.gromacs.org/pub/gromacs/'] sources = [SOURCELOWER_TAR_GZ] -#preconfigopts = 'plumed patch -p -e gromacs-2016.5 &&' -#preconfigopts = 'plumed patch -p --runtime &&' - -#configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2_256 -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' - builddependencies = [ ('CMake', '3.5.2', '', True), ] diff --git a/g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED.eb b/g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED.eb deleted file mode 100644 index bff8ea2f..00000000 --- a/g/GROMACS/GROMACS-2016.5-intel-2017c-hybrid-single-PLUMED.eb +++ /dev/null @@ -1,36 +0,0 @@ -# IT4Innovations 2018 - -name = 'GROMACS' -version = '2016.5' -versionsuffix = '-hybrid-single-PLUMED' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. 
simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'intel', 'version': '2017c'} -toolchainopts = {'openmp': True, 'usempi': True} - -source_urls = ['ftp://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -#preconfigopts = 'plumed patch -p -e gromacs-2016.5 &&' -#preconfigopts = 'plumed patch -p --runtime &&' - -configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2_256 -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' - -builddependencies = [ - ('CMake', '3.5.2', '', True), -] - -dependencies = [ - ('Boost', '1.67.0', '-serial'), - ('PLUMED', '2.3.5') -] - -sanity_check_paths = { - 'files': ['bin/gmx_mpi'], - 'dirs': [''], -} - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-2018-intel-2017b-serial.eb b/g/GROMACS/GROMACS-2018-intel-2017b-serial.eb deleted file mode 100644 index 4ef68757..00000000 --- a/g/GROMACS/GROMACS-2018-intel-2017b-serial.eb +++ /dev/null @@ -1,32 +0,0 @@ -name = 'GROMACS' -version = '2018' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'intel', 'version': '2017b'} -toolchainopts = {'openmp': True, 'usempi': True} - -source_urls = ['http://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -checksums = ['6467ffb1575b8271548a13abfba6374c'] - -#configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2_256 -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' -configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=AUTO -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' - -builddependencies = [ - ('CMake', '3.9.1', '', True), -] - -dependencies = [ - ('Boost', '1.66.0', '-serial'), -] - -sanity_check_paths = { - 'files': ['bin/gmx_mpi'], - 'dirs': [''], -} - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-2018.1-intel-2017c-hybrid-single-PLUMED.eb b/g/GROMACS/GROMACS-2018.1-intel-2017c-hybrid-single-PLUMED.eb deleted file mode 100644 index 0ca827a4..00000000 --- a/g/GROMACS/GROMACS-2018.1-intel-2017c-hybrid-single-PLUMED.eb +++ /dev/null @@ -1,36 +0,0 @@ -# IT4Innovations 2018 - -name = 'GROMACS' -version = '2018.1' -versionsuffix = '-hybrid-single-PLUMED' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. 
simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'intel', 'version': '2017c'} -toolchainopts = {'openmp': True, 'usempi': True} - -source_urls = ['http://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -#preconfigopts = 'plumed patch -p -e gromacs-2016.5 &&' -#preconfigopts = 'plumed patch -p --runtime &&' - -configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2_256 -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' - -builddependencies = [ - ('CMake', '3.13.1', '', True), -] - -dependencies = [ - ('Boost', '1.68.0', '-serial'), - ('PLUMED', '2.4.2') -] - -sanity_check_paths = { - 'files': ['bin/gmx_mpi'], - 'dirs': [''], -} - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-2018.3-intel-2017c-hybrid-single-PLUMED.eb b/g/GROMACS/GROMACS-2018.3-intel-2017c-hybrid-single-PLUMED.eb deleted file mode 100644 index 26af7ea9..00000000 --- a/g/GROMACS/GROMACS-2018.3-intel-2017c-hybrid-single-PLUMED.eb +++ /dev/null @@ -1,36 +0,0 @@ -# IT4Innovations 2018 - -name = 'GROMACS' -version = '2018.3' -versionsuffix = '-hybrid-single-PLUMED' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'intel', 'version': '2017c'} -toolchainopts = {'openmp': True, 'usempi': True} - -source_urls = ['http://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -#preconfigopts = 'plumed patch -p -e gromacs-2016.5 &&' -#preconfigopts = 'plumed patch -p --runtime &&' - -configopts = ' -DGMX_GPU=OFF -DBUILD_SHARED_LIBS=OFF -DGMX_PREFER_STATIC_LIBS=ON -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2_256 -DGMX_BUILD_OWN_FFTW=ON -DGMX_MPI=ON' - -builddependencies = [ - ('CMake', '3.9.1', '', True), -] - -dependencies = [ - ('Boost', '1.68.0', '-serial'), - ('PLUMED', '2.4.2') -] - -sanity_check_paths = { - 'files': ['bin/gmx_mpi'], - 'dirs': [''], -} - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-2020.2-intel-2020a-PLUMED-2.6.1.eb b/g/GROMACS/GROMACS-2020.2-intel-2020a-PLUMED-2.6.1.eb index 97f9cc0d..f856912b 100644 --- a/g/GROMACS/GROMACS-2020.2-intel-2020a-PLUMED-2.6.1.eb +++ b/g/GROMACS/GROMACS-2020.2-intel-2020a-PLUMED-2.6.1.eb @@ -22,10 +22,6 @@ source_urls = [ 'ftp://ftp.gromacs.org/pub/gromacs/', ] sources = [SOURCELOWER_TAR_GZ] -#atches = [ -# 'GROMACS-2018_fix_search_for_nvml_include.patch', -# 'GROMACS-2018_amend_search_for_nvml_lib.patch', -# builddependencies = [ ('CMake', '3.16.4'), diff --git a/g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.06-mpi.eb b/g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.06-mpi.eb deleted file mode 100644 index b69a2609..00000000 --- a/g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.06-mpi.eb +++ /dev/null @@ -1,45 +0,0 @@ -## -# This file is an EasyBuild reciPY as per https://github.com/hpcugent/easybuild -# -# Copyright:: Copyright 2012-2013 University of Luxembourg / LCSB, Cyprus Institute / CaSToRC, Ghent University -# Authors:: Wiktor Jurkowski , Fotis Georgatos , \ -# George Tsouloupas , Kenneth Hoste -# License:: MIT/GPL -# $Id$ -# -# This work implements a part of the HPCBIOS project and is a local_component of the policy: -# http://hpcbios.readthedocs.org/en/latest/HPCBIOS_2012-93.html -## -name = 'GROMACS' -version = '4.6.7' -versionsuffix = '-mpi' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. 
simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'CrayGNU', 'version': '2015.06'} -toolchainopts = {'usempi': True} - -# eg. ftp://ftp.gromacs.org/pub/gromacs/gromacs-4.6.tar.gz -source_urls = [ - 'ftp://ftp.gromacs.org/pub/gromacs/', # GROMACS sources - 'http://gerrit.gromacs.org/download/', # regression tests sources -] -sources = [ - SOURCELOWER_TAR_GZ, - 'regressiontests-%(version)s.tar.gz', -] - -preconfigopts = "export CMAKE_LIBRARY_PATH=$CMAKE_LIBRARY_PATH:${EBROOTFFTW}/lib && " -preconfigopts += "export CMAKE_INCLUDE_PATH=$CMAKE_INCLUDE_PATH:${EBROOTFFTW}/include && " - -dependencies = [ - ('fftw/3.3.4.3', EXTERNAL_MODULE), -] - -builddependencies = [('CMake', '3.2.2')] - -runtest = False - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.11-mpi.eb b/g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.11-mpi.eb deleted file mode 100644 index 3ef3a4e4..00000000 --- a/g/GROMACS/GROMACS-4.6.7-CrayGNU-2015.11-mpi.eb +++ /dev/null @@ -1,45 +0,0 @@ -## -# This file is an EasyBuild reciPY as per https://github.com/hpcugent/easybuild -# -# Copyright:: Copyright 2012-2013 University of Luxembourg / LCSB, Cyprus Institute / CaSToRC, Ghent University -# Authors:: Wiktor Jurkowski , Fotis Georgatos , \ -# George Tsouloupas , Kenneth Hoste -# License:: MIT/GPL -# $Id$ -# -# This work implements a part of the HPCBIOS project and is a local_component of the policy: -# http://hpcbios.readthedocs.org/en/latest/HPCBIOS_2012-93.html -## -name = 'GROMACS' -version = '4.6.7' -versionsuffix = '-mpi' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'CrayGNU', 'version': '2015.11'} -toolchainopts = {'usempi': True} - -# eg. ftp://ftp.gromacs.org/pub/gromacs/gromacs-4.6.tar.gz -source_urls = [ - 'ftp://ftp.gromacs.org/pub/gromacs/', # GROMACS sources - 'http://gerrit.gromacs.org/download/', # regression tests sources -] -sources = [ - SOURCELOWER_TAR_GZ, - 'regressiontests-%(version)s.tar.gz', -] - -preconfigopts = "export CMAKE_LIBRARY_PATH=$CMAKE_LIBRARY_PATH:$EBROOTFFTW/lib && " -preconfigopts += "export CMAKE_INCLUDE_PATH=$CMAKE_INCLUDE_PATH:$EBROOTFFTW/include && " - -dependencies = [ - ('fftw/3.3.4.5', EXTERNAL_MODULE), -] - -builddependencies = [('CMake', '3.2.2')] - -runtest = False - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-4.6.7-CrayIntel-2015.11-mpi.eb b/g/GROMACS/GROMACS-4.6.7-CrayIntel-2015.11-mpi.eb deleted file mode 100644 index a03812a7..00000000 --- a/g/GROMACS/GROMACS-4.6.7-CrayIntel-2015.11-mpi.eb +++ /dev/null @@ -1,45 +0,0 @@ -## -# This file is an EasyBuild reciPY as per https://github.com/hpcugent/easybuild -# -# Copyright:: Copyright 2012-2013 University of Luxembourg / LCSB, Cyprus Institute / CaSToRC, Ghent University -# Authors:: Wiktor Jurkowski , Fotis Georgatos , \ -# George Tsouloupas , Kenneth Hoste -# License:: MIT/GPL -# $Id$ -# -# This work implements a part of the HPCBIOS project and is a local_component of the policy: -# http://hpcbios.readthedocs.org/en/latest/HPCBIOS_2012-93.html -## -name = 'GROMACS' -version = '4.6.7' -versionsuffix = '-mpi' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. 
simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'CrayIntel', 'version': '2015.11'} -toolchainopts = {'usempi': True} - -# eg. ftp://ftp.gromacs.org/pub/gromacs/gromacs-4.6.tar.gz -source_urls = [ - 'ftp://ftp.gromacs.org/pub/gromacs/', # GROMACS sources - 'http://gerrit.gromacs.org/download/', # regression tests sources -] -sources = [ - SOURCELOWER_TAR_GZ, - 'regressiontests-%(version)s.tar.gz', -] - -preconfigopts = "export CMAKE_LIBRARY_PATH=$CMAKE_LIBRARY_PATH:$EBROOTFFTW/lib && " -preconfigopts += "export CMAKE_INCLUDE_PATH=$CMAKE_INCLUDE_PATH:$EBROOTFFTW/include && " - -dependencies = [ - ('fftw/3.3.4.5', EXTERNAL_MODULE), -] - -builddependencies = [('CMake', '3.2.2')] - -runtest = False - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-5.0.4-gompi-2015e-hybrid-single-PLUMED.eb b/g/GROMACS/GROMACS-5.0.4-gompi-2015e-hybrid-single-PLUMED.eb deleted file mode 100644 index 663369fe..00000000 --- a/g/GROMACS/GROMACS-5.0.4-gompi-2015e-hybrid-single-PLUMED.eb +++ /dev/null @@ -1,26 +0,0 @@ -name = 'GROMACS' -version = '5.0.4' -versionsuffix = '-hybrid-single-PLUMED' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'gompi', 'version': '2015e'} -toolchainopts = {'openmp': True, 'usempi': True} - -patches = ['gromacs-5.0.4-plumed-2.1.3.patch'] - -source_urls = ['ftp://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -builddependencies = [ - # ('CMake', '3.0.0'), # We are using system CMake - ('libxml2', '2.9.2') -] - -configopts = ' -DGMX_GPU=OFF -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2' - -dependencies = [('Boost', '1.58.0', '-Python-2.7.9')] - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single-PLUMED.eb b/g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single-PLUMED.eb deleted file mode 100644 index a455e6fe..00000000 --- a/g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single-PLUMED.eb +++ /dev/null @@ -1,26 +0,0 @@ -name = 'GROMACS' -version = '5.0.4' -versionsuffix = '-hybrid-single-PLUMED' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'ictce', 'version': '7.3.5'} -toolchainopts = {'openmp': True, 'usempi': True} - -patches = ['gromacs-5.0.4-plumed-2.1.3.patch'] - -source_urls = ['ftp://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -builddependencies = [ - ('CMake', '3.0.0'), - ('libxml2', '2.9.2') -] - -configopts = ' -DGMX_GPU=OFF -DGMX_DOUBLE=OFF' - -dependencies = [('Boost', '1.58.0', '-Python-2.7.9')] - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single.eb b/g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single.eb deleted file mode 100644 index fcce8863..00000000 --- a/g/GROMACS/GROMACS-5.0.4-ictce-7.3.5-hybrid-single.eb +++ /dev/null @@ -1,24 +0,0 @@ -name = 'GROMACS' -version = '5.0.4' -versionsuffix = '-hybrid-single' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. 
simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'ictce', 'version': '7.3.5'} -toolchainopts = {'openmp': True, 'usempi': True} - -source_urls = ['ftp://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -builddependencies = [ - ('CMake', '3.0.0'), - ('libxml2', '2.9.2') -] - -configopts = ' -DGMX_GPU=OFF -DGMX_DOUBLE=OFF' - -dependencies = [('Boost', '1.58.0', '-Python-2.7.9')] - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-5.1.2-goolf-1.7.20-mt.eb b/g/GROMACS/GROMACS-5.1.2-goolf-1.7.20-mt.eb deleted file mode 100644 index 7ce66b9c..00000000 --- a/g/GROMACS/GROMACS-5.1.2-goolf-1.7.20-mt.eb +++ /dev/null @@ -1,42 +0,0 @@ -## -# This file is an EasyBuild reciPY as per https://github.com/hpcugent/easybuild -# -# Copyright:: Copyright 2012-2013 University of Luxembourg / LCSB, Cyprus Institute / CaSToRC, Ghent University -# Authors:: Wiktor Jurkowski , Fotis Georgatos , \ -# George Tsouloupas , Kenneth Hoste -# License:: MIT/GPL -# $Id$ -# -# This work implements a part of the HPCBIOS project and is a local_component of the policy: -# http://hpcbios.readthedocs.org/en/latest/HPCBIOS_2012-93.html -## -name = 'GROMACS' -version = '5.1.2' -versionsuffix = '-mt' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'goolf', 'version': '1.7.20'} -toolchainopts = {'openmp': True, 'usempi': False} - -source_urls = [ - 'ftp://ftp.gromacs.org/pub/gromacs/', - 'http://gerrit.gromacs.org/download/', -] - -sources = [ - SOURCELOWER_TAR_GZ, - # seems to have disappeared? - # 'regressiontests-5.0.2.tar.gz', -] - -builddependencies = [ - ('CMake', '2.8.12'), - ('libxml2', '2.9.3') -] - -dependencies = [('Boost', '1.53.0')] - -moduleclass = 'bio' diff --git a/g/GROMACS/GROMACS-5.1.4-foss-2017a-hybrid-single-PLUMED.eb b/g/GROMACS/GROMACS-5.1.4-foss-2017a-hybrid-single-PLUMED.eb deleted file mode 100644 index 61f4a62c..00000000 --- a/g/GROMACS/GROMACS-5.1.4-foss-2017a-hybrid-single-PLUMED.eb +++ /dev/null @@ -1,38 +0,0 @@ -name = 'GROMACS' -version = '5.1.4' -versionsuffix = '-hybrid-single-PLUMED' - -homepage = 'http://www.gromacs.org' -description = """GROMACS is a versatile package to perform molecular dynamics, - i.e. 
simulate the Newtonian equations of motion for systems with hundreds to millions of particles.""" - -toolchain = {'name': 'foss', 'version': '2017a'} -toolchainopts = {'openmp': True, 'usempi': True} - -source_urls = ['ftp://ftp.gromacs.org/pub/gromacs/'] -sources = [SOURCELOWER_TAR_GZ] - -#patches = ['gromacs-%s-plumed-2.3.0-mpi.patch' % version] - -builddependencies = [ - ('CMake', '3.7.2', '', True), # We are using system CMake - ('libxml2', '2.9.2', '', True), -] - -preconfigopts = 'plumed patch -p -e gromacs-5.1.4 --shared &&' - -configopts = ' -DGMX_GPU=OFF -DGMX_DOUBLE=OFF -DGMX_SIMD=AVX2_256' - -dependencies = [ - ('Boost', '1.61.0', '-serial'), - ('almost', '2.1.0', '', ('foss', '2016a')), - ('libmatheval', '1.1.11'), - ('PLUMED', '2.3.0') -] - -sanity_check_paths = { - 'files': ['bin/gmx_mpi'], - 'dirs': [''], -} - -moduleclass = 'bio' diff --git a/g/GROMACS/gromacs-4.6.7-plumed-2.1.3-mpi.patch b/g/GROMACS/gromacs-4.6.7-plumed-2.1.3-mpi.patch deleted file mode 100644 index 6583388a..00000000 --- a/g/GROMACS/gromacs-4.6.7-plumed-2.1.3-mpi.patch +++ /dev/null @@ -1,9676 +0,0 @@ -diff --git a/Plumed.cmake b/Plumed.cmake -new file mode 100644 -index 0000000..01472f0 ---- /dev/null -+++ b/Plumed.cmake -@@ -0,0 +1,3 @@ -+# PLUMED: shared installation -+set(PLUMED_LOAD /apps/all/PLUMED/2.1.3-foss-2015g/lib/plumed///src/lib/libplumed.so -ldl ) -+set(PLUMED_DEPENDENCIES /apps/all/PLUMED/2.1.3-foss-2015g/lib/plumed///src/lib/libplumed.so) -diff --git a/Plumed.h b/Plumed.h -new file mode 100644 -index 0000000..16da74a ---- /dev/null -+++ b/Plumed.h -@@ -0,0 +1,494 @@ -+/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -+ Copyright (c) 2011-2014 The plumed team -+ (see the PEOPLE file at the root of the distribution for a list of names) -+ -+ See http://www.plumed-code.org for more information. -+ -+ This file is part of plumed, version 2. -+ -+ plumed is free software: you can redistribute it and/or modify -+ it under the terms of the GNU Lesser General Public License as published by -+ the Free Software Foundation, either version 3 of the License, or -+ (at your option) any later version. -+ -+ plumed is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public License -+ along with plumed. If not, see . -++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ -+#ifndef __PLUMED_wrapper_Plumed_h -+#define __PLUMED_wrapper_Plumed_h -+ -+/** -+\page ReferencePlumedH Reference for interfacing MD codes with PLUMED -+ -+ Plumed.h and Plumed.c contain the external plumed interface, which is used to -+ integrate it with MD engines. This interface is very general, and is expected -+ not to change across plumed versions. Plumed.c also implements a dummy version -+ of the interface, so as to allow a code to be fully linked even if the plumed -+ library is not available yet. These files could be directly included in the official -+ host MD distribution. In this manner, it will be sufficient to link the plumed -+ library at link time (on all systems) or directly at runtime (on system where -+ dynamic loading is enabled) to include plumed features. -+ -+ Why is Plumed.c written in C and not C++? 
The reason is that the resulting Plumed.o -+ needs to be linked with the host MD code immediately (whereas the rest of plumed -+ could be linked a posteriori). Imagine the MD code is written in FORTRAN: when we -+ link the Plumed.o file we would like not to need any C++ library linked. In this -+ manner, we do not need to know which C++ compiler will be used to compile plumed. -+ The C++ library is only linked to the "rest" of plumed, which actually use it. -+ Anyway, Plumed.c is written in such a manner to allow its compilation also in C++ -+ (C++ is a bit stricter than C; compatibility is checked when PlumedStatic.cpp, -+ which basically includes Plumed.c, is compiled with the C++ compiler). This will -+ allow e.g. MD codes written in C++ to just incorporate Plumed.c (maybe renamed into -+ Plumed.cpp), without the need of configuring a plain C compiler. -+ -+ Plumed interface can be used from C, C++ and FORTRAN. Everything concerning plumed -+ is hidden inside a single object type, which is described in C by a structure -+ (struct \ref plumed), in C++ by a class (PLMD::Plumed) and in FORTRAN by a -+ fixed-length string (CHARACTER(LEN=32)). Obviously C++ can use both struct -+ and class interfaces, but the first should be preferred. The reference interface -+ is the C one, whereas FORTRAN and C++ interfaces are implemented as wrappers -+ around it. -+ -+ In the C++ interface, all the routines are implemented as methods of PLMD::Plumed. -+ In the C and FORTRAN interfaces, all the routines are named plumed_*, to -+ avoid potential name clashes. Notice that the entire plumed library -+ is implemented in C++, and it is hidden inside the PLMD namespace. -+ -+ Handlers to the plumed object can be converted among different representations, -+ to allow inter-operability among languages. In C, there are tools to convert -+ to/from FORTRAN, whereas in C++ there are tools to convert to/from FORTRAN and C. -+ -+ These handlers only contain a pointer to the real structure, so that -+ when a plumed object is brought from one language to another, -+ it brings a reference to the same environment. -+ -+ Moreover, to simplify life in all cases where a single Plumed object is -+ required for the entire simulation (which covers most of the practical -+ applications with conventional MD codes) it is possible to take advantage -+ of a global interface, which is implicitly referring to a unique global instance. -+ The global object should still be initialized and finalized properly. 
-+ -+ The basic method to send a message to plumed is -+\verbatim -+ (C) plumed_cmd -+ (C++) PLMD::Plumed::cmd -+ (FORTRAN) PLUMED_F_CMD -+\endverbatim -+ -+ To initialize a plumed object, use: -+\verbatim -+ (C) plumed_create -+ (C++) (constructor of PLMD::Plumed) -+ (FORTRAN) PLUMED_F_CREATE -+\endverbatim -+ -+ To finalize it, use -+\verbatim -+ (C) plumed_finalize -+ (C++) (destructor of PLMD::Plumed) -+ (FORTRAN) PLUMED_F_FINALIZE -+\endverbatim -+ -+ To access to the global-object, use -+\verbatim -+ (C) plumed_gcreate, plumed_gfinalize, plumed_gcmd -+ (C++) PLMD::Plumed::gcreate, PLMD::Plumed::gfinalize, PLMD::Plumed::gcmd -+ (FORTRAN) PLUMED_F_GCREATE, PLUMED_F_GFINALIZE, PLUMED_F_GCMD -+\endverbatim -+ -+ To check if the global object has been initialized, use -+\verbatim -+ (C) plumed_ginitialized -+ (C++) PLMD::Plumed::ginitialized -+ (FORTRAN) PLUMED_F_GINITIALIZED -+\endverbatim -+ -+ To check if plumed library is available (this is useful for runtime linking), use -+\verbatim -+ (C) plumed_installed -+ (C++) PLMD::Plumed::installed -+ (FORTRAN) PLUMED_F_INSTALLED -+\endverbatim -+ -+ To convert handlers use -+\verbatim -+ (C) plumed_c2f (C to FORTRAN) -+ (C) plumed_f2c (FORTRAN to C) -+ (C++) Plumed(plumed) constructor (C to C++) -+ (C++) operator plumed() cast (C++ to C) -+ (C++) Plumed(char*) constructor (FORTRAN to C++) -+ (C++) toFortran(char*) (C++ to FORTRAN) -+\endverbatim -+ -+\verbatim -+ FORTRAN interface -+ SUBROUTINE PLUMED_F_INSTALLED(i) -+ INTEGER, INTENT(OUT) :: i -+ SUBROUTINE PLUMED_F_GINITIALIZED(i) -+ INTEGER, INTENT(OUT) :: i -+ SUBROUTINE PLUMED_F_GCREATE() -+ SUBROUTINE PLUMED_F_GCMD(key,val) -+ CHARACTER(LEN=*), INTENT(IN) :: key -+ UNSPECIFIED_TYPE, INTENT(INOUT) :: val(*) -+ SUBROUTINE PLUMED_F_GFINALIZE() -+ SUBROUTINE PLUMED_F_GLOBAL(p) -+ CHARACTER(LEN=32), INTENT(OUT) :: p -+ SUBROUTINE PLUMED_F_CREATE(p) -+ CHARACTER(LEN=32), INTENT(OUT) :: p -+ SUBROUTINE PLUMED_F_CMD(p,key,val) -+ CHARACTER(LEN=32), INTENT(IN) :: p -+ CHARACTER(LEN=*), INTENT(IN) :: key -+ UNSPECIFIED_TYPE, INTENT(INOUT) :: val(*) -+ SUBROUTINE PLUMED_F_FINALIZE(p) -+ CHARACTER(LEN=32), INTENT(IN) :: p -+\endverbatim -+ -+ The main routine is "cmd", which accepts two arguments: -+ key is a string containing the name of the command -+ val is the argument. it is declared const so as to use allow passing const objects, but in practice plumed -+ is going to modify val in several cases (using a const_cast). -+ In some cases val can be omitted: just pass a NULL pointer (in C++, val is optional and can be omitted). -+ The set of possible keys is the real API of the plumed library, and will be expanded with time. -+ New commands will be added, but backward compatibility will be retained as long as possible. -+ -+ To pass plumed a callback function use the following syntax (not available in FORTRAN yet) -+\verbatim -+ plumed_function_holder ff; -+ ff.p=your_function; -+ plumed_cmd(plumed,"xxxx",&ff); -+\endverbatim -+ (this is passing the your_function() function to the "xxxx" command) -+*/ -+ -+#ifdef __cplusplus -+ extern "C" { -+#endif -+ -+/* Generic function pointer */ -+typedef void (*plumed_function_pointer)(void); -+ -+/** -+ \brief Holder for function pointer. 
-+ -+ To pass plumed a callback function use the following syntax: -+\verbatim -+ plumed_function_holder ff; -+ ff.p=your_function; -+ plumed_cmd(plumed,"xxxx",&ff); -+\endverbatim -+ (this is going to pass the your_function() function to the "xxxx" command) -+*/ -+ -+typedef struct { -+ plumed_function_pointer p; -+} plumed_function_holder; -+ -+/** -+ \brief Main plumed object -+ -+ This is an object containing a Plumed instance, which should be used in -+ the MD engine. It should first be initialized with plumed_create(), -+ then it communicates with the MD engine using plumed_cmd(). Finally, -+ before the termination, it should be deallocated with plumed_finalize(). -+ Its interface is very simple and general, and is expected -+ not to change across plumed versions. See \ref ReferencePlumedH. -+*/ -+typedef struct { -+/** -+ \private -+ \brief Void pointer holding the real PlumedMain structure -+*/ -+ void*p; -+} plumed; -+ -+/** \relates plumed -+ \brief Constructor -+ -+ \return The constructed plumed object -+*/ -+plumed plumed_create(void); -+ -+/** \relates plumed -+ \brief Tells p to execute a command -+ -+ \param p The plumed object on which command is acting -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like plumed_cmd(p,"A","B"), -+ but for some choice of key it can change the content -+*/ -+void plumed_cmd(plumed p,const char*key,const void*val); -+ -+/** \relates plumed -+ \brief Destructor -+ -+ \param p The plumed object to be deallocated -+*/ -+void plumed_finalize(plumed p); -+ -+/** \relates plumed -+ \brief Check if plumed is installed (for runtime binding) -+ -+ \return 1 if plumed is installed, to 0 otherwise -+*/ -+int plumed_installed(void); -+ -+/** \relates plumed -+ \brief Retrieves an handler to the global structure. -+*/ -+plumed plumed_global(void); -+ -+/** \relates plumed -+ \brief Check if the global interface has been initialized -+ -+ \return 1 if plumed has been initialized, 0 otherwise -+*/ -+int plumed_ginitialized(void); -+ -+/* global C interface, working on a global object */ -+ -+/** \relates plumed -+ \brief Constructor for the global interface. -+ -+ \note Equivalent to plumed_create(), but initialize a static global plumed object -+*/ -+void plumed_gcreate(void); -+ -+/** \relates plumed -+ \brief Tells to the global interface to execute a command. -+ -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like plumed_gcmd("A","B"), -+ but for some choice of key it can change the content -+ -+ \note Equivalent to plumed_cmd(), but skipping the plumed argument -+*/ -+void plumed_gcmd(const char* key,const void* val); -+ -+/** \relates plumed -+ \brief Destructor for the global interface. 
-+ -+ \note Equivalent to plumed_finalize(), but skipping the plumed argument -+*/ -+void plumed_gfinalize(void); -+ -+/* routines to convert char handler from/to plumed objects */ -+ -+/** \related plumed -+ \brief Converts a C handler to a FORTRAN handler -+ -+ \param p The C handler -+ \param c The FORTRAN handler (a char[32]) -+*/ -+void plumed_c2f(plumed p,char* c); -+ -+/** \related plumed -+ \brief Converts a FORTRAN handler to a C handler -+ \param c The FORTRAN handler (a char[32]) -+ \return The C handler -+*/ -+plumed plumed_f2c(const char* c); -+ -+#ifdef __cplusplus -+ } -+#endif -+ -+#ifdef __cplusplus -+ -+/* this is to include the NULL pointer */ -+#include -+ -+/* C++ interface is hidden in PLMD namespace (same as plumed library) */ -+namespace PLMD { -+ -+/** -+ C++ wrapper for \ref plumed. -+ -+ This class provides a C++ interface to PLUMED. -+*/ -+ -+class Plumed{ -+ plumed main; -+/** -+ keeps track if the object was created from scratch using -+ the defaults destructor (cloned=false) or if it was imported -+ from C or FORTRAN (cloned-true). In the latter case, the -+ plumed_finalize() method is not called when destructing the object, -+ since it is expected to be finalized in the C/FORTRAN code -+*/ -+ bool cloned; -+public: -+/** -+ Check if plumed is installed (for runtime binding) -+ \return true if plumed is installed, false otherwise -+*/ -+ static bool installed(); -+/** -+ Check if global-plumed has been initialized -+ \return true if global plumed object (see global()) is initialized (i.e. if gcreate() has been -+ called), false otherwise. -+*/ -+ static bool ginitialized(); -+/** -+ Initialize global-plumed -+*/ -+ static void gcreate(); -+/** -+ Send a command to global-plumed -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like gcmd("A","B"), -+ but for some choice of key it can change the content -+*/ -+ static void gcmd(const char* key,const void* val); -+/** -+ Finalize global-plumed -+*/ -+ static void gfinalize(); -+/** -+ Returns the Plumed global object -+ \return The Plumed global object -+*/ -+ static Plumed global(); -+/** -+ Constructor -+*/ -+ Plumed(); -+/** -+ Clone a Plumed object from a FORTRAN char* handler -+ \param c The FORTRAN handler (a char[32]). -+ -+ \attention The Plumed object created in this manner -+ will not finalize the corresponding plumed structure. -+ It is expected that the FORTRAN code calls plumed_c_finalize for it -+*/ -+ Plumed(const char*c); -+/** -+ Clone a Plumed object from a C plumed structure -+ \param p The C plumed structure. -+ -+ \attention The Plumed object created in this manner -+ will not finalize the corresponding plumed structure. -+ It is expected that the C code calls plumed_finalize for it -+*/ -+ Plumed(plumed p); -+private: -+/** Copy constructor is disabled (private and unimplemented) -+ The problem here is that after copying it will not be clear who is -+ going to finalize the corresponding plumed structure. -+*/ -+ Plumed(const Plumed&); -+/** Assignment operator is disabled (private and unimplemented) -+ The problem here is that after copying it will not be clear who is -+ going to finalize the corresponding plumed structure. -+*/ -+ Plumed&operator=(const Plumed&); -+public: -+/** -+ Retrieve the C plumed structure for this object -+*/ -+ operator plumed()const; -+/** -+ Retrieve a FORTRAN handler for this object -+ \param c The FORTRAN handler (a char[32]). 
-+*/ -+ void toFortran(char*c)const; -+/** -+ Send a command to this plumed object -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like p.cmd("A","B"), -+ but for some choice of key it can change the content -+*/ -+ void cmd(const char*key,const void*val=NULL); -+/** -+ Destructor -+ -+ Destructor is virtual so as to allow correct inheritance from Plumed object. -+ To avoid linking problems with g++, I specify "inline" also here (in principle -+ it should be enough to specify it down in the definition of the function, but -+ for some reason that I do not understand g++ does not inline it properly in that -+ case and complains when Plumed.h is included but Plumed.o is not linked. Anyway, the -+ way it is done here seems to work properly). -+*/ -+ inline virtual ~Plumed(); -+}; -+ -+/* All methods are inlined so as to avoid the compilation of an extra c++ file */ -+ -+inline -+bool Plumed::installed(){ -+ return plumed_installed(); -+} -+ -+inline -+Plumed::Plumed(): -+ main(plumed_create()), -+ cloned(false) -+{} -+ -+inline -+Plumed::Plumed(const char*c): -+ main(plumed_f2c(c)), -+ cloned(true) -+{} -+ -+inline -+Plumed::Plumed(plumed p): -+ main(p), -+ cloned(true) -+{} -+ -+inline -+Plumed::operator plumed()const{ -+ return main; -+} -+ -+inline -+void Plumed::toFortran(char*c)const{ -+ plumed_c2f(main,c); -+} -+ -+inline -+void Plumed::cmd(const char*key,const void*val){ -+ plumed_cmd(main,key,val); -+} -+ -+inline -+Plumed::~Plumed(){ -+ if(!cloned)plumed_finalize(main); -+} -+ -+inline -+bool Plumed::ginitialized(){ -+ return plumed_ginitialized(); -+} -+ -+inline -+void Plumed::gcreate(){ -+ plumed_gcreate(); -+} -+ -+inline -+void Plumed::gcmd(const char* key,const void* val){ -+ plumed_gcmd(key,val); -+} -+ -+inline -+void Plumed::gfinalize(){ -+ plumed_gfinalize(); -+} -+ -+inline -+Plumed Plumed::global(){ -+ return plumed_global(); -+} -+ -+} -+ -+#endif -+ -+ -+#endif -diff --git a/Plumed.inc b/Plumed.inc -new file mode 100644 -index 0000000..e1e29a7 ---- /dev/null -+++ b/Plumed.inc -@@ -0,0 +1,3 @@ -+# PLUMED: shared installation -+PLUMED_LOAD= /apps/all/PLUMED/2.1.3-foss-2015g/lib/plumed///src/lib/libplumed.so -ldl -+PLUMED_DEPENDENCIES= /apps/all/PLUMED/2.1.3-foss-2015g/lib/plumed///src/lib/libplumed.so -diff --git a/src/kernel/CMakeLists.txt b/src/kernel/CMakeLists.txt -index fea8282..8e108b3 100644 ---- a/src/kernel/CMakeLists.txt -+++ b/src/kernel/CMakeLists.txt -@@ -33,6 +33,8 @@ - # the research papers on the package. Check out http://www.gromacs.org. - # - -+include(${CMAKE_SOURCE_DIR}/Plumed.cmake) -+ - set(GMXPREPROCESS_SOURCES - add_par.c - calc_verletbuf.c -@@ -123,7 +125,7 @@ endforeach() - - add_executable(mdrun ${MDRUN_SOURCES} main.c) - gmx_add_man_page(mdrun) --target_link_libraries(mdrun gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS}) -+target_link_libraries(mdrun gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS} ${PLUMED_LOAD}) - set_target_properties(mdrun PROPERTIES OUTPUT_NAME "mdrun${GMX_BINARY_SUFFIX}" COMPILE_FLAGS "${OpenMP_C_FLAGS}") - - if(GMX_OPENMM) -diff --git a/src/kernel/CMakeLists.txt.preplumed b/src/kernel/CMakeLists.txt.preplumed -new file mode 100644 -index 0000000..fea8282 ---- /dev/null -+++ b/src/kernel/CMakeLists.txt.preplumed -@@ -0,0 +1,195 @@ -+# -+# This file is part of the GROMACS molecular simulation package. 
-+# -+# Copyright (c) 2012,2013, by the GROMACS development team, led by -+# David van der Spoel, Berk Hess, Erik Lindahl, and including many -+# others, as listed in the AUTHORS file in the top-level source -+# directory and at http://www.gromacs.org. -+# -+# GROMACS is free software; you can redistribute it and/or -+# modify it under the terms of the GNU Lesser General Public License -+# as published by the Free Software Foundation; either version 2.1 -+# of the License, or (at your option) any later version. -+# -+# GROMACS is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+# Lesser General Public License for more details. -+# -+# You should have received a copy of the GNU Lesser General Public -+# License along with GROMACS; if not, see -+# http://www.gnu.org/licenses, or write to the Free Software Foundation, -+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+# -+# If you want to redistribute modifications to GROMACS, please -+# consider that scientific software is very special. Version -+# control is crucial - bugs must be traceable. We will be happy to -+# consider code for inclusion in the official distribution, but -+# derived work must not be called official GROMACS. Details are found -+# in the README & COPYING files - if they are missing, get the -+# official version at http://www.gromacs.org. -+# -+# To help us fund GROMACS development, we humbly ask that you cite -+# the research papers on the package. Check out http://www.gromacs.org. -+# -+ -+set(GMXPREPROCESS_SOURCES -+ add_par.c -+ calc_verletbuf.c -+ compute_io.c -+ convparm.c -+ gen_ad.c -+ gen_vsite.c -+ genhydro.c -+ gpp_atomtype.c -+ gpp_bond_atomtype.c -+ h_db.c -+ hackblock.c -+ hizzie.c -+ nm2type.c -+ pdb2top.c -+ pgutil.c -+ readir.c -+ readpull.c -+ readadress.c -+ readrot.c -+ resall.c -+ sorting.c -+ specbond.c -+ ter_db.c -+ tomorse.c -+ topdirs.c -+ topexcl.c -+ topio.c -+ toppush.c -+ topshake.c -+ toputil.c -+ tpbcmp.c -+ vsite_parm.c -+ fflibutil.c -+ xlate.c) -+ -+set(MDRUN_SOURCES -+ gctio.c ionize.c runner.c -+ do_gct.c repl_ex.c xutils.c pme_loadbal.c -+ md.c mdrun.c genalg.c membed.c) -+ -+add_library(gmxpreprocess ${GMXPREPROCESS_SOURCES}) -+target_link_libraries(gmxpreprocess md) -+set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} -+ COMPILE_FLAGS "${OpenMP_C_FLAGS}") -+ -+ -+if(GMX_GPU) -+ include_directories(${CMAKE_SOURCE_DIR}/src/gmxlib/gpu_utils) -+endif() -+ -+if(GMX_OPENMM) -+ # Even though the OpenMM build has "moved to contrib", many things -+ # have be be done from within the scope of the CMakeLists.txt that -+ # builds its mdrun, and that is here -+ list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/src/contrib) -+ find_package(OpenMM) -+ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -+ include(${CMAKE_SOURCE_DIR}/src/contrib/BuildMdrunOpenMM.cmake) -+endif(GMX_OPENMM) -+ -+if(GMX_GPU OR GMX_FORCE_CXX) -+ set_source_files_properties(main.c PROPERTIES LANGUAGE CXX) -+ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") -+ set_source_files_properties(main.c PROPERTIES COMPILE_FLAGS "-x c++") -+ endif() -+endif() -+ -+if(GMX_FAHCORE) -+ add_library(fahcore ${MDRUN_SOURCES}) -+else(GMX_FAHCORE) -+ -+set(GMX_KERNEL_PROGRAMS -+ grompp tpbconv pdb2gmx g_protonate gmxdump g_x2top gmxcheck) -+if (NOT GMX_NO_QUOTES) -+ set(GMX_KERNEL_PROGRAMS ${GMX_KERNEL_PROGRAMS} g_luck) 
-+endif (NOT GMX_NO_QUOTES) -+ -+ -+foreach(PROGRAM ${GMX_KERNEL_PROGRAMS}) -+ add_executable(${PROGRAM} ${PROGRAM}.c main.c) -+ if (NOT ${PROGRAM} STREQUAL "g_luck") -+ gmx_add_man_page(${PROGRAM}) -+ endif() -+ target_link_libraries(${PROGRAM} gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS}) -+ set_target_properties(${PROGRAM} PROPERTIES OUTPUT_NAME "${PROGRAM}${GMX_BINARY_SUFFIX}") -+endforeach() -+ -+add_executable(mdrun ${MDRUN_SOURCES} main.c) -+gmx_add_man_page(mdrun) -+target_link_libraries(mdrun gmxpreprocess md gmx ${OpenMP_LINKER_FLAGS}) -+set_target_properties(mdrun PROPERTIES OUTPUT_NAME "mdrun${GMX_BINARY_SUFFIX}" COMPILE_FLAGS "${OpenMP_C_FLAGS}") -+ -+if(GMX_OPENMM) -+ target_link_libraries(mdrun openmm_api_wrapper) -+endif() -+ -+# Construct component groups for installation; note that a component may -+# belong to only one group -+foreach(PROGRAM ${GMX_KERNEL_PROGRAMS}) -+ set(CPACK_COMPONENT_${PROGRAM}_GROUP tools) -+endforeach() -+set(CPACK_COMPONENT_MDRUN_GROUP mdrun) -+ -+foreach(PROGRAM ${GMX_KERNEL_PROGRAMS} mdrun) -+ # Manage CPack component dependencies -+ set(CPACK_COMPONENT_${PROGRAM}_DEPENDS libraries libraries-gmxpreprocess) -+ -+ # Create custom install-xxxx target -+ if (BUILD_SHARED_LIBS) -+ # If shared libraries are used, we need to install the libraries in -+ # addition to the mdrun binary. -+ add_custom_target(install-${PROGRAM} -+ COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries -+ -P ${CMAKE_BINARY_DIR}/cmake_install.cmake -+ COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries-gmxpreprocess -+ -P ${CMAKE_BINARY_DIR}/cmake_install.cmake -+ COMMAND ${CMAKE_COMMAND} -DCOMPONENT=${PROGRAM} -+ -P ${CMAKE_BINARY_DIR}/cmake_install.cmake -+ COMMENT "Installing ${PROGRAM}") -+ else() -+ add_custom_target(install-${PROGRAM} -+ COMMAND ${CMAKE_COMMAND} -DCOMPONENT=${PROGRAM} -+ -P ${CMAKE_BINARY_DIR}/cmake_install.cmake -+ COMMENT "Installing ${PROGRAM}") -+ endif() -+ add_dependencies(install-${PROGRAM} ${PROGRAM}) -+ -+ # Finally, trigger installation -+ install( -+ TARGETS ${PROGRAM} -+ COMPONENT ${PROGRAM} -+ DESTINATION ${BIN_INSTALL_DIR} -+ ) -+endforeach() -+ -+install(TARGETS gmxpreprocess DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries-gmxpreprocess) -+ -+if (INSTALL_CUDART_LIB) #can be set manual by user -+ if (GMX_GPU) -+ foreach(CUDA_LIB ${CUDA_LIBRARIES}) -+ string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB}) -+ if(IS_CUDART) #libcuda should not be installed -+ #install also name-links (linker uses those) -+ file(GLOB CUDA_LIBS ${CUDA_LIB}*) -+ install(FILES ${CUDA_LIBS} DESTINATION -+ ${LIB_INSTALL_DIR} COMPONENT libraries) -+ endif() -+ endforeach() -+ else() -+ message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU") -+ endif() -+endif () -+endif(GMX_FAHCORE) -+ -+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgmxpreprocess.pc.cmakein ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc @ONLY) -+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgmxpreprocess.pc -+ DESTINATION ${LIB_INSTALL_DIR}/pkgconfig -+ RENAME "libgmxpreprocess${GMX_LIBS_SUFFIX}.pc" -+ COMPONENT development) -diff --git a/src/kernel/md.c b/src/kernel/md.c -index 4c4a88c..b4b8c51 100644 ---- a/src/kernel/md.c -+++ b/src/kernel/md.c -@@ -93,6 +93,12 @@ - #include "types/iteratedconstraints.h" - #include "nbnxn_cuda_data_mgmt.h" - -+/* PLUMED */ -+#include "../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+/* END PLUMED */ -+ - #ifdef GMX_LIB_MPI - #include - #endif -@@ -236,6 +242,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - 
double cycles_pmes; - gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE; - -+/* PLUMED */ -+ int plumedNeedsEnergy=0; -+ int plumedWantsToStop=0; -+ matrix plumed_vir; -+/* END PLUMED */ -+ - #ifdef GMX_FAHCORE - /* Temporary addition for FAHCORE checkpointing */ - int chkpt_ret; -@@ -732,6 +744,53 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - fprintf(fplog, "\n"); - } - -+ /* PLUMED */ -+ if(plumedswitch){ -+ /* detect plumed API version */ -+ int pversion=0; -+ plumed_cmd(plumedmain,"getApiVersion",&pversion); -+ /* setting kbT is only implemented with api>1) */ -+ real kbT=ir->opts.ref_t[0]*BOLTZ; -+ if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT); -+ -+ if(cr->ms && cr->ms->nsim>1) { -+ if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all); -+ }else{ -+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); -+ } -+ } -+ plumed_cmd(plumedmain,"GREX init",NULL); -+ } -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); -+ }else{ -+ plumed_cmd(plumedmain,"setMPIComm",&cr->mpi_comm_mysim); -+ } -+ } -+ plumed_cmd(plumedmain,"setNatoms",&top_global->natoms); -+ plumed_cmd(plumedmain,"setMDEngine","gromacs"); -+ plumed_cmd(plumedmain,"setLog",fplog); -+ real real_delta_t; -+ real_delta_t=ir->delta_t; -+ plumed_cmd(plumedmain,"setTimestep",&real_delta_t); -+ plumed_cmd(plumedmain,"init",NULL); -+ -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ }else{ -+ plumed_cmd(plumedmain,"setAtomsNlocal",&mdatoms->homenr); -+ plumed_cmd(plumedmain,"setAtomsContiguous",&mdatoms->start); -+ } -+ } -+ } -+ /* END PLUMED */ -+ - print_start(fplog, cr, runtime, "mdrun"); - runtime_start(runtime); - wallcycle_start(wcycle, ewcRUN); -@@ -1044,6 +1103,13 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - do_verbose && !bPMETuneRunning); - wallcycle_stop(wcycle, ewcDOMDEC); - /* If using an iterative integrator, reallocate space to match the decomposition */ -+ -+ /* PLUMED */ -+ if(plumedswitch){ -+ plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ /* END PLUMED */ - } - } - -@@ -1189,12 +1255,45 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - * This is parallellized as well, and does communication too. 
- * Check comments in sim_util.c - */ -+ -+ /* PLUMED */ -+ plumedNeedsEnergy=0; -+ if(plumedswitch){ -+ long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep); -+ plumed_cmd(plumedmain,"setPositions",&state->x[mdatoms->start][0]); -+ plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[mdatoms->start]); -+ plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[mdatoms->start]); -+ plumed_cmd(plumedmain,"setBox",&state->box[0][0]); -+ plumed_cmd(plumedmain,"prepareCalc",NULL); -+ plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop); -+ plumed_cmd(plumedmain,"setForces",&f[mdatoms->start][0]); -+ plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ clear_mat(plumed_vir); -+ plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]); -+ } -+ /* END PLUMED */ - do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups, - state->box, state->x, &state->hist, - f, force_vir, mdatoms, enerd, fcd, - state->lambda, graph, - fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii, -- (bNS ? GMX_FORCE_NS : 0) | force_flags); -+ (plumedNeedsEnergy? GMX_FORCE_ENERGY : 0) |(bNS ? GMX_FORCE_NS : 0) | force_flags); -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(plumedNeedsEnergy){ -+ msmul(force_vir,2.0,plumed_vir); -+ plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]); -+ plumed_cmd(plumedmain,"performCalc",NULL); -+ msmul(plumed_vir,0.5,force_vir); -+ } else { -+ msmul(plumed_vir,0.5,plumed_vir); -+ m_add(force_vir,plumed_vir,force_vir); -+ } -+ if ((repl_ex_nst > 0) && (step > 0) && !bLastStep && -+ do_per_step(step,repl_ex_nst)) plumed_cmd(plumedmain,"GREX savePositions",NULL); -+ if(plumedWantsToStop) ir->nsteps=step_rel+1; -+ } -+ /* END PLUMED */ - } - - GMX_BARRIER(cr->mpi_comm_mygroup); -diff --git a/src/kernel/md.c.preplumed b/src/kernel/md.c.preplumed -new file mode 100644 -index 0000000..4c4a88c ---- /dev/null -+++ b/src/kernel/md.c.preplumed -@@ -0,0 +1,2283 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team, -+ * check out http://www.gromacs.org for more information. -+ * Copyright (c) 2012,2013, by the GROMACS development team, led by -+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many -+ * others, as listed in the AUTHORS file in the top-level source -+ * directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. 
We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include "typedefs.h" -+#include "smalloc.h" -+#include "sysstuff.h" -+#include "vec.h" -+#include "statutil.h" -+#include "vcm.h" -+#include "mdebin.h" -+#include "nrnb.h" -+#include "calcmu.h" -+#include "index.h" -+#include "vsite.h" -+#include "update.h" -+#include "ns.h" -+#include "trnio.h" -+#include "xtcio.h" -+#include "mdrun.h" -+#include "md_support.h" -+#include "md_logging.h" -+#include "confio.h" -+#include "network.h" -+#include "pull.h" -+#include "xvgr.h" -+#include "physics.h" -+#include "names.h" -+#include "xmdrun.h" -+#include "ionize.h" -+#include "disre.h" -+#include "orires.h" -+#include "pme.h" -+#include "mdatoms.h" -+#include "repl_ex.h" -+#include "qmmm.h" -+#include "mpelogging.h" -+#include "domdec.h" -+#include "domdec_network.h" -+#include "partdec.h" -+#include "topsort.h" -+#include "coulomb.h" -+#include "constr.h" -+#include "shellfc.h" -+#include "compute_io.h" -+#include "mvdata.h" -+#include "checkpoint.h" -+#include "mtop_util.h" -+#include "sighandler.h" -+#include "txtdump.h" -+#include "string2.h" -+#include "pme_loadbal.h" -+#include "bondf.h" -+#include "membed.h" -+#include "types/nlistheuristics.h" -+#include "types/iteratedconstraints.h" -+#include "nbnxn_cuda_data_mgmt.h" -+ -+#ifdef GMX_LIB_MPI -+#include -+#endif -+#ifdef GMX_THREAD_MPI -+#include "tmpi.h" -+#endif -+ -+#ifdef GMX_FAHCORE -+#include "corewrap.h" -+#endif -+ -+static void reset_all_counters(FILE *fplog, t_commrec *cr, -+ gmx_large_int_t step, -+ gmx_large_int_t *step_rel, t_inputrec *ir, -+ gmx_wallcycle_t wcycle, t_nrnb *nrnb, -+ gmx_runtime_t *runtime, -+ nbnxn_cuda_ptr_t cu_nbv) -+{ -+ char sbuf[STEPSTRSIZE]; -+ -+ /* Reset all the counters related to performance over the run */ -+ md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n", -+ gmx_step_str(step, sbuf)); -+ -+ if (cu_nbv) -+ { -+ nbnxn_cuda_reset_timings(cu_nbv); -+ } -+ -+ wallcycle_stop(wcycle, ewcRUN); -+ wallcycle_reset_all(wcycle); -+ if (DOMAINDECOMP(cr)) -+ { -+ reset_dd_statistics_counters(cr->dd); -+ } -+ init_nrnb(nrnb); -+ ir->init_step += *step_rel; -+ ir->nsteps -= *step_rel; -+ *step_rel = 0; -+ wallcycle_start(wcycle, ewcRUN); -+ runtime_start(runtime); -+ print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime); -+} -+ -+double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], -+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, -+ int nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int stepout, t_inputrec *ir, -+ gmx_mtop_t *top_global, -+ t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t ed, t_forcerec *fr, -+ int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed, -+ real cpt_period, real max_hours, -+ const char *deviceOptions, -+ unsigned long Flags, -+ gmx_runtime_t *runtime) -+{ -+ gmx_mdoutf_t *outf; -+ gmx_large_int_t step, step_rel; -+ double run_time; -+ double t, t0, lam0[efptNR]; -+ gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEner; -+ gmx_bool bNS, 
bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE, -+ bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep, -+ bBornRadii, bStartingFromCpt; -+ gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; -+ gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE, -+ bForceUpdate = FALSE, bCPT; -+ int mdof_flags; -+ gmx_bool bMasterState; -+ int force_flags, cglo_flags; -+ tensor force_vir, shake_vir, total_vir, tmp_vir, pres; -+ int i, m; -+ t_trxstatus *status; -+ rvec mu_tot; -+ t_vcm *vcm; -+ t_state *bufstate = NULL; -+ matrix *scale_tot, pcoupl_mu, M, ebox; -+ gmx_nlheur_t nlh; -+ t_trxframe rerun_fr; -+ gmx_repl_ex_t repl_ex = NULL; -+ int nchkpt = 1; -+ gmx_localtop_t *top; -+ t_mdebin *mdebin = NULL; -+ t_state *state = NULL; -+ rvec *f_global = NULL; -+ int n_xtc = -1; -+ rvec *x_xtc = NULL; -+ gmx_enerdata_t *enerd; -+ rvec *f = NULL; -+ gmx_global_stat_t gstat; -+ gmx_update_t upd = NULL; -+ t_graph *graph = NULL; -+ globsig_t gs; -+ gmx_rng_t mcrng = NULL; -+ gmx_bool bFFscan; -+ gmx_groups_t *groups; -+ gmx_ekindata_t *ekind, *ekind_save; -+ gmx_shellfc_t shellfc; -+ int count, nconverged = 0; -+ real timestep = 0; -+ double tcount = 0; -+ gmx_bool bIonize = FALSE; -+ gmx_bool bTCR = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged; -+ gmx_bool bAppend; -+ gmx_bool bResetCountersHalfMaxH = FALSE; -+ gmx_bool bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter; -+ gmx_bool bUpdateDoLR; -+ real mu_aver = 0, dvdl_constr; -+ int a0, a1, gnx = 0, ii; -+ atom_id *grpindex = NULL; -+ char *grpname; -+ t_coupl_rec *tcr = NULL; -+ rvec *xcopy = NULL, *vcopy = NULL, *cbuf = NULL; -+ matrix boxcopy = {{0}}, lastbox; -+ tensor tmpvir; -+ real fom, oldfom, veta_save, pcurr, scalevir, tracevir; -+ real vetanew = 0; -+ int lamnew = 0; -+ /* for FEP */ -+ int nstfep; -+ real rate; -+ double cycles; -+ real saved_conserved_quantity = 0; -+ real last_ekin = 0; -+ int iter_i; -+ t_extmass MassQ; -+ int **trotter_seq; -+ char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; -+ int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/ -+ gmx_iterate_t iterate; -+ gmx_large_int_t multisim_nsteps = -1; /* number of steps to do before first multisim -+ simulation stops. If equal to zero, don't -+ communicate any more between multisims.*/ -+ /* PME load balancing data for GPU kernels */ -+ pme_load_balancing_t pme_loadbal = NULL; -+ double cycles_pmes; -+ gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE; -+ -+#ifdef GMX_FAHCORE -+ /* Temporary addition for FAHCORE checkpointing */ -+ int chkpt_ret; -+#endif -+ -+ /* Check for special mdrun options */ -+ bRerunMD = (Flags & MD_RERUN); -+ bIonize = (Flags & MD_IONIZE); -+ bFFscan = (Flags & MD_FFSCAN); -+ bAppend = (Flags & MD_APPENDFILES); -+ if (Flags & MD_RESETCOUNTERSHALFWAY) -+ { -+ if (ir->nsteps > 0) -+ { -+ /* Signal to reset the counters half the simulation steps. */ -+ wcycle_set_reset_counters(wcycle, ir->nsteps/2); -+ } -+ /* Signal to reset the counters halfway the simulation time. 
*/ -+ bResetCountersHalfMaxH = (max_hours > 0); -+ } -+ -+ /* md-vv uses averaged full step velocities for T-control -+ md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control) -+ md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */ -+ bVV = EI_VV(ir->eI); -+ if (bVV) /* to store the initial velocities while computing virial */ -+ { -+ snew(cbuf, top_global->natoms); -+ } -+ /* all the iteratative cases - only if there are constraints */ -+ bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD)); -+ gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to -+ false in this step. The correct value, true or false, -+ is set at each step, as it depends on the frequency of temperature -+ and pressure control.*/ -+ bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir))); -+ -+ if (bRerunMD) -+ { -+ /* Since we don't know if the frames read are related in any way, -+ * rebuild the neighborlist at every step. -+ */ -+ ir->nstlist = 1; -+ ir->nstcalcenergy = 1; -+ nstglobalcomm = 1; -+ } -+ -+ check_ir_old_tpx_versions(cr, fplog, ir, top_global); -+ -+ nstglobalcomm = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir); -+ bGStatEveryStep = (nstglobalcomm == 1); -+ -+ if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL) -+ { -+ fprintf(fplog, -+ "To reduce the energy communication with nstlist = -1\n" -+ "the neighbor list validity should not be checked at every step,\n" -+ "this means that exact integration is not guaranteed.\n" -+ "The neighbor list validity is checked after:\n" -+ " - 2*std.dev.(n.list life time) steps.\n" -+ "In most cases this will result in exact integration.\n" -+ "This reduces the energy communication by a factor of 2 to 3.\n" -+ "If you want less energy communication, set nstlist > 3.\n\n"); -+ } -+ -+ if (bRerunMD || bFFscan) -+ { -+ ir->nstxtcout = 0; -+ } -+ groups = &top_global->groups; -+ -+ /* Initial values */ -+ init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda, -+ &(state_global->fep_state), lam0, -+ nrnb, top_global, &upd, -+ nfile, fnm, &outf, &mdebin, -+ force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, state_global, Flags); -+ -+ clear_mat(total_vir); -+ clear_mat(pres); -+ /* Energy terms and groups */ -+ snew(enerd, 1); -+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, -+ enerd); -+ if (DOMAINDECOMP(cr)) -+ { -+ f = NULL; -+ } -+ else -+ { -+ snew(f, top_global->natoms); -+ } -+ -+ /* Kinetic energy data */ -+ snew(ekind, 1); -+ init_ekindata(fplog, top_global, &(ir->opts), ekind); -+ /* needed for iteration of constraints */ -+ snew(ekind_save, 1); -+ init_ekindata(fplog, top_global, &(ir->opts), ekind_save); -+ /* Copy the cos acceleration to the groups struct */ -+ ekind->cosacc.cos_accel = ir->cos_accel; -+ -+ gstat = global_stat_init(ir); -+ debug_gmx(); -+ -+ /* Check for polarizable models and flexible constraints */ -+ shellfc = init_shell_flexcon(fplog, -+ top_global, n_flexible_constraints(constr), -+ (ir->bContinuation || -+ (DOMAINDECOMP(cr) && !MASTER(cr))) ? 
-+ NULL : state_global->x); -+ if (shellfc && ir->nstcalcenergy != 1) -+ { -+ gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy); -+ } -+ if (shellfc && DOMAINDECOMP(cr)) -+ { -+ gmx_fatal(FARGS, "In order to run parallel simulations with shells you need to use the -pd flag to mdrun."); -+ } -+ if (shellfc && ir->eI == eiNM) -+ { -+ /* Currently shells don't work with Normal Modes */ -+ gmx_fatal(FARGS, "Normal Mode analysis is not supported with shells.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n"); -+ } -+ -+ if (vsite && ir->eI == eiNM) -+ { -+ /* Currently virtual sites don't work with Normal Modes */ -+ gmx_fatal(FARGS, "Normal Mode analysis is not supported with virtual sites.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n"); -+ } -+ -+ if (DEFORM(*ir)) -+ { -+#ifdef GMX_THREAD_MPI -+ tMPI_Thread_mutex_lock(&deform_init_box_mutex); -+#endif -+ set_deform_reference_box(upd, -+ deform_init_init_step_tpx, -+ deform_init_box_tpx); -+#ifdef GMX_THREAD_MPI -+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex); -+#endif -+ } -+ -+ { -+ double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1); -+ if ((io > 2000) && MASTER(cr)) -+ { -+ fprintf(stderr, -+ "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", -+ io); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ top = dd_init_local_top(top_global); -+ -+ snew(state, 1); -+ dd_init_local_state(cr->dd, state_global, state); -+ -+ if (DDMASTER(cr->dd) && ir->nstfout) -+ { -+ snew(f_global, state_global->natoms); -+ } -+ } -+ else -+ { -+ if (PAR(cr)) -+ { -+ /* Initialize the particle decomposition and split the topology */ -+ top = split_system(fplog, top_global, ir, cr); -+ -+ pd_cg_range(cr, &fr->cg0, &fr->hcg); -+ pd_at_range(cr, &a0, &a1); -+ } -+ else -+ { -+ top = gmx_mtop_generate_local_top(top_global, ir); -+ -+ a0 = 0; -+ a1 = top_global->natoms; -+ } -+ -+ forcerec_set_excl_load(fr, top, cr); -+ -+ state = partdec_init_local_state(cr, state_global); -+ f_global = f; -+ -+ atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms); -+ -+ if (vsite) -+ { -+ set_vsite_top(vsite, top, mdatoms, cr); -+ } -+ -+ if (ir->ePBC != epbcNONE && !fr->bMolPBC) -+ { -+ graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE); -+ } -+ -+ if (shellfc) -+ { -+ make_local_shells(cr, mdatoms, shellfc); -+ } -+ -+ setup_bonded_threading(fr, &top->idef); -+ -+ if (ir->pull && PAR(cr)) -+ { -+ dd_make_local_pull_groups(NULL, ir->pull, mdatoms); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ /* Distribute the charge groups over the nodes from the master node */ -+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, FALSE); -+ -+ } -+ -+ update_mdatoms(mdatoms, state->lambda[efptMASS]); -+ -+ if (opt2bSet("-cpi", nfile, fnm)) -+ { -+ bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr); -+ } -+ else -+ { -+ bStateFromCP = FALSE; -+ } -+ -+ if (ir->bExpanded) -+ { -+ init_expanded_ensemble(bStateFromCP,ir,&mcrng,&state->dfhist); -+ } -+ -+ if (MASTER(cr)) -+ { -+ if (bStateFromCP) -+ { -+ /* Update mdebin with energy history if appending to output files */ -+ if (Flags & MD_APPENDFILES) -+ { -+ 
restore_energyhistory_from_state(mdebin, &state_global->enerhist); -+ } -+ else -+ { -+ /* We might have read an energy history from checkpoint, -+ * free the allocated memory and reset the counts. -+ */ -+ done_energyhistory(&state_global->enerhist); -+ init_energyhistory(&state_global->enerhist); -+ } -+ } -+ /* Set the initial energy history in state by updating once */ -+ update_energyhistory(&state_global->enerhist, mdebin); -+ } -+ -+ if ((state->flags & (1<flags & (1<mols.nr; -+ snew(grpindex, gnx); -+ for (i = 0; (i < gnx); i++) -+ { -+ grpindex[i] = i; -+ } -+ } -+ -+ if (repl_ex_nst > 0) -+ { -+ /* We need to be sure replica exchange can only occur -+ * when the energies are current */ -+ check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy, -+ "repl_ex_nst", &repl_ex_nst); -+ /* This check needs to happen before inter-simulation -+ * signals are initialized, too */ -+ } -+ if (repl_ex_nst > 0 && MASTER(cr)) -+ { -+ repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir, -+ repl_ex_nst, repl_ex_nex, repl_ex_seed); -+ } -+ -+ /* PME tuning is only supported with GPUs or PME nodes and not with rerun. -+ * With perturbed charges with soft-core we should not change the cut-off. -+ */ -+ if ((Flags & MD_TUNEPME) && -+ EEL_PME(fr->eeltype) && -+ ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) && -+ !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) && -+ !bRerunMD) -+ { -+ pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata); -+ cycles_pmes = 0; -+ if (cr->duty & DUTY_PME) -+ { -+ /* Start tuning right away, as we can't measure the load */ -+ bPMETuneRunning = TRUE; -+ } -+ else -+ { -+ /* Separate PME nodes, we can measure the PP/PME load balance */ -+ bPMETuneTry = TRUE; -+ } -+ } -+ -+ if (!ir->bContinuation && !bRerunMD) -+ { -+ if (mdatoms->cFREEZE && (state->flags & (1<start; i < mdatoms->start+mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m]) -+ { -+ state->v[i][m] = 0; -+ } -+ } -+ } -+ } -+ -+ if (constr) -+ { -+ /* Constrain the initial coordinates and velocities */ -+ do_constrain_first(fplog, constr, ir, mdatoms, state, f, -+ graph, cr, nrnb, fr, top, shake_vir); -+ } -+ if (vsite) -+ { -+ /* Construct the virtual sites for the initial configuration */ -+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, graph, cr, state->box); -+ } -+ } -+ -+ debug_gmx(); -+ -+ /* set free energy calculation frequency as the minimum -+ greatest common denominator of nstdhdl, nstexpanded, and repl_ex_nst*/ -+ nstfep = ir->fepvals->nstdhdl; -+ if (ir->bExpanded) -+ { -+ nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl,nstfep); -+ } -+ if (repl_ex_nst > 0) -+ { -+ nstfep = gmx_greatest_common_divisor(repl_ex_nst,nstfep); -+ } -+ -+ /* I'm assuming we need global communication the first time! MRS */ -+ cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT -+ | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0) -+ | (bVV ? CGLO_PRESSURE : 0) -+ | (bVV ? CGLO_CONSTRAINT : 0) -+ | (bRerunMD ? CGLO_RERUNMD : 0) -+ | ((Flags & MD_READ_EKIN) ? 
CGLO_READEKIN : 0)); -+ -+ bSumEkinhOld = FALSE; -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, cglo_flags); -+ if (ir->eI == eiVVAK) -+ { -+ /* a second call to get the half step temperature initialized as well */ -+ /* we do the same call as above, but turn the pressure off -- internally to -+ compute_globals, this is recognized as a velocity verlet half-step -+ kinetic energy calculation. This minimized excess variables, but -+ perhaps loses some logic?*/ -+ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, -+ cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE)); -+ } -+ -+ /* Calculate the initial half step temperature, and save the ekinh_old */ -+ if (!(Flags & MD_STARTFROMCPT)) -+ { -+ for (i = 0; (i < ir->opts.ngtc); i++) -+ { -+ copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old); -+ } -+ } -+ if (ir->eI != eiVV) -+ { -+ enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step, -+ and there is no previous step */ -+ } -+ -+ /* if using an iterative algorithm, we need to create a working directory for the state. */ -+ if (bIterativeCase) -+ { -+ bufstate = init_bufstate(state); -+ } -+ if (bFFscan) -+ { -+ snew(xcopy, state->natoms); -+ snew(vcopy, state->natoms); -+ copy_rvecn(state->x, xcopy, 0, state->natoms); -+ copy_rvecn(state->v, vcopy, 0, state->natoms); -+ copy_mat(state->box, boxcopy); -+ } -+ -+ /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter -+ temperature control */ -+ trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); -+ -+ if (MASTER(cr)) -+ { -+ if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS) -+ { -+ fprintf(fplog, -+ "RMS relative constraint deviation after constraining: %.2e\n", -+ constr_rmsd(constr, FALSE)); -+ } -+ if (EI_STATE_VELOCITY(ir->eI)) -+ { -+ fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]); -+ } -+ if (bRerunMD) -+ { -+ fprintf(stderr, "starting md rerun '%s', reading coordinates from" -+ " input trajectory '%s'\n\n", -+ *(top_global->name), opt2fn("-rerun", nfile, fnm)); -+ if (bVerbose) -+ { -+ fprintf(stderr, "Calculated time to finish depends on nsteps from " -+ "run input file,\nwhich may not correspond to the time " -+ "needed to process input trajectory.\n\n"); -+ } -+ } -+ else -+ { -+ char tbuf[20]; -+ fprintf(stderr, "starting mdrun '%s'\n", -+ *(top_global->name)); -+ if (ir->nsteps >= 0) -+ { -+ sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); -+ } -+ else -+ { -+ sprintf(tbuf, "%s", "infinite"); -+ } -+ if (ir->init_step > 0) -+ { -+ fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", -+ gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, -+ gmx_step_str(ir->init_step, sbuf2), -+ ir->init_step*ir->delta_t); -+ } -+ else -+ { -+ fprintf(stderr, "%s steps, %s ps.\n", -+ gmx_step_str(ir->nsteps, sbuf), tbuf); -+ } -+ } -+ fprintf(fplog, "\n"); -+ } -+ -+ print_start(fplog, cr, runtime, "mdrun"); -+ runtime_start(runtime); -+ wallcycle_start(wcycle, ewcRUN); -+ -+ /* safest point to do file checkpointing is here. 
More general point would be immediately before integrator call */ -+#ifdef GMX_FAHCORE -+ chkpt_ret = fcCheckPointParallel( cr->nodeid, -+ NULL, 0); -+ if (chkpt_ret == 0) -+ { -+ gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); -+ } -+#endif -+ -+ debug_gmx(); -+ /*********************************************************** -+ * -+ * Loop over MD steps -+ * -+ ************************************************************/ -+ -+ /* if rerunMD then read coordinates and velocities from input trajectory */ -+ if (bRerunMD) -+ { -+ if (getenv("GMX_FORCE_UPDATE")) -+ { -+ bForceUpdate = TRUE; -+ } -+ -+ rerun_fr.natoms = 0; -+ if (MASTER(cr)) -+ { -+ bNotLastFrame = read_first_frame(oenv, &status, -+ opt2fn("-rerun", nfile, fnm), -+ &rerun_fr, TRX_NEED_X | TRX_READ_V); -+ if (rerun_fr.natoms != top_global->natoms) -+ { -+ gmx_fatal(FARGS, -+ "Number of atoms in trajectory (%d) does not match the " -+ "run input file (%d)\n", -+ rerun_fr.natoms, top_global->natoms); -+ } -+ if (ir->ePBC != epbcNONE) -+ { -+ if (!rerun_fr.bBox) -+ { -+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time); -+ } -+ if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong)) -+ { -+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time); -+ } -+ } -+ } -+ -+ if (PAR(cr)) -+ { -+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); -+ } -+ -+ if (ir->ePBC != epbcNONE) -+ { -+ /* Set the shift vectors. -+ * Necessary here when have a static box different from the tpr box. -+ */ -+ calc_shifts(rerun_fr.box, fr->shift_vec); -+ } -+ } -+ -+ /* loop over MD steps or if rerunMD to end of input trajectory */ -+ bFirstStep = TRUE; -+ /* Skip the first Nose-Hoover integration when we get the state from tpx */ -+ bStateFromTPX = !bStateFromCP; -+ bInitStep = bFirstStep && (bStateFromTPX || bVV); -+ bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep; -+ bLastStep = FALSE; -+ bSumEkinhOld = FALSE; -+ bExchanged = FALSE; -+ -+ init_global_signals(&gs, cr, ir, repl_ex_nst); -+ -+ step = ir->init_step; -+ step_rel = 0; -+ -+ if (ir->nstlist == -1) -+ { -+ init_nlistheuristics(&nlh, bGStatEveryStep, step); -+ } -+ -+ if (MULTISIM(cr) && (repl_ex_nst <= 0 )) -+ { -+ /* check how many steps are left in other sims */ -+ multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps); -+ } -+ -+ -+ /* and stop now if we should */ -+ bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) || -+ ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps ))); -+ while (!bLastStep || (bRerunMD && bNotLastFrame)) -+ { -+ -+ wallcycle_start(wcycle, ewcSTEP); -+ -+ GMX_MPE_LOG(ev_timestep1); -+ -+ if (bRerunMD) -+ { -+ if (rerun_fr.bStep) -+ { -+ step = rerun_fr.step; -+ step_rel = step - ir->init_step; -+ } -+ if (rerun_fr.bTime) -+ { -+ t = rerun_fr.time; -+ } -+ else -+ { -+ t = step; -+ } -+ } -+ else -+ { -+ bLastStep = (step_rel == ir->nsteps); -+ t = t0 + step*ir->delta_t; -+ } -+ -+ if (ir->efep != efepNO || ir->bSimTemp) -+ { -+ /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value, -+ requiring different logic. 
*/ -+ -+ set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0); -+ bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl); -+ bDoFEP = (do_per_step(step, nstfep) && (ir->efep != efepNO)); -+ bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) -+ && (ir->bExpanded) && (step > 0) && (!bStartingFromCpt)); -+ } -+ -+ if (bSimAnn) -+ { -+ update_annealing_target_temp(&(ir->opts), t); -+ } -+ -+ if (bRerunMD) -+ { -+ if (!(DOMAINDECOMP(cr) && !MASTER(cr))) -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(rerun_fr.x[i], state_global->x[i]); -+ } -+ if (rerun_fr.bV) -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(rerun_fr.v[i], state_global->v[i]); -+ } -+ } -+ else -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ clear_rvec(state_global->v[i]); -+ } -+ if (bRerunWarnNoV) -+ { -+ fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n" -+ " Ekin, temperature and pressure are incorrect,\n" -+ " the virial will be incorrect when constraints are present.\n" -+ "\n"); -+ bRerunWarnNoV = FALSE; -+ } -+ } -+ } -+ copy_mat(rerun_fr.box, state_global->box); -+ copy_mat(state_global->box, state->box); -+ -+ if (vsite && (Flags & MD_RERUN_VSITE)) -+ { -+ if (DOMAINDECOMP(cr)) -+ { -+ gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition"); -+ } -+ if (graph) -+ { -+ /* Following is necessary because the graph may get out of sync -+ * with the coordinates if we only have every N'th coordinate set -+ */ -+ mk_mshift(fplog, graph, fr->ePBC, state->box, state->x); -+ shift_self(graph, state->box, state->x); -+ } -+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, graph, cr, state->box); -+ if (graph) -+ { -+ unshift_self(graph, state->box, state->x); -+ } -+ } -+ } -+ -+ /* Stop Center of Mass motion */ -+ bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm)); -+ -+ /* Copy back starting coordinates in case we're doing a forcefield scan */ -+ if (bFFscan) -+ { -+ for (ii = 0; (ii < state->natoms); ii++) -+ { -+ copy_rvec(xcopy[ii], state->x[ii]); -+ copy_rvec(vcopy[ii], state->v[ii]); -+ } -+ copy_mat(boxcopy, state->box); -+ } -+ -+ if (bRerunMD) -+ { -+ /* for rerun MD always do Neighbour Searching */ -+ bNS = (bFirstStep || ir->nstlist != 0); -+ bNStList = bNS; -+ } -+ else -+ { -+ /* Determine whether or not to do Neighbour Searching and LR */ -+ bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); -+ -+ bNS = (bFirstStep || bExchanged || bNStList || bDoFEP || -+ (ir->nstlist == -1 && nlh.nabnsb > 0)); -+ -+ if (bNS && ir->nstlist == -1) -+ { -+ set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step); -+ } -+ } -+ -+ /* check whether we should stop because another simulation has -+ stopped. 
*/ -+ if (MULTISIM(cr)) -+ { -+ if ( (multisim_nsteps >= 0) && (step_rel >= multisim_nsteps) && -+ (multisim_nsteps != ir->nsteps) ) -+ { -+ if (bNS) -+ { -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, -+ "Stopping simulation %d because another one has finished\n", -+ cr->ms->sim); -+ } -+ bLastStep = TRUE; -+ gs.sig[eglsCHKPT] = 1; -+ } -+ } -+ } -+ -+ /* < 0 means stop at next step, > 0 means stop at next NS step */ -+ if ( (gs.set[eglsSTOPCOND] < 0) || -+ ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) ) -+ { -+ bLastStep = TRUE; -+ } -+ -+ /* Determine whether or not to update the Born radii if doing GB */ -+ bBornRadii = bFirstStep; -+ if (ir->implicit_solvent && (step % ir->nstgbradii == 0)) -+ { -+ bBornRadii = TRUE; -+ } -+ -+ do_log = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep; -+ do_verbose = bVerbose && -+ (step % stepout == 0 || bFirstStep || bLastStep); -+ -+ if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD)) -+ { -+ if (bRerunMD) -+ { -+ bMasterState = TRUE; -+ } -+ else -+ { -+ bMasterState = FALSE; -+ /* Correct the new box if it is too skewed */ -+ if (DYNAMIC_BOX(*ir)) -+ { -+ if (correct_box(fplog, step, state->box, graph)) -+ { -+ bMasterState = TRUE; -+ } -+ } -+ if (DOMAINDECOMP(cr) && bMasterState) -+ { -+ dd_collect_state(cr->dd, state, state_global); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ /* Repartition the domain decomposition */ -+ wallcycle_start(wcycle, ewcDOMDEC); -+ dd_partition_system(fplog, step, cr, -+ bMasterState, nstglobalcomm, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, -+ do_verbose && !bPMETuneRunning); -+ wallcycle_stop(wcycle, ewcDOMDEC); -+ /* If using an iterative integrator, reallocate space to match the decomposition */ -+ } -+ } -+ -+ if (MASTER(cr) && do_log && !bFFscan) -+ { -+ print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */ -+ } -+ -+ if (ir->efep != efepNO) -+ { -+ update_mdatoms(mdatoms, state->lambda[efptMASS]); -+ } -+ -+ if ((bRerunMD && rerun_fr.bV) || bExchanged) -+ { -+ -+ /* We need the kinetic energy at minus the half step for determining -+ * the full step kinetic energy and possibly for T-coupling.*/ -+ /* This may not be quite working correctly yet . . . . */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, -+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); -+ } -+ clear_mat(force_vir); -+ -+ /* Ionize the atoms if necessary */ -+ if (bIonize) -+ { -+ ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v, -+ mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr); -+ } -+ -+ /* Update force field in ffscan program */ -+ if (bFFscan) -+ { -+ if (update_forcefield(fplog, -+ nfile, fnm, fr, -+ mdatoms->nr, state->x, state->box)) -+ { -+ gmx_finalize_par(); -+ -+ exit(0); -+ } -+ } -+ -+ GMX_MPE_LOG(ev_timestep2); -+ -+ /* We write a checkpoint at this MD step when: -+ * either at an NS step when we signalled through gs, -+ * or at the last step (but not when we do not want confout), -+ * but never at the first step or with rerun. 
-+ */ -+ bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) || -+ (bLastStep && (Flags & MD_CONFOUT))) && -+ step > ir->init_step && !bRerunMD); -+ if (bCPT) -+ { -+ gs.set[eglsCHKPT] = 0; -+ } -+ -+ /* Determine the energy and pressure: -+ * at nstcalcenergy steps and at energy output steps (set below). -+ */ -+ if (EI_VV(ir->eI) && (!bInitStep)) -+ { -+ /* for vv, the first half of the integration actually corresponds -+ to the previous step. bCalcEner is only required to be evaluated on the 'next' step, -+ but the virial needs to be calculated on both the current step and the 'next' step. Future -+ reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */ -+ -+ bCalcEner = do_per_step(step-1, ir->nstcalcenergy); -+ bCalcVir = bCalcEner || -+ (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple))); -+ } -+ else -+ { -+ bCalcEner = do_per_step(step, ir->nstcalcenergy); -+ bCalcVir = bCalcEner || -+ (ir->epc != epcNO && do_per_step(step, ir->nstpcouple)); -+ } -+ -+ /* Do we need global communication ? */ -+ bGStat = (bCalcVir || bCalcEner || bStopCM || -+ do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) || -+ (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck)); -+ -+ do_ene = (do_per_step(step, ir->nstenergy) || bLastStep); -+ -+ if (do_ene || do_log) -+ { -+ bCalcVir = TRUE; -+ bCalcEner = TRUE; -+ bGStat = TRUE; -+ } -+ -+ /* these CGLO_ options remain the same throughout the iteration */ -+ cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) | -+ (bGStat ? CGLO_GSTAT : 0) -+ ); -+ -+ force_flags = (GMX_FORCE_STATECHANGED | -+ ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) | -+ GMX_FORCE_ALLFORCES | -+ GMX_FORCE_SEPLRF | -+ (bCalcVir ? GMX_FORCE_VIRIAL : 0) | -+ (bCalcEner ? GMX_FORCE_ENERGY : 0) | -+ (bDoFEP ? GMX_FORCE_DHDL : 0) -+ ); -+ -+ if (fr->bTwinRange) -+ { -+ if (do_per_step(step, ir->nstcalclr)) -+ { -+ force_flags |= GMX_FORCE_DO_LR; -+ } -+ } -+ -+ if (shellfc) -+ { -+ /* Now is the time to relax the shells */ -+ count = relax_shell_flexcon(fplog, cr, bVerbose, bFFscan ? step+1 : step, -+ ir, bNS, force_flags, -+ bStopCM, top, top_global, -+ constr, enerd, fcd, -+ state, f, force_vir, mdatoms, -+ nrnb, wcycle, graph, groups, -+ shellfc, fr, bBornRadii, t, mu_tot, -+ state->natoms, &bConverged, vsite, -+ outf->fp_field); -+ tcount += count; -+ -+ if (bConverged) -+ { -+ nconverged++; -+ } -+ } -+ else -+ { -+ /* The coordinates (x) are shifted (to get whole molecules) -+ * in do_force. -+ * This is parallellized as well, and does communication too. -+ * Check comments in sim_util.c -+ */ -+ do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups, -+ state->box, state->x, &state->hist, -+ f, force_vir, mdatoms, enerd, fcd, -+ state->lambda, graph, -+ fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii, -+ (bNS ? 
GMX_FORCE_NS : 0) | force_flags); -+ } -+ -+ GMX_BARRIER(cr->mpi_comm_mygroup); -+ -+ if (bTCR) -+ { -+ mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA, -+ mu_tot, &top_global->mols, mdatoms, gnx, grpindex); -+ } -+ -+ if (bTCR && bFirstStep) -+ { -+ tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef)); -+ fprintf(fplog, "Done init_coupling\n"); -+ fflush(fplog); -+ } -+ -+ if (bVV && !bStartingFromCpt && !bRerunMD) -+ /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ -+ { -+ if (ir->eI == eiVV && bInitStep) -+ { -+ /* if using velocity verlet with full time step Ekin, -+ * take the first half step only to compute the -+ * virial for the first step. From there, -+ * revert back to the initial coordinates -+ * so that the input is actually the initial step. -+ */ -+ copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */ -+ } -+ else -+ { -+ /* this is for NHC in the Ekin(t+dt/2) version of vv */ -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); -+ } -+ -+ /* If we are using twin-range interactions where the long-range component -+ * is only evaluated every nstcalclr>1 steps, we should do a special update -+ * step to combine the long-range forces on these steps. -+ * For nstcalclr=1 this is not done, since the forces would have been added -+ * directly to the short-range forces already. -+ * -+ * TODO Remove various aspects of VV+twin-range in master -+ * branch, because VV integrators did not ever support -+ * twin-range multiple time stepping with constraints. -+ */ -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, -+ f, bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1, -+ cr, nrnb, constr, &top->idef); -+ -+ if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep) -+ { -+ gmx_iterate_init(&iterate, TRUE); -+ } -+ /* for iterations, we save these vectors, as we will be self-consistently iterating -+ the calculations */ -+ -+ /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */ -+ -+ /* save the state */ -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); -+ } -+ -+ bFirstIterate = TRUE; -+ while (bFirstIterate || iterate.bIterationActive) -+ { -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); -+ if (bFirstIterate && bTrotter) -+ { -+ /* The first time through, we need a decent first estimate -+ of veta(t+dt) to compute the constraints. Do -+ this by computing the box volume part of the -+ trotter integration at this time. Nothing else -+ should be changed by this routine here. If -+ !(first time), we start with the previous value -+ of veta. */ -+ -+ veta_save = state->veta; -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0); -+ vetanew = state->veta; -+ state->veta = veta_save; -+ } -+ } -+ -+ bOK = TRUE; -+ if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. 
*/ -+ { -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, shake_vir, NULL, -+ cr, nrnb, wcycle, upd, constr, -+ bInitStep, TRUE, bCalcVir, vetanew); -+ -+ if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1) -+ { -+ /* Correct the virial for multiple time stepping */ -+ m_sub(shake_vir, fr->vir_twin_constr, shake_vir); -+ } -+ -+ if (!bOK && !bFFscan) -+ { -+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); -+ } -+ -+ } -+ else if (graph) -+ { -+ /* Need to unshift here if a do_force has been -+ called in the previous step */ -+ unshift_self(graph, state->box, state->x); -+ } -+ -+ /* if VV, compute the pressure and constraints */ -+ /* For VV2, we strictly only need this if using pressure -+ * control, but we really would like to have accurate pressures -+ * printed out. -+ * Think about ways around this in the future? -+ * For now, keep this choice in comments. -+ */ -+ /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */ -+ /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/ -+ bPres = TRUE; -+ bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK)); -+ if (bCalcEner && ir->eI == eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/ -+ { -+ bSumEkinhOld = TRUE; -+ } -+ /* for vv, the first half of the integration actually corresponds to the previous step. -+ So we need information from the last step in the first half of the integration */ -+ if (bGStat || do_per_step(step-1, nstglobalcomm)) -+ { -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, -+ cglo_flags -+ | CGLO_ENERGY -+ | (bTemp ? CGLO_TEMPERATURE : 0) -+ | (bPres ? CGLO_PRESSURE : 0) -+ | (bPres ? CGLO_CONSTRAINT : 0) -+ | ((iterate.bIterationActive) ? CGLO_ITERATE : 0) -+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0) -+ | CGLO_SCALEEKIN -+ ); -+ /* explanation of above: -+ a) We compute Ekin at the full time step -+ if 1) we are using the AveVel Ekin, and it's not the -+ initial step, or 2) if we are using AveEkin, but need the full -+ time step kinetic energy for the pressure (always true now, since we want accurate statistics). -+ b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in -+ EkinAveVel because it's needed for the pressure */ -+ } -+ /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ -+ if (!bInitStep) -+ { -+ if (bTrotter) -+ { -+ m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */ -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2); -+ } -+ else -+ { -+ if (bExchanged) -+ { -+ -+ /* We need the kinetic energy at minus the half step for determining -+ * the full step kinetic energy and possibly for T-coupling.*/ -+ /* This may not be quite working correctly yet . . . . 
*/ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, -+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); -+ } -+ } -+ } -+ -+ if (iterate.bIterationActive && -+ done_iterating(cr, fplog, step, &iterate, bFirstIterate, -+ state->veta, &vetanew)) -+ { -+ break; -+ } -+ bFirstIterate = FALSE; -+ } -+ -+ if (bTrotter && !bInitStep) -+ { -+ copy_mat(shake_vir, state->svir_prev); -+ copy_mat(force_vir, state->fvir_prev); -+ if (IR_NVT_TROTTER(ir) && ir->eI == eiVV) -+ { -+ /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */ -+ enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE, FALSE); -+ enerd->term[F_EKIN] = trace(ekind->ekin); -+ } -+ } -+ /* if it's the initial step, we performed this first step just to get the constraint virial */ -+ if (bInitStep && ir->eI == eiVV) -+ { -+ copy_rvecn(cbuf, state->v, 0, state->natoms); -+ } -+ -+ GMX_MPE_LOG(ev_timestep1); -+ } -+ -+ /* MRS -- now done iterating -- compute the conserved quantity */ -+ if (bVV) -+ { -+ saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ); -+ if (ir->eI == eiVV) -+ { -+ last_ekin = enerd->term[F_EKIN]; -+ } -+ if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) -+ { -+ saved_conserved_quantity -= enerd->term[F_DISPCORR]; -+ } -+ /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */ -+ if (!bRerunMD) -+ { -+ sum_dhdl(enerd, state->lambda, ir->fepvals); -+ } -+ } -+ -+ /* ######## END FIRST UPDATE STEP ############## */ -+ /* ######## If doing VV, we now have v(dt) ###### */ -+ if (bDoExpanded) -+ { -+ /* perform extended ensemble sampling in lambda - we don't -+ actually move to the new state before outputting -+ statistics, but if performing simulated tempering, we -+ do update the velocities and the tau_t. */ -+ -+ lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, mcrng, state->v, mdatoms); -+ /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */ -+ copy_df_history(&state_global->dfhist,&state->dfhist); -+ } -+ /* ################## START TRAJECTORY OUTPUT ################# */ -+ -+ /* Now we have the energies and forces corresponding to the -+ * coordinates at time t. We must output all of this before -+ * the update. 
-+ * for RerunMD t is read from input trajectory -+ */ -+ GMX_MPE_LOG(ev_output_start); -+ -+ mdof_flags = 0; -+ if (do_per_step(step, ir->nstxout)) -+ { -+ mdof_flags |= MDOF_X; -+ } -+ if (do_per_step(step, ir->nstvout)) -+ { -+ mdof_flags |= MDOF_V; -+ } -+ if (do_per_step(step, ir->nstfout)) -+ { -+ mdof_flags |= MDOF_F; -+ } -+ if (do_per_step(step, ir->nstxtcout)) -+ { -+ mdof_flags |= MDOF_XTC; -+ } -+ if (bCPT) -+ { -+ mdof_flags |= MDOF_CPT; -+ } -+ ; -+ -+#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP) -+ if (bLastStep) -+ { -+ /* Enforce writing positions and velocities at end of run */ -+ mdof_flags |= (MDOF_X | MDOF_V); -+ } -+#endif -+#ifdef GMX_FAHCORE -+ if (MASTER(cr)) -+ { -+ fcReportProgress( ir->nsteps, step ); -+ } -+ -+#if defined(__native_client__) -+ fcCheckin(MASTER(cr)); -+#endif -+ -+ /* sync bCPT and fc record-keeping */ -+ if (bCPT && MASTER(cr)) -+ { -+ fcRequestCheckPoint(); -+ } -+#endif -+ -+ if (mdof_flags != 0) -+ { -+ wallcycle_start(wcycle, ewcTRAJ); -+ if (bCPT) -+ { -+ if (state->flags & (1<flags & (1<ekinstate.bUpToDate = FALSE; -+ } -+ else -+ { -+ update_ekinstate(&state_global->ekinstate, ekind); -+ state_global->ekinstate.bUpToDate = TRUE; -+ } -+ update_energyhistory(&state_global->enerhist, mdebin); -+ } -+ } -+ write_traj(fplog, cr, outf, mdof_flags, top_global, -+ step, t, state, state_global, f, f_global, &n_xtc, &x_xtc); -+ if (bCPT) -+ { -+ nchkpt++; -+ bCPT = FALSE; -+ } -+ debug_gmx(); -+ if (bLastStep && step_rel == ir->nsteps && -+ (Flags & MD_CONFOUT) && MASTER(cr) && -+ !bRerunMD && !bFFscan) -+ { -+ /* x and v have been collected in write_traj, -+ * because a checkpoint file will always be written -+ * at the last step. -+ */ -+ fprintf(stderr, "\nWriting final coordinates.\n"); -+ if (fr->bMolPBC) -+ { -+ /* Make molecules whole only for confout writing */ -+ do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x); -+ } -+ write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm), -+ *top_global->name, top_global, -+ state_global->x, state_global->v, -+ ir->ePBC, state->box); -+ debug_gmx(); -+ } -+ wallcycle_stop(wcycle, ewcTRAJ); -+ } -+ GMX_MPE_LOG(ev_output_finish); -+ -+ /* kludge -- virial is lost with restart for NPT control. Must restart */ -+ if (bStartingFromCpt && bVV) -+ { -+ copy_mat(state->svir_prev, shake_vir); -+ copy_mat(state->fvir_prev, force_vir); -+ } -+ /* ################## END TRAJECTORY OUTPUT ################ */ -+ -+ /* Determine the wallclock run time up till now */ -+ run_time = gmx_gettime() - (double)runtime->real; -+ -+ /* Check whether everything is still allright */ -+ if (((int)gmx_get_stop_condition() > handled_stop_condition) -+#ifdef GMX_THREAD_MPI -+ && MASTER(cr) -+#endif -+ ) -+ { -+ /* this is just make gs.sig compatible with the hack -+ of sending signals around by MPI_Reduce with together with -+ other floats */ -+ if (gmx_get_stop_condition() == gmx_stop_cond_next_ns) -+ { -+ gs.sig[eglsSTOPCOND] = 1; -+ } -+ if (gmx_get_stop_condition() == gmx_stop_cond_next) -+ { -+ gs.sig[eglsSTOPCOND] = -1; -+ } -+ /* < 0 means stop at next step, > 0 means stop at next NS step */ -+ if (fplog) -+ { -+ fprintf(fplog, -+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n", -+ gmx_get_signal_name(), -+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : ""); -+ fflush(fplog); -+ } -+ fprintf(stderr, -+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n", -+ gmx_get_signal_name(), -+ gs.sig[eglsSTOPCOND] == 1 ? 
"NS " : ""); -+ fflush(stderr); -+ handled_stop_condition = (int)gmx_get_stop_condition(); -+ } -+ else if (MASTER(cr) && (bNS || ir->nstlist <= 0) && -+ (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) && -+ gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0) -+ { -+ /* Signal to terminate the run */ -+ gs.sig[eglsSTOPCOND] = 1; -+ if (fplog) -+ { -+ fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); -+ } -+ fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); -+ } -+ -+ if (bResetCountersHalfMaxH && MASTER(cr) && -+ run_time > max_hours*60.0*60.0*0.495) -+ { -+ gs.sig[eglsRESETCOUNTERS] = 1; -+ } -+ -+ if (ir->nstlist == -1 && !bRerunMD) -+ { -+ /* When bGStatEveryStep=FALSE, global_stat is only called -+ * when we check the atom displacements, not at NS steps. -+ * This means that also the bonded interaction count check is not -+ * performed immediately after NS. Therefore a few MD steps could -+ * be performed with missing interactions. -+ * But wrong energies are never written to file, -+ * since energies are only written after global_stat -+ * has been called. -+ */ -+ if (step >= nlh.step_nscheck) -+ { -+ nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs, -+ nlh.scale_tot, state->x); -+ } -+ else -+ { -+ /* This is not necessarily true, -+ * but step_nscheck is determined quite conservatively. -+ */ -+ nlh.nabnsb = 0; -+ } -+ } -+ -+ /* In parallel we only have to check for checkpointing in steps -+ * where we do global communication, -+ * otherwise the other nodes don't know. -+ */ -+ if (MASTER(cr) && ((bGStat || !PAR(cr)) && -+ cpt_period >= 0 && -+ (cpt_period == 0 || -+ run_time >= nchkpt*cpt_period*60.0)) && -+ gs.set[eglsCHKPT] == 0) -+ { -+ gs.sig[eglsCHKPT] = 1; -+ } -+ -+ /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */ -+ if (EI_VV(ir->eI)) -+ { -+ if (!bInitStep) -+ { -+ update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms); -+ } -+ if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */ -+ { -+ gmx_bool bIfRandomize; -+ bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr, DOMAINDECOMP(cr)); -+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ -+ if (constr && bIfRandomize) -+ { -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, tmp_vir, NULL, -+ cr, nrnb, wcycle, upd, constr, -+ bInitStep, TRUE, bCalcVir, vetanew); -+ } -+ } -+ } -+ -+ if (bIterativeCase && do_per_step(step, ir->nstpcouple)) -+ { -+ gmx_iterate_init(&iterate, TRUE); -+ /* for iterations, we save these vectors, as we will be redoing the calculations */ -+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); -+ } -+ -+ bFirstIterate = TRUE; -+ while (bFirstIterate || iterate.bIterationActive) -+ { -+ /* We now restore these vectors to redo the calculation with improved extended variables */ -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); -+ } -+ -+ /* We make the decision to break or not -after- the calculation of Ekin and Pressure, -+ so scroll down for that logic */ -+ -+ /* ######### START SECOND UPDATE STEP ################# */ -+ GMX_MPE_LOG(ev_update_start); -+ /* Box is changed in update() when we do 
pressure coupling, -+ * but we should still use the old box for energy corrections and when -+ * writing it to the energy file, so it matches the trajectory files for -+ * the same timestep above. Make a copy in a separate array. -+ */ -+ copy_mat(state->box, lastbox); -+ -+ bOK = TRUE; -+ dvdl_constr = 0; -+ -+ if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate)) -+ { -+ wallcycle_start(wcycle, ewcUPDATE); -+ /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ -+ if (bTrotter) -+ { -+ if (iterate.bIterationActive) -+ { -+ if (bFirstIterate) -+ { -+ scalevir = 1; -+ } -+ else -+ { -+ /* we use a new value of scalevir to converge the iterations faster */ -+ scalevir = tracevir/trace(shake_vir); -+ } -+ msmul(shake_vir, scalevir, shake_vir); -+ m_add(force_vir, shake_vir, total_vir); -+ clear_mat(shake_vir); -+ } -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3); -+ /* We can only do Berendsen coupling after we have summed -+ * the kinetic energy or virial. Since the happens -+ * in global_state after update, we should only do it at -+ * step % nstlist = 1 with bGStatEveryStep=FALSE. -+ */ -+ } -+ else -+ { -+ update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms); -+ update_pcouple(fplog, step, ir, state, pcoupl_mu, M, wcycle, -+ upd, bInitStep); -+ } -+ -+ if (bVV) -+ { -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ /* velocity half-step update */ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, wcycle, upd, FALSE, etrtVELOCITY2, -+ cr, nrnb, constr, &top->idef); -+ } -+ -+ /* Above, initialize just copies ekinh into ekin, -+ * it doesn't copy position (for VV), -+ * and entire integrator for MD. -+ */ -+ -+ if (ir->eI == eiVVAK) -+ { -+ copy_rvecn(state->x, cbuf, 0, state->natoms); -+ } -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); -+ wallcycle_stop(wcycle, ewcUPDATE); -+ -+ update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state, -+ fr->bMolPBC, graph, f, -+ &top->idef, shake_vir, force_vir, -+ cr, nrnb, wcycle, upd, constr, -+ bInitStep, FALSE, bCalcVir, state->veta); -+ -+ if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1) -+ { -+ /* Correct the virial for multiple time stepping */ -+ m_sub(shake_vir, fr->vir_twin_constr, shake_vir); -+ } -+ -+ if (ir->eI == eiVVAK) -+ { -+ /* erase F_EKIN and F_TEMP here? 
*/ -+ /* just compute the kinetic energy at the half step to perform a trotter step */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, lastbox, -+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, -+ cglo_flags | CGLO_TEMPERATURE -+ ); -+ wallcycle_start(wcycle, ewcUPDATE); -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4); -+ /* now we know the scaling, we can compute the positions again again */ -+ copy_rvecn(cbuf, state->x, 0, state->natoms); -+ -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); -+ wallcycle_stop(wcycle, ewcUPDATE); -+ -+ /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */ -+ /* are the small terms in the shake_vir here due -+ * to numerical errors, or are they important -+ * physically? I'm thinking they are just errors, but not completely sure. -+ * For now, will call without actually constraining, constr=NULL*/ -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, tmp_vir, force_vir, -+ cr, nrnb, wcycle, upd, NULL, -+ bInitStep, FALSE, bCalcVir, -+ state->veta); -+ } -+ if (!bOK && !bFFscan) -+ { -+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); -+ } -+ -+ if (fr->bSepDVDL && fplog && do_log) -+ { -+ fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl_constr); -+ } -+ if (bVV) -+ { -+ /* this factor or 2 correction is necessary -+ because half of the constraint force is removed -+ in the vv step, so we have to double it. See -+ the Redmine issue #1255. It is not yet clear -+ if the factor of 2 is exact, or just a very -+ good approximation, and this will be -+ investigated. The next step is to see if this -+ can be done adding a dhdl contribution from the -+ rattle step, but this is somewhat more -+ complicated with the current code. Will be -+ investigated, hopefully for 4.6.3. However, -+ this current solution is much better than -+ having it completely wrong. -+ */ -+ enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr; -+ } -+ else -+ { -+ enerd->term[F_DVDL_CONSTR] += dvdl_constr; -+ } -+ } -+ else if (graph) -+ { -+ /* Need to unshift here */ -+ unshift_self(graph, state->box, state->x); -+ } -+ -+ GMX_BARRIER(cr->mpi_comm_mygroup); -+ GMX_MPE_LOG(ev_update_finish); -+ -+ if (vsite != NULL) -+ { -+ wallcycle_start(wcycle, ewcVSITECONSTR); -+ if (graph != NULL) -+ { -+ shift_self(graph, state->box, state->x); -+ } -+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, graph, cr, state->box); -+ -+ if (graph != NULL) -+ { -+ unshift_self(graph, state->box, state->x); -+ } -+ wallcycle_stop(wcycle, ewcVSITECONSTR); -+ } -+ -+ /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */ -+ /* With Leap-Frog we can skip compute_globals at -+ * non-communication steps, but we need to calculate -+ * the kinetic energy one step before communication. 
-+ */ -+ if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm))) -+ { -+ if (ir->nstlist == -1 && bFirstIterate) -+ { -+ gs.sig[eglsNABNSB] = nlh.nabnsb; -+ } -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, -+ bFirstIterate ? &gs : NULL, -+ (step_rel % gs.nstms == 0) && -+ (multisim_nsteps < 0 || (step_rel < multisim_nsteps)), -+ lastbox, -+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, -+ cglo_flags -+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0) -+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) -+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) -+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) -+ | (iterate.bIterationActive ? CGLO_ITERATE : 0) -+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0) -+ | CGLO_CONSTRAINT -+ ); -+ if (ir->nstlist == -1 && bFirstIterate) -+ { -+ nlh.nabnsb = gs.set[eglsNABNSB]; -+ gs.set[eglsNABNSB] = 0; -+ } -+ } -+ /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */ -+ /* ############# END CALC EKIN AND PRESSURE ################# */ -+ -+ /* Note: this is OK, but there are some numerical precision issues with using the convergence of -+ the virial that should probably be addressed eventually. state->veta has better properies, -+ but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could -+ generate the new shake_vir, but test the veta value for convergence. This will take some thought. */ -+ -+ if (iterate.bIterationActive && -+ done_iterating(cr, fplog, step, &iterate, bFirstIterate, -+ trace(shake_vir), &tracevir)) -+ { -+ break; -+ } -+ bFirstIterate = FALSE; -+ } -+ -+ if (!bVV || bRerunMD) -+ { -+ /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */ -+ sum_dhdl(enerd, state->lambda, ir->fepvals); -+ } -+ update_box(fplog, step, ir, mdatoms, state, graph, f, -+ ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE); -+ -+ /* ################# END UPDATE STEP 2 ################# */ -+ /* #### We now have r(t+dt) and v(t+dt/2) ############# */ -+ -+ /* The coordinates (x) were unshifted in update */ -+ if (bFFscan && (shellfc == NULL || bConverged)) -+ { -+ if (print_forcefield(fplog, enerd->term, mdatoms->homenr, -+ f, NULL, xcopy, -+ &(top_global->mols), mdatoms->massT, pres)) -+ { -+ gmx_finalize_par(); -+ -+ fprintf(stderr, "\n"); -+ exit(0); -+ } -+ } -+ if (!bGStat) -+ { -+ /* We will not sum ekinh_old, -+ * so signal that we still have to do it. -+ */ -+ bSumEkinhOld = TRUE; -+ } -+ -+ if (bTCR) -+ { -+ /* Only do GCT when the relaxation of shells (minimization) has converged, -+ * otherwise we might be coupling to bogus energies. -+ * In parallel we must always do this, because the other sims might -+ * update the FF. -+ */ -+ -+ /* Since this is called with the new coordinates state->x, I assume -+ * we want the new box state->box too. 
/ EL 20040121 -+ */ -+ do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr, -+ ir, MASTER(cr), -+ mdatoms, &(top->idef), mu_aver, -+ top_global->mols.nr, cr, -+ state->box, total_vir, pres, -+ mu_tot, state->x, f, bConverged); -+ debug_gmx(); -+ } -+ -+ /* ######### BEGIN PREPARING EDR OUTPUT ########### */ -+ -+ /* use the directly determined last velocity, not actually the averaged half steps */ -+ if (bTrotter && ir->eI == eiVV) -+ { -+ enerd->term[F_EKIN] = last_ekin; -+ } -+ enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN]; -+ -+ if (bVV) -+ { -+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity; -+ } -+ else -+ { -+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ); -+ } -+ /* Check for excessively large energies */ -+ if (bIonize) -+ { -+#ifdef GMX_DOUBLE -+ real etot_max = 1e200; -+#else -+ real etot_max = 1e30; -+#endif -+ if (fabs(enerd->term[F_ETOT]) > etot_max) -+ { -+ fprintf(stderr, "Energy too large (%g), giving up\n", -+ enerd->term[F_ETOT]); -+ } -+ } -+ /* ######### END PREPARING EDR OUTPUT ########### */ -+ -+ /* Time for performance */ -+ if (((step % stepout) == 0) || bLastStep) -+ { -+ runtime_upd_proc(runtime); -+ } -+ -+ /* Output stuff */ -+ if (MASTER(cr)) -+ { -+ gmx_bool do_dr, do_or; -+ -+ if (fplog && do_log && bDoExpanded) -+ { -+ /* only needed if doing expanded ensemble */ -+ PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL, -+ &state_global->dfhist, state->fep_state, ir->nstlog, step); -+ } -+ if (!(bStartingFromCpt && (EI_VV(ir->eI)))) -+ { -+ if (bCalcEner) -+ { -+ upd_mdebin(mdebin, bDoDHDL, TRUE, -+ t, mdatoms->tmass, enerd, state, -+ ir->fepvals, ir->expandedvals, lastbox, -+ shake_vir, force_vir, total_vir, pres, -+ ekind, mu_tot, constr); -+ } -+ else -+ { -+ upd_mdebin_step(mdebin); -+ } -+ -+ do_dr = do_per_step(step, ir->nstdisreout); -+ do_or = do_per_step(step, ir->nstorireout); -+ -+ print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? 
fplog : NULL, -+ step, t, -+ eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts)); -+ } -+ if (ir->ePull != epullNO) -+ { -+ pull_print_output(ir->pull, step, t); -+ } -+ -+ if (do_per_step(step, ir->nstlog)) -+ { -+ if (fflush(fplog) != 0) -+ { -+ gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?"); -+ } -+ } -+ } -+ if (bDoExpanded) -+ { -+ /* Have to do this part _after_ outputting the logfile and the edr file */ -+ /* Gets written into the state at the beginning of next loop*/ -+ state->fep_state = lamnew; -+ } -+ -+ /* Remaining runtime */ -+ if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning) -+ { -+ if (shellfc) -+ { -+ fprintf(stderr, "\n"); -+ } -+ print_time(stderr, runtime, step, ir, cr); -+ } -+ -+ /* Replica exchange */ -+ bExchanged = FALSE; -+ if ((repl_ex_nst > 0) && (step > 0) && !bLastStep && -+ do_per_step(step, repl_ex_nst)) -+ { -+ bExchanged = replica_exchange(fplog, cr, repl_ex, -+ state_global, enerd, -+ state, step, t); -+ -+ if (bExchanged && DOMAINDECOMP(cr)) -+ { -+ dd_partition_system(fplog, step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, FALSE); -+ } -+ } -+ -+ bFirstStep = FALSE; -+ bInitStep = FALSE; -+ bStartingFromCpt = FALSE; -+ -+ /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */ -+ /* With all integrators, except VV, we need to retain the pressure -+ * at the current step for coupling at the next step. -+ */ -+ if ((state->flags & (1<nstpcouple > 0 && step % ir->nstpcouple == 0))) -+ { -+ /* Store the pressure in t_state for pressure coupling -+ * at the next MD step. -+ */ -+ copy_mat(pres, state->pres_prev); -+ } -+ -+ /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */ -+ -+ if ( (membed != NULL) && (!bLastStep) ) -+ { -+ rescale_membed(step_rel, membed, state_global->x); -+ } -+ -+ if (bRerunMD) -+ { -+ if (MASTER(cr)) -+ { -+ /* read next frame from input trajectory */ -+ bNotLastFrame = read_next_frame(oenv, status, &rerun_fr); -+ } -+ -+ if (PAR(cr)) -+ { -+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); -+ } -+ } -+ -+ if (!bRerunMD || !rerun_fr.bStep) -+ { -+ /* increase the MD step number */ -+ step++; -+ step_rel++; -+ } -+ -+ cycles = wallcycle_stop(wcycle, ewcSTEP); -+ if (DOMAINDECOMP(cr) && wcycle) -+ { -+ dd_cycles_add(cr->dd, cycles, ddCyclStep); -+ } -+ -+ if (bPMETuneRunning || bPMETuneTry) -+ { -+ /* PME grid + cut-off optimization with GPUs or PME nodes */ -+ -+ /* Count the total cycles over the last steps */ -+ cycles_pmes += cycles; -+ -+ /* We can only switch cut-off at NS steps */ -+ if (step % ir->nstlist == 0) -+ { -+ /* PME grid + cut-off optimization with GPUs or PME nodes */ -+ if (bPMETuneTry) -+ { -+ if (DDMASTER(cr->dd)) -+ { -+ /* PME node load is too high, start tuning */ -+ bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05); -+ } -+ dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning); -+ -+ if (bPMETuneRunning || step_rel > ir->nstlist*50) -+ { -+ bPMETuneTry = FALSE; -+ } -+ } -+ if (bPMETuneRunning) -+ { -+ /* init_step might not be a multiple of nstlist, -+ * but the first cycle is always skipped anyhow. -+ */ -+ bPMETuneRunning = -+ pme_load_balance(pme_loadbal, cr, -+ (bVerbose && MASTER(cr)) ? 
stderr : NULL, -+ fplog, -+ ir, state, cycles_pmes, -+ fr->ic, fr->nbv, &fr->pmedata, -+ step); -+ -+ /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ -+ fr->ewaldcoeff = fr->ic->ewaldcoeff; -+ fr->rlist = fr->ic->rlist; -+ fr->rlistlong = fr->ic->rlistlong; -+ fr->rcoulomb = fr->ic->rcoulomb; -+ fr->rvdw = fr->ic->rvdw; -+ } -+ cycles_pmes = 0; -+ } -+ } -+ -+ if (step_rel == wcycle_get_reset_counters(wcycle) || -+ gs.set[eglsRESETCOUNTERS] != 0) -+ { -+ /* Reset all the counters related to performance over the run */ -+ reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime, -+ fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL); -+ wcycle_set_reset_counters(wcycle, -1); -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell our PME node to reset its counters */ -+ gmx_pme_send_resetcounters(cr, step); -+ } -+ /* Correct max_hours for the elapsed time */ -+ max_hours -= run_time/(60.0*60.0); -+ bResetCountersHalfMaxH = FALSE; -+ gs.set[eglsRESETCOUNTERS] = 0; -+ } -+ -+ } -+ /* End of main MD loop */ -+ debug_gmx(); -+ -+ /* Stop the time */ -+ runtime_end(runtime); -+ -+ if (bRerunMD && MASTER(cr)) -+ { -+ close_trj(status); -+ } -+ -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell the PME only node to finish */ -+ gmx_pme_send_finish(cr); -+ } -+ -+ if (MASTER(cr)) -+ { -+ if (ir->nstcalcenergy > 0 && !bRerunMD) -+ { -+ print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t, -+ eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts)); -+ } -+ } -+ -+ done_mdoutf(outf); -+ -+ debug_gmx(); -+ -+ if (ir->nstlist == -1 && nlh.nns > 0 && fplog) -+ { -+ fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns))); -+ fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns); -+ } -+ -+ if (pme_loadbal != NULL) -+ { -+ pme_loadbal_done(pme_loadbal, cr, fplog, -+ fr->nbv != NULL && fr->nbv->bUseGPU); -+ } -+ -+ if (shellfc && fplog) -+ { -+ fprintf(fplog, "Fraction of iterations that converged: %.2f %%\n", -+ (nconverged*100.0)/step_rel); -+ fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n", -+ tcount/step_rel); -+ } -+ -+ if (repl_ex_nst > 0 && MASTER(cr)) -+ { -+ print_replica_exchange_statistics(fplog, repl_ex); -+ } -+ -+ runtime->nsteps_done = step_rel; -+ -+ return 0; -+} -diff --git a/src/kernel/mdrun.c b/src/kernel/mdrun.c -index eb30fc9..ca3b657 100644 ---- a/src/kernel/mdrun.c -+++ b/src/kernel/mdrun.c -@@ -58,6 +58,12 @@ - /* afm stuf */ - #include "pull.h" - -+/* PLUMED */ -+#include "../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; extern void(*plumedcmd)(plumed,const char*,const void*); -+/* END PLUMED */ -+ - int cmain(int argc, char *argv[]) - { - const char *desc[] = { -@@ -415,6 +421,7 @@ int cmain(int argc, char *argv[]) - { efMTX, "-mtx", "nm", ffOPTWR }, - { efNDX, "-dn", "dipole", ffOPTWR }, - { efRND, "-multidir", NULL, ffOPTRDMULT}, -+ { efDAT, "-plumed", "plumed", ffOPTRD }, /* PLUMED */ - { efDAT, "-membed", "membed", ffOPTRD }, - { efTOP, "-mp", "membed", ffOPTRD }, - { efNDX, "-mn", "membed", ffOPTRD } -@@ -752,6 +759,31 @@ int cmain(int argc, char *argv[]) - ddxyz[XX] = (int)(realddxyz[XX] + 0.5); - ddxyz[YY] = (int)(realddxyz[YY] + 0.5); - ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); -+ /* PLUMED */ -+ plumedswitch=0; -+ if (opt2bSet("-plumed",NFILE,fnm)) plumedswitch=1; -+ if(plumedswitch){ plumedcmd=plumed_cmd; -+ int plumed_is_there=0; -+ int 
real_precision=sizeof(real); -+ real energyUnits=1.0; -+ real lengthUnits=1.0; -+ real timeUnits=1.0; -+ -+ -+ if(!plumed_installed()){ -+ gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable."); -+ } -+ plumedmain=plumed_create(); -+ plumed_cmd(plumedmain,"setRealPrecision",&real_precision); -+ // this is not necessary for gromacs units: -+ plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits); -+ plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits); -+ plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits); -+ // -+ plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,NFILE,fnm)); -+ plumedswitch=1; -+ } -+ /* END PLUMED */ - - rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact, - nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr, -@@ -761,6 +793,12 @@ int cmain(int argc, char *argv[]) - nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed, - pforce, cpt_period, max_hours, deviceOptions, Flags); - -+ /* PLUMED */ -+ if(plumedswitch){ -+ plumed_finalize(plumedmain); -+ } -+ /* END PLUMED */ -+ - gmx_finalize_par(); - - if (MULTIMASTER(cr)) -diff --git a/src/kernel/mdrun.c.preplumed b/src/kernel/mdrun.c.preplumed -new file mode 100644 -index 0000000..eb30fc9 ---- /dev/null -+++ b/src/kernel/mdrun.c.preplumed -@@ -0,0 +1,779 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team, -+ * check out http://www.gromacs.org for more information. -+ * Copyright (c) 2012,2013, by the GROMACS development team, led by -+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many -+ * others, as listed in the AUTHORS file in the top-level source -+ * directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. 
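For reference, the PLUMED handshake that the patched cmain() performs above reduces to the following standalone sketch. It is illustrative only: the bare main(), the sizeof(double) precision and the "plumed.dat" name are placeholders, while plumed_installed(), plumed_create(), plumed_cmd() and plumed_finalize() are the same Plumed.h entry points the patch calls.

    #include <stdio.h>
    #include "Plumed.h"   /* PLUMED C interface, as included by the patch */

    int main(void)
    {
        plumed p;
        int    real_precision = (int) sizeof(double);  /* the patch passes sizeof(real) */
        double energyUnits = 1.0, lengthUnits = 1.0, timeUnits = 1.0;

        if (!plumed_installed())   /* requires a reachable PLUMED_KERNEL */
        {
            fprintf(stderr, "PLUMED is not available; check PLUMED_KERNEL.\n");
            return 1;
        }
        p = plumed_create();
        plumed_cmd(p, "setRealPrecision", &real_precision);
        /* unit factors of 1.0 are no-ops for GROMACS units, as the patch notes */
        plumed_cmd(p, "setMDEnergyUnits", &energyUnits);
        plumed_cmd(p, "setMDLengthUnits", &lengthUnits);
        plumed_cmd(p, "setMDTimeUnits",   &timeUnits);
        plumed_cmd(p, "setPlumedDat",     "plumed.dat"); /* stands in for ftp2fn(efDAT,...) */
        /* a real engine would continue with setNatoms/setTimestep, "init"
         * and per-step "calc" calls before tearing the object down */
        plumed_finalize(p);   /* mirrors the call added after mdrunner() */
        return 0;
    }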
-+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include "typedefs.h" -+#include "macros.h" -+#include "copyrite.h" -+#include "main.h" -+#include "statutil.h" -+#include "smalloc.h" -+#include "futil.h" -+#include "smalloc.h" -+#include "edsam.h" -+#include "mdrun.h" -+#include "xmdrun.h" -+#include "checkpoint.h" -+#ifdef GMX_THREAD_MPI -+#include "thread_mpi.h" -+#endif -+ -+/* afm stuf */ -+#include "pull.h" -+ -+int cmain(int argc, char *argv[]) -+{ -+ const char *desc[] = { -+ "The [TT]mdrun[tt] program is the main computational chemistry engine", -+ "within GROMACS. Obviously, it performs Molecular Dynamics simulations,", -+ "but it can also perform Stochastic Dynamics, Energy Minimization,", -+ "test particle insertion or (re)calculation of energies.", -+ "Normal mode analysis is another option. In this case [TT]mdrun[tt]", -+ "builds a Hessian matrix from single conformation.", -+ "For usual Normal Modes-like calculations, make sure that", -+ "the structure provided is properly energy-minimized.", -+ "The generated matrix can be diagonalized by [TT]g_nmeig[tt].[PAR]", -+ "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])", -+ "and distributes the topology over nodes if needed.", -+ "[TT]mdrun[tt] produces at least four output files.", -+ "A single log file ([TT]-g[tt]) is written, unless the option", -+ "[TT]-seppot[tt] is used, in which case each node writes a log file.", -+ "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and", -+ "optionally forces.", -+ "The structure file ([TT]-c[tt]) contains the coordinates and", -+ "velocities of the last step.", -+ "The energy file ([TT]-e[tt]) contains energies, the temperature,", -+ "pressure, etc, a lot of these things are also printed in the log file.", -+ "Optionally coordinates can be written to a compressed trajectory file", -+ "([TT]-x[tt]).[PAR]", -+ "The option [TT]-dhdl[tt] is only used when free energy calculation is", -+ "turned on.[PAR]", -+ "A simulation can be run in parallel using two different parallelization", -+ "schemes: MPI parallelization and/or OpenMP thread parallelization.", -+ "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is", -+ "compiled with a normal MPI library or threads when [TT]mdrun[tt] is", -+ "compiled with the GROMACS built-in thread-MPI library. OpenMP threads", -+ "are supported when mdrun is compiled with OpenMP. Full OpenMP support", -+ "is only available with the Verlet cut-off scheme, with the (older)", -+ "group scheme only PME-only processes can use OpenMP parallelization.", -+ "In all cases [TT]mdrun[tt] will by default try to use all the available", -+ "hardware resources. With a normal MPI library only the options", -+ "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],", -+ "for PME-only processes, can be used to control the number of threads.", -+ "With thread-MPI there are additional options [TT]-nt[tt], which sets", -+ "the total number of threads, and [TT]-ntmpi[tt], which sets the number", -+ "of thread-MPI threads.", -+ "The number of OpenMP threads used by [TT]mdrun[tt] can also be set with", -+ "the standard environment variable, [TT]OMP_NUM_THREADS[tt].", -+ "The [TT]GMX_PME_NUM_THREADS[tt] environment variable can be used to specify", -+ "the number of threads used by the PME-only processes.[PAR]", -+ "Note that combined MPI+OpenMP parallelization is in many cases", -+ "slower than either on its own. 
However, at high parallelization, using the", -+ "combination is often beneficial as it reduces the number of domains and/or", -+ "the number of MPI ranks. (Less and larger domains can improve scaling,", -+ "with separate PME processes fewer MPI ranks reduces communication cost.)", -+ "OpenMP-only parallelization is typically faster than MPI-only parallelization", -+ "on a single CPU(-die). Since we currently don't have proper hardware", -+ "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only", -+ "automatically use OpenMP-only parallelization when you use up to 4", -+ "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16", -+ "threads with Intel Sandy Bridge or newer CPUs. Otherwise MPI-only", -+ "parallelization is used (except with GPUs, see below).", -+ "[PAR]", -+ "To quickly test the performance of the new Verlet cut-off scheme", -+ "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use", -+ "the [TT]-testverlet[tt] option. This should not be used for production,", -+ "since it can slightly modify potentials and it will remove charge groups", -+ "making analysis difficult, as the [TT].tpr[tt] file will still contain", -+ "charge groups. For production simulations it is highly recommended", -+ "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.", -+ "[PAR]", -+ "With GPUs (only supported with the Verlet cut-off scheme), the number", -+ "of GPUs should match the number of MPI processes or MPI threads,", -+ "excluding PME-only processes/threads. With thread-MPI, unless set on the command line, the number", -+ "of MPI threads will automatically be set to the number of GPUs detected.", -+ "To use a subset of the available GPUs, or to manually provide a mapping of", -+ "GPUs to PP ranks, you can use the [TT]-gpu_id[tt] option. The argument of [TT]-gpu_id[tt] is", -+ "a string of digits (without delimiter) representing device id-s of the GPUs to be used.", -+ "For example, \"[TT]02[tt]\" specifies using GPUs 0 and 2 in the first and second PP ranks per compute node", -+ "respectively. To select different sets of GPU-s", -+ "on different nodes of a compute cluster, use the [TT]GMX_GPU_ID[tt] environment", -+ "variable instead. The format for [TT]GMX_GPU_ID[tt] is identical to ", -+ "[TT]-gpu_id[tt], with the difference that an environment variable can have", -+ "different values on different compute nodes. Multiple MPI ranks on each node", -+ "can share GPUs. This is accomplished by specifying the id(s) of the GPU(s)", -+ "multiple times, e.g. \"[TT]0011[tt]\" for four ranks sharing two GPUs in this node.", -+ "This works within a single simulation, or a multi-simulation, with any form of MPI.", -+ "[PAR]", -+ "When using PME with separate PME nodes or with a GPU, the two major", -+ "compute tasks, the non-bonded force calculation and the PME calculation", -+ "run on different compute resources. If this load is not balanced,", -+ "some of the resources will be idle part of time. With the Verlet", -+ "cut-off scheme this load is automatically balanced when the PME load", -+ "is too high (but not when it is too low). This is done by scaling", -+ "the Coulomb cut-off and PME grid spacing by the same amount. In the first", -+ "few hundred steps different settings are tried and the fastest is chosen", -+ "for the rest of the simulation. This does not affect the accuracy of", -+ "the results, but it does affect the decomposition of the Coulomb energy", -+ "into particle and mesh contributions. 
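The -gpu_id mapping described above is a plain one-digit-per-PP-rank string; the short sketch below (not GROMACS code, just the stated rule, with a made-up "0011" value) shows how such a string translates into a per-rank device assignment.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *gpu_id = "0011";               /* hypothetical -gpu_id value */
        int         nranks = (int) strlen(gpu_id); /* one digit per PP rank on the node */

        for (int rank = 0; rank < nranks; rank++)
        {
            printf("PP rank %d -> GPU %c\n", rank, gpu_id[rank]);
        }
        return 0;   /* "0011": four ranks on the node share GPUs 0 and 1 */
    }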
The auto-tuning can be turned off", -+ "with the option [TT]-notunepme[tt].", -+ "[PAR]", -+ "[TT]mdrun[tt] pins (sets affinity of) threads to specific cores,", -+ "when all (logical) cores on a compute node are used by [TT]mdrun[tt],", -+ "even when no multi-threading is used,", -+ "as this usually results in significantly better performance.", -+ "If the queuing systems or the OpenMP library pinned threads, we honor", -+ "this and don't pin again, even though the layout may be sub-optimal.", -+ "If you want to have [TT]mdrun[tt] override an already set thread affinity", -+ "or pin threads when using less cores, use [TT]-pin on[tt].", -+ "With SMT (simultaneous multithreading), e.g. Intel Hyper-Threading,", -+ "there are multiple logical cores per physical core.", -+ "The option [TT]-pinstride[tt] sets the stride in logical cores for", -+ "pinning consecutive threads. Without SMT, 1 is usually the best choice.", -+ "With Intel Hyper-Threading 2 is best when using half or less of the", -+ "logical cores, 1 otherwise. The default value of 0 do exactly that:", -+ "it minimizes the threads per logical core, to optimize performance.", -+ "If you want to run multiple mdrun jobs on the same physical node," -+ "you should set [TT]-pinstride[tt] to 1 when using all logical cores.", -+ "When running multiple mdrun (or other) simulations on the same physical", -+ "node, some simulations need to start pinning from a non-zero core", -+ "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify", -+ "the offset in logical cores for pinning.", -+ "[PAR]", -+ "When [TT]mdrun[tt] is started using MPI with more than 1 process", -+ "or with thread-MPI with more than 1 thread, MPI parallelization is used.", -+ "By default domain decomposition is used, unless the [TT]-pd[tt]", -+ "option is set, which selects particle decomposition.", -+ "[PAR]", -+ "With domain decomposition, the spatial decomposition can be set", -+ "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.", -+ "The user only needs to change this when the system is very inhomogeneous.", -+ "Dynamic load balancing is set with the option [TT]-dlb[tt],", -+ "which can give a significant performance improvement,", -+ "especially for inhomogeneous systems. The only disadvantage of", -+ "dynamic load balancing is that runs are no longer binary reproducible,", -+ "but in most cases this is not important.", -+ "By default the dynamic load balancing is automatically turned on", -+ "when the measured performance loss due to load imbalance is 5% or more.", -+ "At low parallelization these are the only important options", -+ "for domain decomposition.", -+ "At high parallelization the options in the next two sections", -+ "could be important for increasing the performace.", -+ "[PAR]", -+ "When PME is used with domain decomposition, separate nodes can", -+ "be assigned to do only the PME mesh calculation;", -+ "this is computationally more efficient starting at about 12 nodes", -+ "or even fewer when OpenMP parallelization is used.", -+ "The number of PME nodes is set with option [TT]-npme[tt],", -+ "this can not be more than half of the nodes.", -+ "By default [TT]mdrun[tt] makes a guess for the number of PME", -+ "nodes when the number of nodes is larger than 16. 
With GPUs,", -+ "PME nodes are not selected automatically, since the optimal setup", -+ "depends very much on the details of the hardware.", -+ "In all cases you might gain performance by optimizing [TT]-npme[tt].", -+ "Performance statistics on this issue", -+ "are written at the end of the log file.", -+ "For good load balancing at high parallelization, the PME grid x and y", -+ "dimensions should be divisible by the number of PME nodes", -+ "(the simulation will run correctly also when this is not the case).", -+ "[PAR]", -+ "This section lists all options that affect the domain decomposition.", -+ "[PAR]", -+ "Option [TT]-rdd[tt] can be used to set the required maximum distance", -+ "for inter charge-group bonded interactions.", -+ "Communication for two-body bonded interactions below the non-bonded", -+ "cut-off distance always comes for free with the non-bonded communication.", -+ "Atoms beyond the non-bonded cut-off are only communicated when they have", -+ "missing bonded interactions; this means that the extra cost is minor", -+ "and nearly indepedent of the value of [TT]-rdd[tt].", -+ "With dynamic load balancing option [TT]-rdd[tt] also sets", -+ "the lower limit for the domain decomposition cell sizes.", -+ "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on", -+ "the initial coordinates. The chosen value will be a balance", -+ "between interaction range and communication cost.", -+ "[PAR]", -+ "When inter charge-group bonded interactions are beyond", -+ "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.", -+ "For pair interactions and tabulated bonds", -+ "that do not generate exclusions, this check can be turned off", -+ "with the option [TT]-noddcheck[tt].", -+ "[PAR]", -+ "When constraints are present, option [TT]-rcon[tt] influences", -+ "the cell size limit as well.", -+ "Atoms connected by NC constraints, where NC is the LINCS order plus 1,", -+ "should not be beyond the smallest cell size. A error message is", -+ "generated when this happens and the user should change the decomposition", -+ "or decrease the LINCS order and increase the number of LINCS iterations.", -+ "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS", -+ "in a conservative fashion. For high parallelization it can be useful", -+ "to set the distance required for P-LINCS with the option [TT]-rcon[tt].", -+ "[PAR]", -+ "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling", -+ "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that", -+ "the cells can scale down by at least this factor. This option is used", -+ "for the automated spatial decomposition (when not using [TT]-dd[tt])", -+ "as well as for determining the number of grid pulses, which in turn", -+ "sets the minimum allowed cell size. 
Under certain circumstances", -+ "the value of [TT]-dds[tt] might need to be adjusted to account for", -+ "high or low spatial inhomogeneity of the system.", -+ "[PAR]", -+ "The option [TT]-gcom[tt] can be used to only do global communication", -+ "every n steps.", -+ "This can improve performance for highly parallel simulations", -+ "where this global communication step becomes the bottleneck.", -+ "For a global thermostat and/or barostat the temperature", -+ "and/or pressure will also only be updated every [TT]-gcom[tt] steps.", -+ "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]", -+ "With [TT]-rerun[tt] an input trajectory can be given for which ", -+ "forces and energies will be (re)calculated. Neighbor searching will be", -+ "performed for every frame, unless [TT]nstlist[tt] is zero", -+ "(see the [TT].mdp[tt] file).[PAR]", -+ "ED (essential dynamics) sampling and/or additional flooding potentials", -+ "are switched on by using the [TT]-ei[tt] flag followed by an [TT].edi[tt]", -+ "file. The [TT].edi[tt] file can be produced with the [TT]make_edi[tt] tool", -+ "or by using options in the essdyn menu of the WHAT IF program.", -+ "[TT]mdrun[tt] produces a [TT].xvg[tt] output file that", -+ "contains projections of positions, velocities and forces onto selected", -+ "eigenvectors.[PAR]", -+ "When user-defined potential functions have been selected in the", -+ "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]", -+ "a formatted table with potential functions. The file is read from", -+ "either the current directory or from the [TT]GMXLIB[tt] directory.", -+ "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,", -+ "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with", -+ "normal Coulomb.", -+ "When pair interactions are present, a separate table for pair interaction", -+ "functions is read using the [TT]-tablep[tt] option.[PAR]", -+ "When tabulated bonded functions are present in the topology,", -+ "interaction functions are read using the [TT]-tableb[tt] option.", -+ "For each different tabulated interaction type the table file name is", -+ "modified in a different way: before the file extension an underscore is", -+ "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals", -+ "and finally the table number of the interaction type.[PAR]", -+ "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM", -+ "coordinates and forces when pulling is selected", -+ "in the [TT].mdp[tt] file.[PAR]", -+ "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ", -+ "simulated in parallel.", -+ "As many input files/directories are required as the number of systems. ", -+ "The [TT]-multidir[tt] option takes a list of directories (one for each ", -+ "system) and runs in each of them, using the input/output file names, ", -+ "such as specified by e.g. the [TT]-s[tt] option, relative to these ", -+ "directories.", -+ "With [TT]-multi[tt], the system number is appended to the run input ", -+ "and each output filename, for instance [TT]topol.tpr[tt] becomes", -+ "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.", -+ "The number of nodes per system is the total number of nodes", -+ "divided by the number of systems.", -+ "One use of this option is for NMR refinement: when distance", -+ "or orientation restraints are present these can be ensemble averaged", -+ "over all the systems.[PAR]", -+ "With [TT]-replex[tt] replica exchange is attempted every given number", -+ "of steps. 
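The table-file naming rule for -tableb quoted above (an underscore, then 'b'/'a'/'d', then the table number, inserted before the extension) can be illustrated in a few lines of C; the base name and table number below are arbitrary examples, not values used by mdrun.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *base = "table.xvg";            /* example base name */
        const char *dot  = strrchr(base, '.');     /* position of the extension */
        char        name[256];

        /* dihedral ('d') table number 3 -> table_d3.xvg */
        snprintf(name, sizeof(name), "%.*s_%c%d%s",
                 (int) (dot - base), base, 'd', 3, dot);
        printf("%s\n", name);
        return 0;
    }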
The number of replicas is set with the [TT]-multi[tt] or ", -+ "[TT]-multidir[tt] option, described above.", -+ "All run input files should use a different coupling temperature,", -+ "the order of the files is not important. The random seed is set with", -+ "[TT]-reseed[tt]. The velocities are scaled and neighbor searching", -+ "is performed after every exchange.[PAR]", -+ "Finally some experimental algorithms can be tested when the", -+ "appropriate options have been given. Currently under", -+ "investigation are: polarizability and X-ray bombardments.", -+ "[PAR]", -+ "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed", -+ "a protein into a membrane. The data file should contain the options", -+ "that where passed to g_membed before. The [TT]-mn[tt] and [TT]-mp[tt]", -+ "both apply to this as well.", -+ "[PAR]", -+ "The option [TT]-pforce[tt] is useful when you suspect a simulation", -+ "crashes due to too large forces. With this option coordinates and", -+ "forces of atoms with a force larger than a certain value will", -+ "be printed to stderr.", -+ "[PAR]", -+ "Checkpoints containing the complete state of the system are written", -+ "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],", -+ "unless option [TT]-cpt[tt] is set to -1.", -+ "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to", -+ "make sure that a recent state of the system is always available,", -+ "even when the simulation is terminated while writing a checkpoint.", -+ "With [TT]-cpnum[tt] all checkpoint files are kept and appended", -+ "with the step number.", -+ "A simulation can be continued by reading the full state from file", -+ "with option [TT]-cpi[tt]. This option is intelligent in the way that", -+ "if no checkpoint file is found, Gromacs just assumes a normal run and", -+ "starts from the first step of the [TT].tpr[tt] file. By default the output", -+ "will be appending to the existing output files. The checkpoint file", -+ "contains checksums of all output files, such that you will never", -+ "loose data when some output files are modified, corrupt or removed.", -+ "There are three scenarios with [TT]-cpi[tt]:[PAR]", -+ "[TT]*[tt] no files with matching names are present: new output files are written[PAR]", -+ "[TT]*[tt] all files are present with names and checksums matching those stored", -+ "in the checkpoint file: files are appended[PAR]", -+ "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]", -+ "With [TT]-noappend[tt] new output files are opened and the simulation", -+ "part number is added to all output file names.", -+ "Note that in all cases the checkpoint file itself is not renamed", -+ "and will be overwritten, unless its name does not match", -+ "the [TT]-cpo[tt] option.", -+ "[PAR]", -+ "With checkpointing the output is appended to previously written", -+ "output files, unless [TT]-noappend[tt] is used or none of the previous", -+ "output files are present (except for the checkpoint file).", -+ "The integrity of the files to be appended is verified using checksums", -+ "which are stored in the checkpoint file. This ensures that output can", -+ "not be mixed up or corrupted due to file appending. 
When only some", -+ "of the previous output files are present, a fatal error is generated", -+ "and no old output files are modified and no new output files are opened.", -+ "The result with appending will be the same as from a single run.", -+ "The contents will be binary identical, unless you use a different number", -+ "of nodes or dynamic load balancing or the FFT library uses optimizations", -+ "through timing.", -+ "[PAR]", -+ "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint", -+ "file is written at the first neighbor search step where the run time", -+ "exceeds [TT]-maxh[tt]*0.99 hours.", -+ "[PAR]", -+ "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current", -+ "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. when ctrl+C is", -+ "pressed), it will stop after the next neighbor search step ", -+ "(with nstlist=0 at the next step).", -+ "In both cases all the usual output will be written to file.", -+ "When running with MPI, a signal to one of the [TT]mdrun[tt] processes", -+ "is sufficient, this signal should not be sent to mpirun or", -+ "the [TT]mdrun[tt] process that is the parent of the others.", -+ "[PAR]", -+ "When [TT]mdrun[tt] is started with MPI, it does not run niced by default." -+ }; -+ t_commrec *cr; -+ t_filenm fnm[] = { -+ { efTPX, NULL, NULL, ffREAD }, -+ { efTRN, "-o", NULL, ffWRITE }, -+ { efXTC, "-x", NULL, ffOPTWR }, -+ { efCPT, "-cpi", NULL, ffOPTRD }, -+ { efCPT, "-cpo", NULL, ffOPTWR }, -+ { efSTO, "-c", "confout", ffWRITE }, -+ { efEDR, "-e", "ener", ffWRITE }, -+ { efLOG, "-g", "md", ffWRITE }, -+ { efXVG, "-dhdl", "dhdl", ffOPTWR }, -+ { efXVG, "-field", "field", ffOPTWR }, -+ { efXVG, "-table", "table", ffOPTRD }, -+ { efXVG, "-tabletf", "tabletf", ffOPTRD }, -+ { efXVG, "-tablep", "tablep", ffOPTRD }, -+ { efXVG, "-tableb", "table", ffOPTRD }, -+ { efTRX, "-rerun", "rerun", ffOPTRD }, -+ { efXVG, "-tpi", "tpi", ffOPTWR }, -+ { efXVG, "-tpid", "tpidist", ffOPTWR }, -+ { efEDI, "-ei", "sam", ffOPTRD }, -+ { efXVG, "-eo", "edsam", ffOPTWR }, -+ { efGCT, "-j", "wham", ffOPTRD }, -+ { efGCT, "-jo", "bam", ffOPTWR }, -+ { efXVG, "-ffout", "gct", ffOPTWR }, -+ { efXVG, "-devout", "deviatie", ffOPTWR }, -+ { efXVG, "-runav", "runaver", ffOPTWR }, -+ { efXVG, "-px", "pullx", ffOPTWR }, -+ { efXVG, "-pf", "pullf", ffOPTWR }, -+ { efXVG, "-ro", "rotation", ffOPTWR }, -+ { efLOG, "-ra", "rotangles", ffOPTWR }, -+ { efLOG, "-rs", "rotslabs", ffOPTWR }, -+ { efLOG, "-rt", "rottorque", ffOPTWR }, -+ { efMTX, "-mtx", "nm", ffOPTWR }, -+ { efNDX, "-dn", "dipole", ffOPTWR }, -+ { efRND, "-multidir", NULL, ffOPTRDMULT}, -+ { efDAT, "-membed", "membed", ffOPTRD }, -+ { efTOP, "-mp", "membed", ffOPTRD }, -+ { efNDX, "-mn", "membed", ffOPTRD } -+ }; -+#define NFILE asize(fnm) -+ -+ /* Command line options ! 
*/ -+ gmx_bool bCart = FALSE; -+ gmx_bool bPPPME = FALSE; -+ gmx_bool bPartDec = FALSE; -+ gmx_bool bDDBondCheck = TRUE; -+ gmx_bool bDDBondComm = TRUE; -+ gmx_bool bTunePME = TRUE; -+ gmx_bool bTestVerlet = FALSE; -+ gmx_bool bVerbose = FALSE; -+ gmx_bool bCompact = TRUE; -+ gmx_bool bSepPot = FALSE; -+ gmx_bool bRerunVSite = FALSE; -+ gmx_bool bIonize = FALSE; -+ gmx_bool bConfout = TRUE; -+ gmx_bool bReproducible = FALSE; -+ -+ int npme = -1; -+ int nmultisim = 0; -+ int nstglobalcomm = -1; -+ int repl_ex_nst = 0; -+ int repl_ex_seed = -1; -+ int repl_ex_nex = 0; -+ int nstepout = 100; -+ int resetstep = -1; -+ gmx_large_int_t nsteps = -2; /* the value -2 means that the mdp option will be used */ -+ -+ rvec realddxyz = {0, 0, 0}; -+ const char *ddno_opt[ddnoNR+1] = -+ { NULL, "interleave", "pp_pme", "cartesian", NULL }; -+ const char *dddlb_opt[] = -+ { NULL, "auto", "no", "yes", NULL }; -+ const char *thread_aff_opt[threadaffNR+1] = -+ { NULL, "auto", "on", "off", NULL }; -+ const char *nbpu_opt[] = -+ { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL }; -+ real rdd = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1; -+ char *ddcsx = NULL, *ddcsy = NULL, *ddcsz = NULL; -+ real cpt_period = 15.0, max_hours = -1; -+ gmx_bool bAppendFiles = TRUE; -+ gmx_bool bKeepAndNumCPT = FALSE; -+ gmx_bool bResetCountersHalfWay = FALSE; -+ output_env_t oenv = NULL; -+ const char *deviceOptions = ""; -+ -+ /* Non transparent initialization of a complex gmx_hw_opt_t struct. -+ * But unfortunately we are not allowed to call a function here, -+ * since declarations follow below. -+ */ -+ gmx_hw_opt_t hw_opt = { 0, 0, 0, 0, threadaffSEL, 0, 0, -+ { NULL, FALSE, 0, NULL } }; -+ -+ t_pargs pa[] = { -+ -+ { "-pd", FALSE, etBOOL, {&bPartDec}, -+ "Use particle decompostion" }, -+ { "-dd", FALSE, etRVEC, {&realddxyz}, -+ "Domain decomposition grid, 0 is optimize" }, -+ { "-ddorder", FALSE, etENUM, {ddno_opt}, -+ "DD node order" }, -+ { "-npme", FALSE, etINT, {&npme}, -+ "Number of separate nodes to be used for PME, -1 is guess" }, -+ { "-nt", FALSE, etINT, {&hw_opt.nthreads_tot}, -+ "Total number of threads to start (0 is guess)" }, -+ { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, -+ "Number of thread-MPI threads to start (0 is guess)" }, -+ { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, -+ "Number of OpenMP threads per MPI process/thread to start (0 is guess)" }, -+ { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, -+ "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" }, -+ { "-pin", FALSE, etENUM, {thread_aff_opt}, -+ "Fix threads (or processes) to specific cores" }, -+ { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset}, -+ "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" }, -+ { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, -+ "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, -+ { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id}, -+ "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" }, -+ { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck}, -+ "Check for all bonded interactions with DD" }, -+ { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm}, -+ "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" }, -+ { "-rdd", FALSE, etREAL, {&rdd}, -+ "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" }, -+ { "-rcon", FALSE, etREAL, 
{&rconstr}, -+ "Maximum distance for P-LINCS (nm), 0 is estimate" }, -+ { "-dlb", FALSE, etENUM, {dddlb_opt}, -+ "Dynamic load balancing (with DD)" }, -+ { "-dds", FALSE, etREAL, {&dlb_scale}, -+ "Minimum allowed dlb scaling of the DD cell size" }, -+ { "-ddcsx", FALSE, etSTR, {&ddcsx}, -+ "HIDDENThe DD cell sizes in x" }, -+ { "-ddcsy", FALSE, etSTR, {&ddcsy}, -+ "HIDDENThe DD cell sizes in y" }, -+ { "-ddcsz", FALSE, etSTR, {&ddcsz}, -+ "HIDDENThe DD cell sizes in z" }, -+ { "-gcom", FALSE, etINT, {&nstglobalcomm}, -+ "Global communication frequency" }, -+ { "-nb", FALSE, etENUM, {&nbpu_opt}, -+ "Calculate non-bonded interactions on" }, -+ { "-tunepme", FALSE, etBOOL, {&bTunePME}, -+ "Optimize PME load between PP/PME nodes or GPU/CPU" }, -+ { "-testverlet", FALSE, etBOOL, {&bTestVerlet}, -+ "Test the Verlet non-bonded scheme" }, -+ { "-v", FALSE, etBOOL, {&bVerbose}, -+ "Be loud and noisy" }, -+ { "-compact", FALSE, etBOOL, {&bCompact}, -+ "Write a compact log file" }, -+ { "-seppot", FALSE, etBOOL, {&bSepPot}, -+ "Write separate V and dVdl terms for each interaction type and node to the log file(s)" }, -+ { "-pforce", FALSE, etREAL, {&pforce}, -+ "Print all forces larger than this (kJ/mol nm)" }, -+ { "-reprod", FALSE, etBOOL, {&bReproducible}, -+ "Try to avoid optimizations that affect binary reproducibility" }, -+ { "-cpt", FALSE, etREAL, {&cpt_period}, -+ "Checkpoint interval (minutes)" }, -+ { "-cpnum", FALSE, etBOOL, {&bKeepAndNumCPT}, -+ "Keep and number checkpoint files" }, -+ { "-append", FALSE, etBOOL, {&bAppendFiles}, -+ "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" }, -+ { "-nsteps", FALSE, etGMX_LARGE_INT, {&nsteps}, -+ "Run this number of steps, overrides .mdp file option" }, -+ { "-maxh", FALSE, etREAL, {&max_hours}, -+ "Terminate after 0.99 times this time (hours)" }, -+ { "-multi", FALSE, etINT, {&nmultisim}, -+ "Do multiple simulations in parallel" }, -+ { "-replex", FALSE, etINT, {&repl_ex_nst}, -+ "Attempt replica exchange periodically with this period (steps)" }, -+ { "-nex", FALSE, etINT, {&repl_ex_nex}, -+ "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). -nex zero or not specified gives neighbor replica exchange." 
}, -+ { "-reseed", FALSE, etINT, {&repl_ex_seed}, -+ "Seed for replica exchange, -1 is generate a seed" }, -+ { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite}, -+ "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" }, -+ { "-ionize", FALSE, etBOOL, {&bIonize}, -+ "Do a simulation including the effect of an X-Ray bombardment on your system" }, -+ { "-confout", FALSE, etBOOL, {&bConfout}, -+ "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" }, -+ { "-stepout", FALSE, etINT, {&nstepout}, -+ "HIDDENFrequency of writing the remaining runtime" }, -+ { "-resetstep", FALSE, etINT, {&resetstep}, -+ "HIDDENReset cycle counters after these many time steps" }, -+ { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay}, -+ "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" } -+ }; -+ gmx_edsam_t ed; -+ unsigned long Flags, PCA_Flags; -+ ivec ddxyz; -+ int dd_node_order; -+ gmx_bool bAddPart; -+ FILE *fplog, *fpmulti; -+ int sim_part, sim_part_fn; -+ const char *part_suffix = ".part"; -+ char suffix[STRLEN]; -+ int rc; -+ char **multidir = NULL; -+ -+ -+ cr = init_par(&argc, &argv); -+ -+ if (MASTER(cr)) -+ { -+ CopyRight(stderr, argv[0]); -+ } -+ -+ PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET)); -+ -+ /* Comment this in to do fexist calls only on master -+ * works not with rerun or tables at the moment -+ * also comment out the version of init_forcerec in md.c -+ * with NULL instead of opt2fn -+ */ -+ /* -+ if (!MASTER(cr)) -+ { -+ PCA_Flags |= PCA_NOT_READ_NODE; -+ } -+ */ -+ -+ parse_common_args(&argc, argv, PCA_Flags, NFILE, fnm, asize(pa), pa, -+ asize(desc), desc, 0, NULL, &oenv); -+ -+ -+ /* we set these early because they might be used in init_multisystem() -+ Note that there is the potential for npme>nnodes until the number of -+ threads is set later on, if there's thread parallelization. That shouldn't -+ lead to problems. 
*/ -+ dd_node_order = nenum(ddno_opt); -+ cr->npmenodes = npme; -+ -+ hw_opt.thread_affinity = nenum(thread_aff_opt); -+ -+ /* now check the -multi and -multidir option */ -+ if (opt2bSet("-multidir", NFILE, fnm)) -+ { -+ int i; -+ if (nmultisim > 0) -+ { -+ gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive."); -+ } -+ nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm); -+ } -+ -+ -+ if (repl_ex_nst != 0 && nmultisim < 2) -+ { -+ gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multi)"); -+ } -+ -+ if (repl_ex_nex < 0) -+ { -+ gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive"); -+ } -+ -+ if (nmultisim > 1) -+ { -+#ifndef GMX_THREAD_MPI -+ gmx_bool bParFn = (multidir == NULL); -+ init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn); -+#else -+ gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library.Please compile GROMACS with MPI support"); -+#endif -+ } -+ -+ bAddPart = !bAppendFiles; -+ -+ /* Check if there is ANY checkpoint file available */ -+ sim_part = 1; -+ sim_part_fn = sim_part; -+ if (opt2bSet("-cpi", NFILE, fnm)) -+ { -+ if (bSepPot && bAppendFiles) -+ { -+ gmx_fatal(FARGS, "Output file appending is not supported with -seppot"); -+ } -+ -+ bAppendFiles = -+ read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE, -+ fnm, cr), -+ &sim_part_fn, NULL, cr, -+ bAppendFiles, NFILE, fnm, -+ part_suffix, &bAddPart); -+ if (sim_part_fn == 0 && MULTIMASTER(cr)) -+ { -+ fprintf(stdout, "No previous checkpoint file present, assuming this is a new run.\n"); -+ } -+ else -+ { -+ sim_part = sim_part_fn + 1; -+ } -+ -+ if (MULTISIM(cr) && MASTER(cr)) -+ { -+ if (MULTIMASTER(cr)) -+ { -+ /* Log file is not yet available, so if there's a -+ * problem we can only write to stderr. */ -+ fpmulti = stderr; -+ } -+ else -+ { -+ fpmulti = NULL; -+ } -+ check_multi_int(fpmulti, cr->ms, sim_part, "simulation part", TRUE); -+ } -+ } -+ else -+ { -+ bAppendFiles = FALSE; -+ } -+ -+ if (!bAppendFiles) -+ { -+ sim_part_fn = sim_part; -+ } -+ -+ if (bAddPart) -+ { -+ /* Rename all output files (except checkpoint files) */ -+ /* create new part name first (zero-filled) */ -+ sprintf(suffix, "%s%04d", part_suffix, sim_part_fn); -+ -+ add_suffix_to_output_names(fnm, NFILE, suffix); -+ if (MULTIMASTER(cr)) -+ { -+ fprintf(stdout, "Checkpoint file is from part %d, new output files will be suffixed '%s'.\n", sim_part-1, suffix); -+ } -+ } -+ -+ Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0; -+ Flags = Flags | (bSepPot ? MD_SEPPOT : 0); -+ Flags = Flags | (bIonize ? MD_IONIZE : 0); -+ Flags = Flags | (bPartDec ? MD_PARTDEC : 0); -+ Flags = Flags | (bDDBondCheck ? MD_DDBONDCHECK : 0); -+ Flags = Flags | (bDDBondComm ? MD_DDBONDCOMM : 0); -+ Flags = Flags | (bTunePME ? MD_TUNEPME : 0); -+ Flags = Flags | (bTestVerlet ? MD_TESTVERLET : 0); -+ Flags = Flags | (bConfout ? MD_CONFOUT : 0); -+ Flags = Flags | (bRerunVSite ? MD_RERUN_VSITE : 0); -+ Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0); -+ Flags = Flags | (bAppendFiles ? MD_APPENDFILES : 0); -+ Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0); -+ Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0); -+ Flags = Flags | (sim_part > 1 ? MD_STARTFROMCPT : 0); -+ Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0); -+ -+ -+ /* We postpone opening the log file if we are appending, so we can -+ first truncate the old log file and append to the correct position -+ there instead. 
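The run options collected above are folded into a single bitmask before being handed to mdrunner(); the pattern is the usual flag-OR-and-test idiom, sketched below with placeholder bit values (the real MD_* constants are defined in the GROMACS headers, not here).

    #include <stdio.h>

    #define EX_APPENDFILES  (1u << 0)   /* placeholder values, not the real MD_* bits */
    #define EX_STARTFROMCPT (1u << 1)

    int main(void)
    {
        unsigned long Flags = 0;
        int bAppendFiles = 1, sim_part = 2;

        Flags |= (bAppendFiles ? EX_APPENDFILES : 0);
        Flags |= (sim_part > 1 ? EX_STARTFROMCPT : 0);

        if (Flags & EX_STARTFROMCPT)
        {
            printf("continuing from checkpoint, appending=%d\n",
                   (Flags & EX_APPENDFILES) != 0);
        }
        return 0;
    }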
*/ -+ if ((MASTER(cr) || bSepPot) && !bAppendFiles) -+ { -+ gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, -+ !bSepPot, Flags & MD_APPENDFILES, &fplog); -+ CopyRight(fplog, argv[0]); -+ please_cite(fplog, "Hess2008b"); -+ please_cite(fplog, "Spoel2005a"); -+ please_cite(fplog, "Lindahl2001a"); -+ please_cite(fplog, "Berendsen95a"); -+ } -+ else if (!MASTER(cr) && bSepPot) -+ { -+ gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, !bSepPot, Flags, &fplog); -+ } -+ else -+ { -+ fplog = NULL; -+ } -+ -+ ddxyz[XX] = (int)(realddxyz[XX] + 0.5); -+ ddxyz[YY] = (int)(realddxyz[YY] + 0.5); -+ ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); -+ -+ rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact, -+ nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr, -+ dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz, -+ nbpu_opt[0], -+ nsteps, nstepout, resetstep, -+ nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed, -+ pforce, cpt_period, max_hours, deviceOptions, Flags); -+ -+ gmx_finalize_par(); -+ -+ if (MULTIMASTER(cr)) -+ { -+ thanx(stderr); -+ } -+ -+ /* Log file has to be closed in mdrunner if we are appending to it -+ (fplog not set here) */ -+ if (MASTER(cr) && !bAppendFiles) -+ { -+ gmx_log_close(fplog); -+ } -+ -+ return rc; -+} -diff --git a/src/kernel/repl_ex.c b/src/kernel/repl_ex.c -index 0f094d4..7f27136 100644 ---- a/src/kernel/repl_ex.c -+++ b/src/kernel/repl_ex.c -@@ -53,6 +53,12 @@ - #include "domdec.h" - #include "partdec.h" - -+/* PLUMED */ -+#include "../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+/* END PLUMED */ -+ - #define PROBABILITYCUTOFF 100 - /* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ - -@@ -113,14 +119,16 @@ static gmx_bool repl_quantity(FILE *fplog, const gmx_multisim_t *ms, - qall[re->repl] = q; - gmx_sum_sim(ms->nsim, qall, ms); - -- bDiff = FALSE; -- for (s = 1; s < ms->nsim; s++) -- { -- if (qall[s] != qall[0]) -- { -+ /* PLUMED */ -+ //bDiff = FALSE; -+ //for (s = 1; s < ms->nsim; s++) -+ //{ -+ // if (qall[s] != qall[0]) -+ // { - bDiff = TRUE; -- } -- } -+ // } -+ //} -+ /* END PLUMED */ - - if (bDiff) - { -@@ -257,6 +265,10 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - re->ind[i] = i; - } - -+ /* PLUMED */ -+ // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD) -+ // in those cases replicas can share the same temperature. -+ /* - if (re->type < ereENDSINGLE) - { - -@@ -266,10 +278,6 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - { - if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) - { -- /* Unordered replicas are supposed to work, but there -- * is still an issues somewhere. -- * Note that at this point still re->ind[i]=i. -- */ - gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", - i, j, - erename[re->type], -@@ -287,6 +295,8 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - } - } - } -+ */ -+ /* END PLUMED */ - - /* keep track of all the swaps, starting with the initial placement. 
*/ - snew(re->allswaps, re->nrepl); -@@ -988,6 +998,10 @@ test_for_replica_exchange(FILE *fplog, - pind[i] = re->ind[i]; - } - -+ /* PLUMED */ -+ int plumed_test_exchange_pattern=0; -+ /* END PLUMED */ -+ - if (bMultiEx) - { - /* multiple random switch exchange */ -@@ -1057,6 +1071,31 @@ test_for_replica_exchange(FILE *fplog, - { - /* standard nearest neighbor replica exchange */ - m = (step / re->nst) % 2; -+ /* PLUMED */ -+ if(plumedswitch){ -+ int partner=re->repl; -+ plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern); -+ if(plumed_test_exchange_pattern>0){ -+ int *list; -+ snew(list,re->nrepl); -+ plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl)); -+ plumed_cmd(plumedmain,"getExchangesList",list); -+ for(i=0; inrepl; i++) re->ind[i]=list[i]; -+ sfree(list); -+ } -+ -+ for(i=1; inrepl; i++) { -+ if (i % 2 != m) continue; -+ a = re->ind[i-1]; -+ b = re->ind[i]; -+ if(re->repl==a) partner=b; -+ if(re->repl==b) partner=a; -+ } -+ plumed_cmd(plumedmain,"GREX setPartner",&partner); -+ plumed_cmd(plumedmain,"GREX calculate",NULL); -+ plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL); -+ } -+ /* END PLUMED */ - for (i = 1; i < re->nrepl; i++) - { - a = re->ind[i-1]; -@@ -1066,6 +1105,18 @@ test_for_replica_exchange(FILE *fplog, - if (i % 2 == m) - { - delta = calc_delta(fplog, bPrint, re, a, b, a, b); -+ /* PLUMED */ -+ if(plumedswitch){ -+ real adb,bdb,dplumed; -+ char buf[300]; -+ sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb); -+ sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb); -+ dplumed=adb*re->beta[a]+bdb*re->beta[b]; -+ delta+=dplumed; -+ if (bPrint) -+ fprintf(fplog,"dplumed = %10.3e dE_Term = %10.3e (kT)\n",dplumed,delta); -+ } -+ /* END PLUMED */ - if (delta <= 0) - { - /* accepted */ -@@ -1089,11 +1140,22 @@ test_for_replica_exchange(FILE *fplog, - - if (bEx[i]) - { -+ /* PLUMED */ -+ if(!plumed_test_exchange_pattern) { -+ /* standard neighbour swapping */ - /* swap these two */ - tmp = pind[i-1]; - pind[i-1] = pind[i]; - pind[i] = tmp; - re->nexchange[i]++; /* statistics for back compatibility */ -+ } else { -+ /* alternative swapping patterns */ -+ tmp = pind[a]; -+ pind[a] = pind[b]; -+ pind[b] = tmp; -+ re->nexchange[i]++; /* statistics for back compatibility */ -+ } -+ /* END PLUMED */ - } - } - else -@@ -1109,6 +1171,15 @@ test_for_replica_exchange(FILE *fplog, - re->nattempt[m]++; - } - -+ /* PLUMED */ -+ if(plumed_test_exchange_pattern>0) { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->ind[i] = i; -+ } -+ } -+ /* END PLUMED */ -+ - /* record which moves were made and accepted */ - for (i = 0; i < re->nrepl; i++) - { -@@ -1314,6 +1385,10 @@ gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex * - /* The order in which multiple exchanges will occur. */ - gmx_bool bThisReplicaExchanged = FALSE; - -+ /* PLUMED */ -+ if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL); -+ /* END PLUMED */ -+ - if (MASTER(cr)) - { - replica_id = re->repl; -diff --git a/src/kernel/repl_ex.c.preplumed b/src/kernel/repl_ex.c.preplumed -new file mode 100644 -index 0000000..0f094d4 ---- /dev/null -+++ b/src/kernel/repl_ex.c.preplumed -@@ -0,0 +1,1450 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team, -+ * check out http://www.gromacs.org for more information. 
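The PLUMED hunks above fold the bias difference dplumed = adb*beta[a] + bdb*beta[b] into the exchange criterion before the usual Metropolis test. The isolated sketch below shows just that acceptance step; the plain rand() generator and the numeric inputs are made-up stand-ins for GROMACS's calc_delta() result and RNG.

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        double delta_md = 0.8;                            /* example calc_delta() result, in kT */
        double beta_a = 1.0 / 2.49, beta_b = 1.0 / 2.58;  /* example 1/kT of replicas a and b */
        double adb = -0.5, bdb = 0.3;                     /* example PLUMED bias differences */

        double dplumed = adb * beta_a + bdb * beta_b;     /* same combination as the patch */
        double delta   = delta_md + dplumed;

        /* accept if delta <= 0, otherwise with probability exp(-delta) */
        int accept = (delta <= 0.0) ||
                     ((double) rand() / RAND_MAX < exp(-delta));
        printf("dplumed = %10.3e  delta = %10.3e  accepted = %d\n",
               dplumed, delta, accept);
        return 0;
    }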
-+ * Copyright (c) 2012,2013, by the GROMACS development team, led by -+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many -+ * others, as listed in the AUTHORS file in the top-level source -+ * directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include "repl_ex.h" -+#include "network.h" -+#include "random.h" -+#include "smalloc.h" -+#include "physics.h" -+#include "copyrite.h" -+#include "macros.h" -+#include "vec.h" -+#include "names.h" -+#include "mvdata.h" -+#include "domdec.h" -+#include "partdec.h" -+ -+#define PROBABILITYCUTOFF 100 -+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ -+ -+enum { -+ ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR -+}; -+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"}; -+/* end_single_marker merely notes the end of single variable replica exchange. All types higher than -+ it are multiple replica exchange methods */ -+/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?; -+ Let's wait until we feel better about the pressure control methods giving exact ensembles. 
Right now, we assume constant pressure */ -+ -+typedef struct gmx_repl_ex -+{ -+ int repl; -+ int nrepl; -+ real temp; -+ int type; -+ real **q; -+ gmx_bool bNPT; -+ real *pres; -+ int *ind; -+ int *allswaps; -+ int nst; -+ int nex; -+ int seed; -+ int nattempt[2]; -+ real *prob_sum; -+ int **nmoves; -+ int *nexchange; -+ -+ /* these are helper arrays for replica exchange; allocated here so they -+ don't have to be allocated each time */ -+ int *destinations; -+ int **cyclic; -+ int **order; -+ int *tmpswap; -+ gmx_bool *incycle; -+ gmx_bool *bEx; -+ -+ /* helper arrays to hold the quantities that are exchanged */ -+ real *prob; -+ real *Epot; -+ real *beta; -+ real *Vol; -+ real **de; -+ -+} t_gmx_repl_ex; -+ -+static gmx_bool repl_quantity(FILE *fplog, const gmx_multisim_t *ms, -+ struct gmx_repl_ex *re, int ere, real q) -+{ -+ real *qall; -+ gmx_bool bDiff; -+ int i, s; -+ -+ snew(qall, ms->nsim); -+ qall[re->repl] = q; -+ gmx_sum_sim(ms->nsim, qall, ms); -+ -+ bDiff = FALSE; -+ for (s = 1; s < ms->nsim; s++) -+ { -+ if (qall[s] != qall[0]) -+ { -+ bDiff = TRUE; -+ } -+ } -+ -+ if (bDiff) -+ { -+ /* Set the replica exchange type and quantities */ -+ re->type = ere; -+ -+ snew(re->q[ere], re->nrepl); -+ for (s = 0; s < ms->nsim; s++) -+ { -+ re->q[ere][s] = qall[s]; -+ } -+ } -+ sfree(qall); -+ return bDiff; -+} -+ -+gmx_repl_ex_t init_replica_exchange(FILE *fplog, -+ const gmx_multisim_t *ms, -+ const t_state *state, -+ const t_inputrec *ir, -+ int nst, int nex, int init_seed) -+{ -+ real temp, pres; -+ int i, j, k; -+ struct gmx_repl_ex *re; -+ gmx_bool bTemp; -+ gmx_bool bLambda = FALSE; -+ -+ fprintf(fplog, "\nInitializing Replica Exchange\n"); -+ -+ if (ms == NULL || ms->nsim == 1) -+ { -+ gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?"); -+ } -+ -+ snew(re, 1); -+ -+ re->repl = ms->sim; -+ re->nrepl = ms->nsim; -+ snew(re->q, ereENDSINGLE); -+ -+ fprintf(fplog, "Repl There are %d replicas:\n", re->nrepl); -+ -+ check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE); -+ check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE); -+ check_multi_large_int(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE); -+ check_multi_large_int(fplog, ms, (ir->init_step+nst-1)/nst, -+ "first exchange step: init_step/-replex", FALSE); -+ check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE); -+ check_multi_int(fplog, ms, ir->opts.ngtc, -+ "the number of temperature coupling groups", FALSE); -+ check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE); -+ check_multi_int(fplog, ms, ir->efep, "free energy", FALSE); -+ check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE); -+ -+ re->temp = ir->opts.ref_t[0]; -+ for (i = 1; (i < ir->opts.ngtc); i++) -+ { -+ if (ir->opts.ref_t[i] != re->temp) -+ { -+ fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); -+ fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); -+ } -+ } -+ -+ re->type = -1; -+ bTemp = repl_quantity(fplog, ms, re, ereTEMP, re->temp); -+ if (ir->efep != efepNO) -+ { -+ bLambda = repl_quantity(fplog, ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state); -+ } -+ if (re->type == -1) /* nothing was assigned */ -+ { -+ gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl); -+ } -+ if (bLambda && bTemp) -+ { -+ re->type 
= ereTL; -+ } -+ -+ if (bTemp) -+ { -+ please_cite(fplog, "Sugita1999a"); -+ if (ir->epc != epcNO) -+ { -+ re->bNPT = TRUE; -+ fprintf(fplog, "Repl Using Constant Pressure REMD.\n"); -+ please_cite(fplog, "Okabe2001a"); -+ } -+ if (ir->etc == etcBERENDSEN) -+ { -+ gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead", -+ ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE)); -+ } -+ } -+ if (bLambda) -+ { -+ if (ir->fepvals->delta_lambda != 0) /* check this? */ -+ { -+ gmx_fatal(FARGS, "delta_lambda is not zero"); -+ } -+ } -+ if (re->bNPT) -+ { -+ snew(re->pres, re->nrepl); -+ if (ir->epct == epctSURFACETENSION) -+ { -+ pres = ir->ref_p[ZZ][ZZ]; -+ } -+ else -+ { -+ pres = 0; -+ j = 0; -+ for (i = 0; i < DIM; i++) -+ { -+ if (ir->compress[i][i] != 0) -+ { -+ pres += ir->ref_p[i][i]; -+ j++; -+ } -+ } -+ pres /= j; -+ } -+ re->pres[re->repl] = pres; -+ gmx_sum_sim(re->nrepl, re->pres, ms); -+ } -+ -+ /* Make an index for increasing replica order */ -+ /* only makes sense if one or the other is varying, not both! -+ if both are varying, we trust the order the person gave. */ -+ snew(re->ind, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->ind[i] = i; -+ } -+ -+ if (re->type < ereENDSINGLE) -+ { -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = i+1; j < re->nrepl; j++) -+ { -+ if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) -+ { -+ /* Unordered replicas are supposed to work, but there -+ * is still an issues somewhere. -+ * Note that at this point still re->ind[i]=i. -+ */ -+ gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", -+ i, j, -+ erename[re->type], -+ re->q[re->type][i], re->q[re->type][j], -+ erename[re->type]); -+ -+ k = re->ind[i]; -+ re->ind[i] = re->ind[j]; -+ re->ind[j] = k; -+ } -+ else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]]) -+ { -+ gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]); -+ } -+ } -+ } -+ } -+ -+ /* keep track of all the swaps, starting with the initial placement. 
*/ -+ snew(re->allswaps, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->allswaps[i] = re->ind[i]; -+ } -+ -+ switch (re->type) -+ { -+ case ereTEMP: -+ fprintf(fplog, "\nReplica exchange in temperature\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ case ereLAMBDA: -+ fprintf(fplog, "\nReplica exchange in lambda\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ case ereTL: -+ fprintf(fplog, "\nReplica exchange in temperature and lambda state\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ default: -+ gmx_incons("Unknown replica exchange quantity"); -+ } -+ if (re->bNPT) -+ { -+ fprintf(fplog, "\nRepl p"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.2f", re->pres[re->ind[i]]); -+ } -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]])) -+ { -+ fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); -+ fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); -+ } -+ } -+ } -+ re->nst = nst; -+ if (init_seed == -1) -+ { -+ if (MASTERSIM(ms)) -+ { -+ re->seed = make_seed(); -+ } -+ else -+ { -+ re->seed = 0; -+ } -+ gmx_sumi_sim(1, &(re->seed), ms); -+ } -+ else -+ { -+ re->seed = init_seed; -+ } -+ fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst); -+ fprintf(fplog, "\nReplica random seed: %d\n", re->seed); -+ -+ re->nattempt[0] = 0; -+ re->nattempt[1] = 0; -+ -+ snew(re->prob_sum, re->nrepl); -+ snew(re->nexchange, re->nrepl); -+ snew(re->nmoves, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->nmoves[i], re->nrepl); -+ } -+ fprintf(fplog, "Replica exchange information below: x=exchange, pr=probability\n"); -+ -+ /* generate space for the helper functions so we don't have to snew each time */ -+ -+ snew(re->destinations, re->nrepl); -+ snew(re->incycle, re->nrepl); -+ snew(re->tmpswap, re->nrepl); -+ snew(re->cyclic, re->nrepl); -+ snew(re->order, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->cyclic[i], re->nrepl); -+ snew(re->order[i], re->nrepl); -+ } -+ /* allocate space for the functions storing the data for the replicas */ -+ /* not all of these arrays needed in all cases, but they don't take -+ up much space, since the max size is nrepl**2 */ -+ snew(re->prob, re->nrepl); -+ snew(re->bEx, re->nrepl); -+ snew(re->beta, re->nrepl); -+ snew(re->Vol, re->nrepl); -+ snew(re->Epot, re->nrepl); -+ snew(re->de, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->de[i], re->nrepl); -+ } -+ re->nex = nex; -+ return re; -+} -+ -+static void exchange_reals(const gmx_multisim_t *ms, int b, real *v, int n) -+{ -+ real *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, 
MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+ -+static void exchange_ints(const gmx_multisim_t *ms, int b, int *v, int n) -+{ -+ int *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_doubles(const gmx_multisim_t *ms, int b, double *v, int n) -+{ -+ double *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_rvecs(const gmx_multisim_t *ms, int b, rvec *v, int n) -+{ -+ rvec *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v[0], n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, -+ buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ copy_rvec(buf[i], v[i]); -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state) -+{ -+ /* When t_state changes, this code should be updated. 
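The four exchange_* helpers above all use the same pairwise swap pattern: post a non-blocking send of the local buffer, do a blocking receive from the partner master rank, wait for the send to complete, then copy the received data back over the local array (the symmetric MPI_Sendrecv variant is left commented out). A minimal standalone sketch of that idea, using a hypothetical swap_doubles() helper and MPI_COMM_WORLD instead of GROMACS' ms->mpi_comm_masters:

    #include <mpi.h>
    #include <string.h>

    /* Hypothetical illustration only: swap an array of n doubles (n <= 256 here)
     * with the rank given by `partner`.  Isend + Recv + Wait cannot deadlock the
     * way a pair of plain blocking sends could. */
    static void swap_doubles(double *v, int n, int partner)
    {
        double      buf[256];
        MPI_Request req;

        MPI_Isend(v, n, MPI_DOUBLE, partner, 0, MPI_COMM_WORLD, &req);
        MPI_Recv(buf, n, MPI_DOUBLE, partner, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        MPI_Wait(&req, MPI_STATUS_IGNORE);
        memcpy(v, buf, (size_t)n*sizeof(double));
    }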
*/ -+ int ngtc, nnhpres; -+ ngtc = state->ngtc * state->nhchainlength; -+ nnhpres = state->nnhpres* state->nhchainlength; -+ exchange_rvecs(ms, b, state->box, DIM); -+ exchange_rvecs(ms, b, state->box_rel, DIM); -+ exchange_rvecs(ms, b, state->boxv, DIM); -+ exchange_reals(ms, b, &(state->veta), 1); -+ exchange_reals(ms, b, &(state->vol0), 1); -+ exchange_rvecs(ms, b, state->svir_prev, DIM); -+ exchange_rvecs(ms, b, state->fvir_prev, DIM); -+ exchange_rvecs(ms, b, state->pres_prev, DIM); -+ exchange_doubles(ms, b, state->nosehoover_xi, ngtc); -+ exchange_doubles(ms, b, state->nosehoover_vxi, ngtc); -+ exchange_doubles(ms, b, state->nhpres_xi, nnhpres); -+ exchange_doubles(ms, b, state->nhpres_vxi, nnhpres); -+ exchange_doubles(ms, b, state->therm_integral, state->ngtc); -+ exchange_rvecs(ms, b, state->x, state->natoms); -+ exchange_rvecs(ms, b, state->v, state->natoms); -+ exchange_rvecs(ms, b, state->sd_X, state->natoms); -+} -+ -+static void copy_rvecs(rvec *s, rvec *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ copy_rvec(s[i], d[i]); -+ } -+ } -+} -+ -+static void copy_doubles(const double *s, double *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+static void copy_reals(const real *s, real *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+static void copy_ints(const int *s, int *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+#define scopy_rvecs(v, n) copy_rvecs(state->v, state_local->v, n); -+#define scopy_doubles(v, n) copy_doubles(state->v, state_local->v, n); -+#define scopy_reals(v, n) copy_reals(state->v, state_local->v, n); -+#define scopy_ints(v, n) copy_ints(state->v, state_local->v, n); -+ -+static void copy_state_nonatomdata(t_state *state, t_state *state_local) -+{ -+ /* When t_state changes, this code should be updated. 
*/ -+ int ngtc, nnhpres; -+ ngtc = state->ngtc * state->nhchainlength; -+ nnhpres = state->nnhpres* state->nhchainlength; -+ scopy_rvecs(box, DIM); -+ scopy_rvecs(box_rel, DIM); -+ scopy_rvecs(boxv, DIM); -+ state_local->veta = state->veta; -+ state_local->vol0 = state->vol0; -+ scopy_rvecs(svir_prev, DIM); -+ scopy_rvecs(fvir_prev, DIM); -+ scopy_rvecs(pres_prev, DIM); -+ scopy_doubles(nosehoover_xi, ngtc); -+ scopy_doubles(nosehoover_vxi, ngtc); -+ scopy_doubles(nhpres_xi, nnhpres); -+ scopy_doubles(nhpres_vxi, nnhpres); -+ scopy_doubles(therm_integral, state->ngtc); -+ scopy_rvecs(x, state->natoms); -+ scopy_rvecs(v, state->natoms); -+ scopy_rvecs(sd_X, state->natoms); -+ copy_ints(&(state->fep_state), &(state_local->fep_state), 1); -+ scopy_reals(lambda, efptNR); -+} -+ -+static void scale_velocities(t_state *state, real fac) -+{ -+ int i; -+ -+ if (state->v) -+ { -+ for (i = 0; i < state->natoms; i++) -+ { -+ svmul(fac, state->v[i], state->v[i]); -+ } -+ } -+} -+ -+static void pd_collect_state(const t_commrec *cr, t_state *state) -+{ -+ int shift; -+ -+ if (debug) -+ { -+ fprintf(debug, "Collecting state before exchange\n"); -+ } -+ shift = cr->nnodes - cr->npmenodes - 1; -+ move_rvecs(cr, FALSE, FALSE, GMX_LEFT, GMX_RIGHT, state->x, NULL, shift, NULL); -+ if (state->v) -+ { -+ move_rvecs(cr, FALSE, FALSE, GMX_LEFT, GMX_RIGHT, state->v, NULL, shift, NULL); -+ } -+ if (state->sd_X) -+ { -+ move_rvecs(cr, FALSE, FALSE, GMX_LEFT, GMX_RIGHT, state->sd_X, NULL, shift, NULL); -+ } -+} -+ -+static void print_transition_matrix(FILE *fplog, const char *leg, int n, int **nmoves, int *nattempt) -+{ -+ int i, j, ntot; -+ float Tprint; -+ -+ ntot = nattempt[0] + nattempt[1]; -+ fprintf(fplog, "\n"); -+ fprintf(fplog, "Repl"); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, " "); /* put the title closer to the center */ -+ } -+ fprintf(fplog, "Empirical Transition Matrix\n"); -+ -+ fprintf(fplog, "Repl"); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%8d", (i+1)); -+ } -+ fprintf(fplog, "\n"); -+ -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "Repl"); -+ for (j = 0; j < n; j++) -+ { -+ Tprint = 0.0; -+ if (nmoves[i][j] > 0) -+ { -+ Tprint = nmoves[i][j]/(2.0*ntot); -+ } -+ fprintf(fplog, "%8.4f", Tprint); -+ } -+ fprintf(fplog, "%3d\n", i); -+ } -+} -+ -+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx) -+{ -+ int i; -+ -+ fprintf(fplog, "Repl %2s %2d", leg, ind[0]); -+ for (i = 1; i < n; i++) -+ { -+ fprintf(fplog, " %c %2d", (bEx != 0 && bEx[i]) ? 'x' : ' ', ind[i]); -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static void print_allswitchind(FILE *fplog, int n, int *ind, int *pind, int *allswaps, int *tmpswap) -+{ -+ int i; -+ -+ for (i = 0; i < n; i++) -+ { -+ tmpswap[i] = allswaps[i]; -+ } -+ for (i = 0; i < n; i++) -+ { -+ allswaps[i] = tmpswap[pind[i]]; -+ } -+ -+ fprintf(fplog, "\nAccepted Exchanges: "); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%d ", pind[i]); -+ } -+ fprintf(fplog, "\n"); -+ -+ /* the "Order After Exchange" is the state label corresponding to the configuration that -+ started in state listed in order, i.e. 
-+ -+ 3 0 1 2 -+ -+ means that the: -+ configuration starting in simulation 3 is now in simulation 0, -+ configuration starting in simulation 0 is now in simulation 1, -+ configuration starting in simulation 1 is now in simulation 2, -+ configuration starting in simulation 2 is now in simulation 3 -+ */ -+ fprintf(fplog, "Order After Exchange: "); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%d ", allswaps[i]); -+ } -+ fprintf(fplog, "\n\n"); -+} -+ -+static void print_prob(FILE *fplog, const char *leg, int n, real *prob) -+{ -+ int i; -+ char buf[8]; -+ -+ fprintf(fplog, "Repl %2s ", leg); -+ for (i = 1; i < n; i++) -+ { -+ if (prob[i] >= 0) -+ { -+ sprintf(buf, "%4.2f", prob[i]); -+ fprintf(fplog, " %3s", buf[0] == '1' ? "1.0" : buf+1); -+ } -+ else -+ { -+ fprintf(fplog, " "); -+ } -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static void print_count(FILE *fplog, const char *leg, int n, int *count) -+{ -+ int i; -+ -+ fprintf(fplog, "Repl %2s ", leg); -+ for (i = 1; i < n; i++) -+ { -+ fprintf(fplog, " %4d", count[i]); -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp) -+{ -+ -+ real ediff, dpV, delta = 0; -+ real *Epot = re->Epot; -+ real *Vol = re->Vol; -+ real **de = re->de; -+ real *beta = re->beta; -+ -+ /* Two cases; we are permuted and not. In all cases, setting ap = a and bp = b will reduce -+ to the non permuted case */ -+ -+ switch (re->type) -+ { -+ case ereTEMP: -+ /* -+ * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439 -+ */ -+ ediff = Epot[b] - Epot[a]; -+ delta = -(beta[bp] - beta[ap])*ediff; -+ break; -+ case ereLAMBDA: -+ /* two cases: when we are permuted, and not. */ -+ /* non-permuted: -+ ediff = E_new - E_old -+ = [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)] -+ = [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)] -+ = de[b][a] + de[a][b] */ -+ -+ /* permuted: -+ ediff = E_new - E_old -+ = [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)] -+ = [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)] -+ = [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)] -+ = [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)] -+ = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */ -+ /* but, in the current code implementation, we flip configurations, not indices . . . -+ So let's examine that. -+ = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)] -+ = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_pb)] -+ = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp] -+ So, if we exchange b<=> bp and a<=> ap, we return to the same result. -+ So the simple solution is to flip the -+ position of perturbed and original indices in the tests. 
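Restating the reasoning in the comment above compactly (LaTeX, not part of the patch), with $U_i$ the potential energy of replica $i$, $\beta_i = 1/(k_B T_i)$, and $\mathrm{de}[i][j] = H_i(x_j) - H_j(x_j)$ as filled in later by test_for_replica_exchange: for temperature exchange, calc_delta forms

$$\Delta_{ab} = -(\beta_{b'} - \beta_{a'})\,(U_b - U_a), \qquad p_{\mathrm{acc}} = \min\bigl(1, e^{-\Delta_{ab}}\bigr),$$

and for Hamiltonian (lambda) exchange at a common inverse temperature $\beta$,

$$\Delta_{ab} = \beta\,\bigl[(\mathrm{de}[b'][a] - \mathrm{de}[a'][a]) + (\mathrm{de}[a'][b] - \mathrm{de}[b'][b])\bigr],$$

where the primed indices are the possibly permuted ap and bp, exactly the expression computed just below.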
-+ */ -+ -+ ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]); -+ delta = ediff*beta[a]; /* assume all same temperature in this case */ -+ break; -+ case ereTL: -+ /* not permuted: */ -+ /* delta = reduced E_new - reduced E_old -+ = [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)] -+ = [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)] -+ = [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] + -+ [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)] -+ = [beta_b dH_b(x_a) + [beta_a dH_a(x_b) + -+ beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b)) -+ = beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */ -+ /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */ -+ /* permuted (big breath!) */ -+ /* delta = reduced E_new - reduced E_old -+ = [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)] -+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] -+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] -+ - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a) -+ - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b) -+ = [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] + -+ [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))] -+ + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b) -+ = [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] + -+ [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))] -+ + beta_pb (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b)) -+ = ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b] - beta_bp de[bp][b]) -+ + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b)) */ -+ delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]); -+ break; -+ default: -+ gmx_incons("Unknown replica exchange quantity"); -+ } -+ if (bPrint) -+ { -+ fprintf(fplog, "Repl %d <-> %d dE_term = %10.3e (kT)\n", a, b, delta); -+ } -+ if (re->bNPT) -+ { -+ /* revist the calculation for 5.0. Might be some improvements. 
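For constant-pressure REMD the block just below adds a pressure-volume term to the same acceptance exponent; written out (a restatement of the code, with PRESFAC being the pressure unit-conversion constant used by GROMACS):

$$\Delta \;\leftarrow\; \Delta + \bigl(\beta_{a'}P_{a'} - \beta_{b'}P_{b'}\bigr)\,\frac{V_b - V_a}{\mathrm{PRESFAC}},$$

so the instantaneous volumes of the two replicas enter the Metropolis test together with their reference pressures.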
*/ -+ dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC; -+ if (bPrint) -+ { -+ fprintf(fplog, " dpV = %10.3e d = %10.3e\n", dpV, delta + dpV); -+ } -+ delta += dpV; -+ } -+ return delta; -+} -+ -+static void -+test_for_replica_exchange(FILE *fplog, -+ const gmx_multisim_t *ms, -+ struct gmx_repl_ex *re, -+ gmx_enerdata_t *enerd, -+ real vol, -+ gmx_large_int_t step, -+ real time) -+{ -+ int m, i, j, a, b, ap, bp, i0, i1, tmp; -+ real ediff = 0, delta = 0, dpV = 0; -+ gmx_bool bPrint, bMultiEx; -+ gmx_bool *bEx = re->bEx; -+ real *prob = re->prob; -+ int *pind = re->destinations; /* permuted index */ -+ gmx_bool bEpot = FALSE; -+ gmx_bool bDLambda = FALSE; -+ gmx_bool bVol = FALSE; -+ -+ bMultiEx = (re->nex > 1); /* multiple exchanges at each state */ -+ fprintf(fplog, "Replica exchange at step " gmx_large_int_pfmt " time %.5f\n", step, time); -+ -+ if (re->bNPT) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->Vol[i] = 0; -+ } -+ bVol = TRUE; -+ re->Vol[re->repl] = vol; -+ } -+ if ((re->type == ereTEMP || re->type == ereTL)) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->Epot[i] = 0; -+ } -+ bEpot = TRUE; -+ re->Epot[re->repl] = enerd->term[F_EPOT]; -+ /* temperatures of different states*/ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ); -+ } -+ } -+ else -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->beta[i] = 1.0/(re->temp*BOLTZ); /* we have a single temperature */ -+ } -+ } -+ if (re->type == ereLAMBDA || re->type == ereTL) -+ { -+ bDLambda = TRUE; -+ /* lambda differences. */ -+ /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian -+ minus the energy of the jth simulation in the jth Hamiltonian */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = 0; j < re->nrepl; j++) -+ { -+ re->de[i][j] = 0; -+ } -+ } -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]); -+ } -+ } -+ -+ /* now actually do the communication */ -+ if (bVol) -+ { -+ gmx_sum_sim(re->nrepl, re->Vol, ms); -+ } -+ if (bEpot) -+ { -+ gmx_sum_sim(re->nrepl, re->Epot, ms); -+ } -+ if (bDLambda) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ gmx_sum_sim(re->nrepl, re->de[i], ms); -+ } -+ } -+ -+ /* make a duplicate set of indices for shuffling */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ pind[i] = re->ind[i]; -+ } -+ -+ if (bMultiEx) -+ { -+ /* multiple random switch exchange */ -+ for (i = 0; i < re->nex; i++) -+ { -+ /* randomly select a pair */ -+ /* in theory, could reduce this by identifying only which switches had a nonneglibible -+ probability of occurring (log p > -100) and only operate on those switches */ -+ /* find out which state it is from, and what label that state currently has. Likely -+ more work that useful. */ -+ i0 = (int)(re->nrepl*rando(&(re->seed))); -+ i1 = (int)(re->nrepl*rando(&(re->seed))); -+ if (i0 == i1) -+ { -+ i--; -+ continue; /* self-exchange, back up and do it again */ -+ } -+ -+ a = re->ind[i0]; /* what are the indices of these states? 
*/ -+ b = re->ind[i1]; -+ ap = pind[i0]; -+ bp = pind[i1]; -+ -+ bPrint = FALSE; /* too noisy */ -+ /* calculate the energy difference */ -+ /* if the code changes to flip the STATES, rather than the configurations, -+ use the commented version of the code */ -+ /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */ -+ delta = calc_delta(fplog, bPrint, re, ap, bp, a, b); -+ -+ /* we actually only use the first space in the prob and bEx array, -+ since there are actually many switches between pairs. */ -+ -+ if (delta <= 0) -+ { -+ /* accepted */ -+ prob[0] = 1; -+ bEx[0] = TRUE; -+ } -+ else -+ { -+ if (delta > PROBABILITYCUTOFF) -+ { -+ prob[0] = 0; -+ } -+ else -+ { -+ prob[0] = exp(-delta); -+ } -+ /* roll a number to determine if accepted */ -+ bEx[0] = (rando(&(re->seed)) < prob[0]); -+ } -+ re->prob_sum[0] += prob[0]; -+ -+ if (bEx[0]) -+ { -+ /* swap the states */ -+ tmp = pind[i0]; -+ pind[i0] = pind[i1]; -+ pind[i1] = tmp; -+ } -+ } -+ re->nattempt[0]++; /* keep track of total permutation trials here */ -+ print_allswitchind(fplog, re->nrepl, re->ind, pind, re->allswaps, re->tmpswap); -+ } -+ else -+ { -+ /* standard nearest neighbor replica exchange */ -+ m = (step / re->nst) % 2; -+ for (i = 1; i < re->nrepl; i++) -+ { -+ a = re->ind[i-1]; -+ b = re->ind[i]; -+ -+ bPrint = (re->repl == a || re->repl == b); -+ if (i % 2 == m) -+ { -+ delta = calc_delta(fplog, bPrint, re, a, b, a, b); -+ if (delta <= 0) -+ { -+ /* accepted */ -+ prob[i] = 1; -+ bEx[i] = TRUE; -+ } -+ else -+ { -+ if (delta > PROBABILITYCUTOFF) -+ { -+ prob[i] = 0; -+ } -+ else -+ { -+ prob[i] = exp(-delta); -+ } -+ /* roll a number to determine if accepted */ -+ bEx[i] = (rando(&(re->seed)) < prob[i]); -+ } -+ re->prob_sum[i] += prob[i]; -+ -+ if (bEx[i]) -+ { -+ /* swap these two */ -+ tmp = pind[i-1]; -+ pind[i-1] = pind[i]; -+ pind[i] = tmp; -+ re->nexchange[i]++; /* statistics for back compatibility */ -+ } -+ } -+ else -+ { -+ prob[i] = -1; -+ bEx[i] = FALSE; -+ } -+ } -+ /* print some statistics */ -+ print_ind(fplog, "ex", re->nrepl, re->ind, bEx); -+ print_prob(fplog, "pr", re->nrepl, prob); -+ fprintf(fplog, "\n"); -+ re->nattempt[m]++; -+ } -+ -+ /* record which moves were made and accepted */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->nmoves[re->ind[i]][pind[i]] += 1; -+ re->nmoves[pind[i]][re->ind[i]] += 1; -+ } -+ fflush(fplog); /* make sure we can see what the last exchange was */ -+} -+ -+static void write_debug_x(t_state *state) -+{ -+ int i; -+ -+ if (debug) -+ { -+ for (i = 0; i < state->natoms; i += 10) -+ { -+ fprintf(debug, "dx %5d %10.5f %10.5f %10.5f\n", i, state->x[i][XX], state->x[i][YY], state->x[i][ZZ]); -+ } -+ } -+} -+ -+static void -+cyclic_decomposition(FILE *fplog, -+ const int *destinations, -+ int **cyclic, -+ gmx_bool *incycle, -+ const int nrepl, -+ int *nswap) -+{ -+ -+ int i, j, c, p; -+ int maxlen = 1; -+ for (i = 0; i < nrepl; i++) -+ { -+ incycle[i] = FALSE; -+ } -+ for (i = 0; i < nrepl; i++) /* one cycle for each replica */ -+ { -+ if (incycle[i]) -+ { -+ cyclic[i][0] = -1; -+ continue; -+ } -+ cyclic[i][0] = i; -+ incycle[i] = TRUE; -+ c = 1; -+ p = i; -+ for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */ -+ { -+ p = destinations[p]; /* start permuting */ -+ if (p == i) -+ { -+ cyclic[i][c] = -1; -+ if (c > maxlen) -+ { -+ maxlen = c; -+ } -+ break; /* we've reached the original element, the cycle is complete, and we marked the end. 
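cyclic_decomposition above splits the destination permutation into disjoint cycles so that a multi-pair exchange can later be carried out as rounds of simple pairwise state swaps. A small self-contained sketch of the same idea (plain C, not the patch's function, with a fixed example permutation):

    #include <stdio.h>

    /* Decompose a destination permutation into cycles and print them.
     * dest = {1, 0, 3, 2} yields the cycles (0 1) and (2 3), so a single
     * round of pairwise swaps realises the whole exchange. */
    int main(void)
    {
        int dest[]  = {1, 0, 3, 2};
        int n       = 4;
        int seen[4] = {0};

        for (int i = 0; i < n; i++)
        {
            if (seen[i])
            {
                continue;
            }
            printf("(");
            int p = i;
            do
            {
                printf(" %d", p);
                seen[p] = 1;
                p       = dest[p];
            }
            while (p != i);
            printf(" )\n");
        }
        return 0;
    }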
*/ -+ } -+ else -+ { -+ cyclic[i][c] = p; /* each permutation gives a new member of the cycle */ -+ incycle[p] = TRUE; -+ c++; -+ } -+ } -+ } -+ *nswap = maxlen - 1; -+ -+ if (debug) -+ { -+ for (i = 0; i < nrepl; i++) -+ { -+ fprintf(debug, "Cycle %d:", i); -+ for (j = 0; j < nrepl; j++) -+ { -+ if (cyclic[i][j] < 0) -+ { -+ break; -+ } -+ fprintf(debug, "%2d", cyclic[i][j]); -+ } -+ fprintf(debug, "\n"); -+ } -+ fflush(debug); -+ } -+} -+ -+static void -+compute_exchange_order(FILE *fplog, -+ int **cyclic, -+ int **order, -+ const int nrepl, -+ const int maxswap) -+{ -+ int i, j; -+ -+ for (j = 0; j < maxswap; j++) -+ { -+ for (i = 0; i < nrepl; i++) -+ { -+ if (cyclic[i][j+1] >= 0) -+ { -+ order[cyclic[i][j+1]][j] = cyclic[i][j]; -+ order[cyclic[i][j]][j] = cyclic[i][j+1]; -+ } -+ } -+ for (i = 0; i < nrepl; i++) -+ { -+ if (order[i][j] < 0) -+ { -+ order[i][j] = i; /* if it's not exchanging, it should stay this round*/ -+ } -+ } -+ } -+ -+ if (debug) -+ { -+ fprintf(fplog, "Replica Exchange Order\n"); -+ for (i = 0; i < nrepl; i++) -+ { -+ fprintf(fplog, "Replica %d:", i); -+ for (j = 0; j < maxswap; j++) -+ { -+ if (order[i][j] < 0) -+ { -+ break; -+ } -+ fprintf(debug, "%2d", order[i][j]); -+ } -+ fprintf(fplog, "\n"); -+ } -+ fflush(fplog); -+ } -+} -+ -+static void -+prepare_to_do_exchange(FILE *fplog, -+ struct gmx_repl_ex *re, -+ const int replica_id, -+ int *maxswap, -+ gmx_bool *bThisReplicaExchanged) -+{ -+ int i, j; -+ /* Hold the cyclic decomposition of the (multiple) replica -+ * exchange. */ -+ gmx_bool bAnyReplicaExchanged = FALSE; -+ *bThisReplicaExchanged = FALSE; -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ if (re->destinations[i] != re->ind[i]) -+ { -+ /* only mark as exchanged if the index has been shuffled */ -+ bAnyReplicaExchanged = TRUE; -+ break; -+ } -+ } -+ if (bAnyReplicaExchanged) -+ { -+ /* reinitialize the placeholder arrays */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = 0; j < re->nrepl; j++) -+ { -+ re->cyclic[i][j] = -1; -+ re->order[i][j] = -1; -+ } -+ } -+ -+ /* Identify the cyclic decomposition of the permutation (very -+ * fast if neighbor replica exchange). */ -+ cyclic_decomposition(fplog, re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap); -+ -+ /* Now translate the decomposition into a replica exchange -+ * order at each step. */ -+ compute_exchange_order(fplog, re->cyclic, re->order, re->nrepl, *maxswap); -+ -+ /* Did this replica do any exchange at any point? */ -+ for (j = 0; j < *maxswap; j++) -+ { -+ if (replica_id != re->order[replica_id][j]) -+ { -+ *bThisReplicaExchanged = TRUE; -+ break; -+ } -+ } -+ } -+} -+ -+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re, -+ t_state *state, gmx_enerdata_t *enerd, -+ t_state *state_local, gmx_large_int_t step, real time) -+{ -+ int i, j; -+ int replica_id = 0; -+ int exchange_partner; -+ int maxswap = 0; -+ /* Number of rounds of exchanges needed to deal with any multiple -+ * exchanges. */ -+ /* Where each replica ends up after the exchange attempt(s). */ -+ /* The order in which multiple exchanges will occur. */ -+ gmx_bool bThisReplicaExchanged = FALSE; -+ -+ if (MASTER(cr)) -+ { -+ replica_id = re->repl; -+ test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time); -+ prepare_to_do_exchange(fplog, re, replica_id, &maxswap, &bThisReplicaExchanged); -+ } -+ /* Do intra-simulation broadcast so all processors belonging to -+ * each simulation know whether they need to participate in -+ * collecting the state. 
Otherwise, they might as well get on with -+ * the next thing to do. */ -+ if (PAR(cr)) -+ { -+#ifdef GMX_MPI -+ MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), -+ cr->mpi_comm_mygroup); -+#endif -+ } -+ -+ if (bThisReplicaExchanged) -+ { -+ /* Exchange the states */ -+ -+ if (PAR(cr)) -+ { -+ /* Collect the global state on the master node */ -+ if (DOMAINDECOMP(cr)) -+ { -+ dd_collect_state(cr->dd, state_local, state); -+ } -+ else -+ { -+ pd_collect_state(cr, state); -+ } -+ } -+ else -+ { -+ copy_state_nonatomdata(state_local, state); -+ } -+ -+ if (MASTER(cr)) -+ { -+ /* There will be only one swap cycle with standard replica -+ * exchange, but there may be multiple swap cycles if we -+ * allow multiple swaps. */ -+ -+ for (j = 0; j < maxswap; j++) -+ { -+ exchange_partner = re->order[replica_id][j]; -+ -+ if (exchange_partner != replica_id) -+ { -+ /* Exchange the global states between the master nodes */ -+ if (debug) -+ { -+ fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner); -+ } -+ exchange_state(cr->ms, exchange_partner, state); -+ } -+ } -+ /* For temperature-type replica exchange, we need to scale -+ * the velocities. */ -+ if (re->type == ereTEMP || re->type == ereTL) -+ { -+ scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]])); -+ } -+ -+ } -+ -+ /* With domain decomposition the global state is distributed later */ -+ if (!DOMAINDECOMP(cr)) -+ { -+ /* Copy the global state to the local state data structure */ -+ copy_state_nonatomdata(state, state_local); -+ -+ if (PAR(cr)) -+ { -+ bcast_state(cr, state, FALSE); -+ } -+ } -+ } -+ -+ return bThisReplicaExchanged; -+} -+ -+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re) -+{ -+ int i; -+ -+ fprintf(fplog, "\nReplica exchange statistics\n"); -+ -+ if (re->nex == 0) -+ { -+ fprintf(fplog, "Repl %d attempts, %d odd, %d even\n", -+ re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]); -+ -+ fprintf(fplog, "Repl average probabilities:\n"); -+ for (i = 1; i < re->nrepl; i++) -+ { -+ if (re->nattempt[i%2] == 0) -+ { -+ re->prob[i] = 0; -+ } -+ else -+ { -+ re->prob[i] = re->prob_sum[i]/re->nattempt[i%2]; -+ } -+ } -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_prob(fplog, "", re->nrepl, re->prob); -+ -+ fprintf(fplog, "Repl number of exchanges:\n"); -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_count(fplog, "", re->nrepl, re->nexchange); -+ -+ fprintf(fplog, "Repl average number of exchanges:\n"); -+ for (i = 1; i < re->nrepl; i++) -+ { -+ if (re->nattempt[i%2] == 0) -+ { -+ re->prob[i] = 0; -+ } -+ else -+ { -+ re->prob[i] = ((real)re->nexchange[i])/re->nattempt[i%2]; -+ } -+ } -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_prob(fplog, "", re->nrepl, re->prob); -+ -+ fprintf(fplog, "\n"); -+ } -+ /* print the transition matrix */ -+ print_transition_matrix(fplog, "", re->nrepl, re->nmoves, re->nattempt); -+} -diff --git a/src/mdlib/force.c b/src/mdlib/force.c -index 75da6bd..a36cbd0 100644 ---- a/src/mdlib/force.c -+++ b/src/mdlib/force.c -@@ -67,6 +67,14 @@ - #include "mpelogging.h" - #include "gmx_omp_nthreads.h" - -+/* PLUMED */ -+#include "../../Plumed.h" -+int plumedswitch=0; -+plumed plumedmain; -+void(*plumedcmd)(plumed,const char*,const void*)=NULL; -+/* END PLUMED */ -+ -+ - void ns(FILE *fp, - t_forcerec *fr, - rvec x[], -@@ -692,6 +700,16 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, - - GMX_MPE_LOG(ev_force_finish); - -+ -+/* 
PLUMED */ -+ if(plumedswitch){ -+ int plumedNeedsEnergy; -+ (*plumedcmd)(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ if(!plumedNeedsEnergy) (*plumedcmd)(plumedmain,"performCalc",NULL); -+ } -+/* END PLUMED */ -+ -+ - } - - void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) -diff --git a/src/mdlib/force.c.preplumed b/src/mdlib/force.c.preplumed -new file mode 100644 -index 0000000..75da6bd ---- /dev/null -+++ b/src/mdlib/force.c.preplumed -@@ -0,0 +1,973 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team, -+ * check out http://www.gromacs.org for more information. -+ * Copyright (c) 2012,2013, by the GROMACS development team, led by -+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many -+ * others, as listed in the AUTHORS file in the top-level source -+ * directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. 
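The force.c hunk earlier in this patch wires PLUMED in through three globals (plumedswitch, plumedmain and the function pointer plumedcmd) and drives everything through PLUMED's string-command interface. A sketch of that dispatch pattern, assembled from the calls that appear in the patch (illustrative only, not a literal excerpt):

    #include "../../Plumed.h"

    extern int    plumedswitch;   /* set when mdrun is started with PLUMED enabled */
    extern plumed plumedmain;     /* handle created during mdrun setup             */
    extern void (*plumedcmd)(plumed, const char*, const void*);

    /* Called once the regular forces are done (cf. the do_force_lowlevel hook). */
    static void plumed_hook_after_forces(void)
    {
        if (plumedswitch)
        {
            int plumedNeedsEnergy = 0;
            (*plumedcmd)(plumedmain, "isEnergyNeeded", &plumedNeedsEnergy);
            if (!plumedNeedsEnergy)
            {
                /* Bias forces go straight into the arrays registered earlier with
                 * "setForces"; if the energy is needed, the caller passes it via
                 * "setEnergy" and invokes "performCalc" itself, as minimize.c does. */
                (*plumedcmd)(plumedmain, "performCalc", NULL);
            }
        }
    }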
-+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include -+#include -+#include "sysstuff.h" -+#include "typedefs.h" -+#include "macros.h" -+#include "smalloc.h" -+#include "macros.h" -+#include "physics.h" -+#include "force.h" -+#include "nonbonded.h" -+#include "names.h" -+#include "network.h" -+#include "pbc.h" -+#include "ns.h" -+#include "nrnb.h" -+#include "bondf.h" -+#include "mshift.h" -+#include "txtdump.h" -+#include "coulomb.h" -+#include "pme.h" -+#include "mdrun.h" -+#include "domdec.h" -+#include "partdec.h" -+#include "qmmm.h" -+#include "mpelogging.h" -+#include "gmx_omp_nthreads.h" -+ -+void ns(FILE *fp, -+ t_forcerec *fr, -+ rvec x[], -+ matrix box, -+ gmx_groups_t *groups, -+ t_grpopts *opts, -+ gmx_localtop_t *top, -+ t_mdatoms *md, -+ t_commrec *cr, -+ t_nrnb *nrnb, -+ real *lambda, -+ real *dvdlambda, -+ gmx_grppairener_t *grppener, -+ gmx_bool bFillGrid, -+ gmx_bool bDoLongRangeNS) -+{ -+ char *ptr; -+ int nsearch; -+ -+ GMX_MPE_LOG(ev_ns_start); -+ if (!fr->ns.nblist_initialized) -+ { -+ init_neighbor_list(fp, fr, md->homenr); -+ } -+ -+ if (fr->bTwinRange) -+ { -+ fr->nlr = 0; -+ } -+ -+ nsearch = search_neighbours(fp, fr, x, box, top, groups, cr, nrnb, md, -+ lambda, dvdlambda, grppener, -+ bFillGrid, bDoLongRangeNS, TRUE); -+ if (debug) -+ { -+ fprintf(debug, "nsearch = %d\n", nsearch); -+ } -+ -+ /* Check whether we have to do dynamic load balancing */ -+ /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0)) -+ count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr, -+ &(top->idef),opts->ngener); -+ */ -+ if (fr->ns.dump_nl > 0) -+ { -+ dump_nblist(fp, cr, fr, fr->ns.dump_nl); -+ } -+ -+ GMX_MPE_LOG(ev_ns_finish); -+} -+ -+static void reduce_thread_forces(int n, rvec *f, -+ tensor vir, -+ real *Vcorr, -+ int efpt_ind, real *dvdl, -+ int nthreads, f_thread_t *f_t) -+{ -+ int t, i; -+ -+ /* This reduction can run over any number of threads */ -+#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static) -+ for (i = 0; i < n; i++) -+ { -+ for (t = 1; t < nthreads; t++) -+ { -+ rvec_inc(f[i], f_t[t].f[i]); -+ } -+ } -+ for (t = 1; t < nthreads; t++) -+ { -+ *Vcorr += f_t[t].Vcorr; -+ *dvdl += f_t[t].dvdl[efpt_ind]; -+ m_add(vir, f_t[t].vir, vir); -+ } -+} -+ -+void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, -+ t_forcerec *fr, t_inputrec *ir, -+ t_idef *idef, t_commrec *cr, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ t_mdatoms *md, -+ t_grpopts *opts, -+ rvec x[], history_t *hist, -+ rvec f[], -+ rvec f_longrange[], -+ gmx_enerdata_t *enerd, -+ t_fcdata *fcd, -+ gmx_mtop_t *mtop, -+ gmx_localtop_t *top, -+ gmx_genborn_t *born, -+ t_atomtypes *atype, -+ gmx_bool bBornRadii, -+ matrix box, -+ t_lambda *fepvals, -+ real *lambda, -+ t_graph *graph, -+ t_blocka *excl, -+ rvec mu_tot[], -+ int flags, -+ float *cycles_pme) -+{ -+ int i, j, status; -+ int donb_flags; -+ gmx_bool bDoEpot, bSepDVDL, bSB; -+ int pme_flags; -+ matrix boxs; -+ rvec box_size; -+ real Vsr, Vlr, Vcorr = 0; -+ t_pbc pbc; -+ real dvdgb; -+ char buf[22]; -+ double clam_i, vlam_i; -+ real dvdl_dum[efptNR], dvdl, dvdl_nb[efptNR], lam_i[efptNR]; -+ real dvdlsum; -+ -+#ifdef GMX_MPI -+ double t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */ -+#endif -+ -+#define PRINT_SEPDVDL(s, v, dvdlambda) if (bSepDVDL) {fprintf(fplog, sepdvdlformat, s, v, dvdlambda); } -+ -+ GMX_MPE_LOG(ev_force_start); -+ set_pbc(&pbc, fr->ePBC, box); -+ -+ /* reset free energy components */ -+ for (i = 0; i < efptNR; i++) -+ { -+ dvdl_nb[i] = 0; -+ 
dvdl_dum[i] = 0; -+ } -+ -+ /* Reset box */ -+ for (i = 0; (i < DIM); i++) -+ { -+ box_size[i] = box[i][i]; -+ } -+ -+ bSepDVDL = (fr->bSepDVDL && do_per_step(step, ir->nstlog)); -+ debug_gmx(); -+ -+ /* do QMMM first if requested */ -+ if (fr->bQMMM) -+ { -+ enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr, md); -+ } -+ -+ if (bSepDVDL) -+ { -+ fprintf(fplog, "Step %s: non-bonded V and dVdl for node %d:\n", -+ gmx_step_str(step, buf), cr->nodeid); -+ } -+ -+ /* Call the short range functions all in one go. */ -+ GMX_MPE_LOG(ev_do_fnbf_start); -+ -+#ifdef GMX_MPI -+ /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/ -+#define TAKETIME FALSE -+ if (TAKETIME) -+ { -+ MPI_Barrier(cr->mpi_comm_mygroup); -+ t0 = MPI_Wtime(); -+ } -+#endif -+ -+ if (ir->nwall) -+ { -+ /* foreign lambda component for walls */ -+ dvdl = do_walls(ir, fr, box, md, x, f, lambda[efptVDW], -+ enerd->grpp.ener[egLJSR], nrnb); -+ PRINT_SEPDVDL("Walls", 0.0, dvdl); -+ enerd->dvdl_lin[efptVDW] += dvdl; -+ } -+ -+ /* If doing GB, reset dvda and calculate the Born radii */ -+ if (ir->implicit_solvent) -+ { -+ wallcycle_sub_start(wcycle, ewcsNONBONDED); -+ -+ for (i = 0; i < born->nr; i++) -+ { -+ fr->dvda[i] = 0; -+ } -+ -+ if (bBornRadii) -+ { -+ calc_gb_rad(cr, fr, ir, top, atype, x, &(fr->gblist), born, md, nrnb); -+ } -+ -+ wallcycle_sub_stop(wcycle, ewcsNONBONDED); -+ } -+ -+ where(); -+ /* We only do non-bonded calculation with group scheme here, the verlet -+ * calls are done from do_force_cutsVERLET(). */ -+ if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED)) -+ { -+ donb_flags = 0; -+ /* Add short-range interactions */ -+ donb_flags |= GMX_NONBONDED_DO_SR; -+ -+ if (flags & GMX_FORCE_FORCES) -+ { -+ donb_flags |= GMX_NONBONDED_DO_FORCE; -+ } -+ if (flags & GMX_FORCE_ENERGY) -+ { -+ donb_flags |= GMX_NONBONDED_DO_POTENTIAL; -+ } -+ if (flags & GMX_FORCE_DO_LR) -+ { -+ donb_flags |= GMX_NONBONDED_DO_LR; -+ } -+ -+ wallcycle_sub_start(wcycle, ewcsNONBONDED); -+ do_nonbonded(cr, fr, x, f, f_longrange, md, excl, -+ &enerd->grpp, box_size, nrnb, -+ lambda, dvdl_nb, -1, -1, donb_flags); -+ -+ /* If we do foreign lambda and we have soft-core interactions -+ * we have to recalculate the (non-linear) energies contributions. -+ */ -+ if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) -+ { -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ for (j = 0; j < efptNR; j++) -+ { -+ lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); -+ } -+ reset_foreign_enerdata(enerd); -+ do_nonbonded(cr, fr, x, f, f_longrange, md, excl, -+ &(enerd->foreign_grpp), box_size, nrnb, -+ lam_i, dvdl_dum, -1, -1, -+ (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA); -+ sum_epot(&ir->opts, &(enerd->foreign_grpp), enerd->foreign_term); -+ enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; -+ } -+ } -+ wallcycle_sub_stop(wcycle, ewcsNONBONDED); -+ where(); -+ } -+ -+ /* If we are doing GB, calculate bonded forces and apply corrections -+ * to the solvation forces */ -+ /* MRS: Eventually, many need to include free energy contribution here! 
*/ -+ if (ir->implicit_solvent) -+ { -+ wallcycle_sub_start(wcycle, ewcsBONDED); -+ calc_gb_forces(cr, md, born, top, atype, x, f, fr, idef, -+ ir->gb_algorithm, ir->sa_algorithm, nrnb, bBornRadii, &pbc, graph, enerd); -+ wallcycle_sub_stop(wcycle, ewcsBONDED); -+ } -+ -+#ifdef GMX_MPI -+ if (TAKETIME) -+ { -+ t1 = MPI_Wtime(); -+ fr->t_fnbf += t1-t0; -+ } -+#endif -+ -+ if (fepvals->sc_alpha != 0) -+ { -+ enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; -+ } -+ else -+ { -+ enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; -+ } -+ -+ if (fepvals->sc_alpha != 0) -+ -+ /* even though coulomb part is linear, we already added it, beacuse we -+ need to go through the vdw calculation anyway */ -+ { -+ enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; -+ } -+ else -+ { -+ enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; -+ } -+ -+ Vsr = 0; -+ if (bSepDVDL) -+ { -+ for (i = 0; i < enerd->grpp.nener; i++) -+ { -+ Vsr += -+ (fr->bBHAM ? -+ enerd->grpp.ener[egBHAMSR][i] : -+ enerd->grpp.ener[egLJSR][i]) -+ + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i]; -+ } -+ dvdlsum = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL]; -+ PRINT_SEPDVDL("VdW and Coulomb SR particle-p.", Vsr, dvdlsum); -+ } -+ debug_gmx(); -+ -+ GMX_MPE_LOG(ev_do_fnbf_finish); -+ -+ if (debug) -+ { -+ pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS); -+ } -+ -+ /* Shift the coordinates. Must be done before bonded forces and PPPM, -+ * but is also necessary for SHAKE and update, therefore it can NOT -+ * go when no bonded forces have to be evaluated. -+ */ -+ -+ /* Here sometimes we would not need to shift with NBFonly, -+ * but we do so anyhow for consistency of the returned coordinates. -+ */ -+ if (graph) -+ { -+ shift_self(graph, box, x); -+ if (TRICLINIC(box)) -+ { -+ inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes); -+ } -+ else -+ { -+ inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes); -+ } -+ } -+ /* Check whether we need to do bondeds or correct for exclusions */ -+ if (fr->bMolPBC && -+ ((flags & GMX_FORCE_BONDED) -+ || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype))) -+ { -+ /* Since all atoms are in the rectangular or triclinic unit-cell, -+ * only single box vector shifts (2 in x) are required. -+ */ -+ set_pbc_dd(&pbc, fr->ePBC, cr->dd, TRUE, box); -+ } -+ debug_gmx(); -+ -+ if (flags & GMX_FORCE_BONDED) -+ { -+ GMX_MPE_LOG(ev_calc_bonds_start); -+ -+ wallcycle_sub_start(wcycle, ewcsBONDED); -+ calc_bonds(fplog, cr->ms, -+ idef, x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd, -+ DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born, -+ flags, -+ fr->bSepDVDL && do_per_step(step, ir->nstlog), step); -+ -+ /* Check if we have to determine energy differences -+ * at foreign lambda's. -+ */ -+ if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && -+ idef->ilsort != ilsortNO_FE) -+ { -+ if (idef->ilsort != ilsortFE_SORTED) -+ { -+ gmx_incons("The bonded interactions are not sorted for free energy"); -+ } -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ reset_foreign_enerdata(enerd); -+ for (j = 0; j < efptNR; j++) -+ { -+ lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); -+ } -+ calc_bonds_lambda(fplog, idef, x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md, -+ fcd, DOMAINDECOMP(cr) ? 
cr->dd->gatindex : NULL); -+ sum_epot(&ir->opts, &(enerd->foreign_grpp), enerd->foreign_term); -+ enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; -+ } -+ } -+ debug_gmx(); -+ GMX_MPE_LOG(ev_calc_bonds_finish); -+ wallcycle_sub_stop(wcycle, ewcsBONDED); -+ } -+ -+ where(); -+ -+ *cycles_pme = 0; -+ if (EEL_FULL(fr->eeltype)) -+ { -+ bSB = (ir->nwall == 2); -+ if (bSB) -+ { -+ copy_mat(box, boxs); -+ svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]); -+ box_size[ZZ] *= ir->wall_ewald_zfac; -+ } -+ -+ clear_mat(fr->vir_el_recip); -+ -+ if (fr->bEwald) -+ { -+ Vcorr = 0; -+ dvdl = 0; -+ -+ /* With the Verlet scheme exclusion forces are calculated -+ * in the non-bonded kernel. -+ */ -+ /* The TPI molecule does not have exclusions with the rest -+ * of the system and no intra-molecular PME grid contributions -+ * will be calculated in gmx_pme_calc_energy. -+ */ -+ if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) || -+ ir->ewald_geometry != eewg3D || -+ ir->epsilon_surface != 0) -+ { -+ int nthreads, t; -+ -+ wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); -+ -+ if (fr->n_tpi > 0) -+ { -+ gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); -+ } -+ -+ nthreads = gmx_omp_nthreads_get(emntBonded); -+#pragma omp parallel for num_threads(nthreads) schedule(static) -+ for (t = 0; t < nthreads; t++) -+ { -+ int s, e, i; -+ rvec *fnv; -+ tensor *vir; -+ real *Vcorrt, *dvdlt; -+ if (t == 0) -+ { -+ fnv = fr->f_novirsum; -+ vir = &fr->vir_el_recip; -+ Vcorrt = &Vcorr; -+ dvdlt = &dvdl; -+ } -+ else -+ { -+ fnv = fr->f_t[t].f; -+ vir = &fr->f_t[t].vir; -+ Vcorrt = &fr->f_t[t].Vcorr; -+ dvdlt = &fr->f_t[t].dvdl[efptCOUL]; -+ for (i = 0; i < fr->natoms_force; i++) -+ { -+ clear_rvec(fnv[i]); -+ } -+ clear_mat(*vir); -+ } -+ *dvdlt = 0; -+ *Vcorrt = -+ ewald_LRcorrection(fplog, -+ fr->excl_load[t], fr->excl_load[t+1], -+ cr, t, fr, -+ md->chargeA, -+ md->nChargePerturbed ? md->chargeB : NULL, -+ ir->cutoff_scheme != ecutsVERLET, -+ excl, x, bSB ? boxs : box, mu_tot, -+ ir->ewald_geometry, -+ ir->epsilon_surface, -+ fnv, *vir, -+ lambda[efptCOUL], dvdlt); -+ } -+ if (nthreads > 1) -+ { -+ reduce_thread_forces(fr->natoms_force, fr->f_novirsum, -+ fr->vir_el_recip, -+ &Vcorr, efptCOUL, &dvdl, -+ nthreads, fr->f_t); -+ } -+ -+ wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); -+ } -+ -+ if (fr->n_tpi == 0) -+ { -+ Vcorr += ewald_charge_correction(cr, fr, lambda[efptCOUL], box, -+ &dvdl, fr->vir_el_recip); -+ } -+ -+ PRINT_SEPDVDL("Ewald excl./charge/dip. corr.", Vcorr, dvdl); -+ enerd->dvdl_lin[efptCOUL] += dvdl; -+ } -+ -+ status = 0; -+ Vlr = 0; -+ dvdl = 0; -+ switch (fr->eeltype) -+ { -+ case eelPME: -+ case eelPMESWITCH: -+ case eelPMEUSER: -+ case eelPMEUSERSWITCH: -+ case eelP3M_AD: -+ if (cr->duty & DUTY_PME) -+ { -+ assert(fr->n_tpi >= 0); -+ if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) -+ { -+ pme_flags = GMX_PME_SPREAD_Q | GMX_PME_SOLVE; -+ if (flags & GMX_FORCE_FORCES) -+ { -+ pme_flags |= GMX_PME_CALC_F; -+ } -+ if (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) -+ { -+ pme_flags |= GMX_PME_CALC_ENER_VIR; -+ } -+ if (fr->n_tpi > 0) -+ { -+ /* We don't calculate f, but we do want the potential */ -+ pme_flags |= GMX_PME_CALC_POT; -+ } -+ wallcycle_start(wcycle, ewcPMEMESH); -+ status = gmx_pme_do(fr->pmedata, -+ md->start, md->homenr - fr->n_tpi, -+ x, fr->f_novirsum, -+ md->chargeA, md->chargeB, -+ bSB ? boxs : box, cr, -+ DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0, -+ DOMAINDECOMP(cr) ? 
dd_pme_maxshift_y(cr->dd) : 0, -+ nrnb, wcycle, -+ fr->vir_el_recip, fr->ewaldcoeff, -+ &Vlr, lambda[efptCOUL], &dvdl, -+ pme_flags); -+ *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH); -+ -+ /* We should try to do as little computation after -+ * this as possible, because parallel PME synchronizes -+ * the nodes, so we want all load imbalance of the rest -+ * of the force calculation to be before the PME call. -+ * DD load balancing is done on the whole time of -+ * the force call (without PME). -+ */ -+ } -+ if (fr->n_tpi > 0) -+ { -+ /* Determine the PME grid energy of the test molecule -+ * with the PME grid potential of the other charges. -+ */ -+ gmx_pme_calc_energy(fr->pmedata, fr->n_tpi, -+ x + md->homenr - fr->n_tpi, -+ md->chargeA + md->homenr - fr->n_tpi, -+ &Vlr); -+ } -+ PRINT_SEPDVDL("PME mesh", Vlr, dvdl); -+ } -+ break; -+ case eelEWALD: -+ Vlr = do_ewald(fplog, FALSE, ir, x, fr->f_novirsum, -+ md->chargeA, md->chargeB, -+ box_size, cr, md->homenr, -+ fr->vir_el_recip, fr->ewaldcoeff, -+ lambda[efptCOUL], &dvdl, fr->ewald_table); -+ PRINT_SEPDVDL("Ewald long-range", Vlr, dvdl); -+ break; -+ default: -+ gmx_fatal(FARGS, "No such electrostatics method implemented %s", -+ eel_names[fr->eeltype]); -+ } -+ if (status != 0) -+ { -+ gmx_fatal(FARGS, "Error %d in long range electrostatics routine %s", -+ status, EELTYPE(fr->eeltype)); -+ } -+ /* Note that with separate PME nodes we get the real energies later */ -+ enerd->dvdl_lin[efptCOUL] += dvdl; -+ enerd->term[F_COUL_RECIP] = Vlr + Vcorr; -+ if (debug) -+ { -+ fprintf(debug, "Vlr = %g, Vcorr = %g, Vlr_corr = %g\n", -+ Vlr, Vcorr, enerd->term[F_COUL_RECIP]); -+ pr_rvecs(debug, 0, "vir_el_recip after corr", fr->vir_el_recip, DIM); -+ pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS); -+ } -+ } -+ else -+ { -+ if (EEL_RF(fr->eeltype)) -+ { -+ /* With the Verlet scheme exclusion forces are calculated -+ * in the non-bonded kernel. 
-+ */ -+ if (ir->cutoff_scheme != ecutsVERLET && fr->eeltype != eelRF_NEC) -+ { -+ dvdl = 0; -+ enerd->term[F_RF_EXCL] = -+ RF_excl_correction(fplog, fr, graph, md, excl, x, f, -+ fr->fshift, &pbc, lambda[efptCOUL], &dvdl); -+ } -+ -+ enerd->dvdl_lin[efptCOUL] += dvdl; -+ PRINT_SEPDVDL("RF exclusion correction", -+ enerd->term[F_RF_EXCL], dvdl); -+ } -+ } -+ where(); -+ debug_gmx(); -+ -+ if (debug) -+ { -+ print_nrnb(debug, nrnb); -+ } -+ debug_gmx(); -+ -+#ifdef GMX_MPI -+ if (TAKETIME) -+ { -+ t2 = MPI_Wtime(); -+ MPI_Barrier(cr->mpi_comm_mygroup); -+ t3 = MPI_Wtime(); -+ fr->t_wait += t3-t2; -+ if (fr->timesteps == 11) -+ { -+ fprintf(stderr, "* PP load balancing info: node %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n", -+ cr->nodeid, gmx_step_str(fr->timesteps, buf), -+ 100*fr->t_wait/(fr->t_wait+fr->t_fnbf), -+ (fr->t_fnbf+fr->t_wait)/fr->t_fnbf); -+ } -+ fr->timesteps++; -+ } -+#endif -+ -+ if (debug) -+ { -+ pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); -+ } -+ -+ GMX_MPE_LOG(ev_force_finish); -+ -+} -+ -+void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) -+{ -+ int i, n2; -+ -+ for (i = 0; i < F_NRE; i++) -+ { -+ enerd->term[i] = 0; -+ enerd->foreign_term[i] = 0; -+ } -+ -+ -+ for (i = 0; i < efptNR; i++) -+ { -+ enerd->dvdl_lin[i] = 0; -+ enerd->dvdl_nonlin[i] = 0; -+ } -+ -+ n2 = ngener*ngener; -+ if (debug) -+ { -+ fprintf(debug, "Creating %d sized group matrix for energies\n", n2); -+ } -+ enerd->grpp.nener = n2; -+ enerd->foreign_grpp.nener = n2; -+ for (i = 0; (i < egNR); i++) -+ { -+ snew(enerd->grpp.ener[i], n2); -+ snew(enerd->foreign_grpp.ener[i], n2); -+ } -+ -+ if (n_lambda) -+ { -+ enerd->n_lambda = 1 + n_lambda; -+ snew(enerd->enerpart_lambda, enerd->n_lambda); -+ } -+ else -+ { -+ enerd->n_lambda = 0; -+ } -+} -+ -+void destroy_enerdata(gmx_enerdata_t *enerd) -+{ -+ int i; -+ -+ for (i = 0; (i < egNR); i++) -+ { -+ sfree(enerd->grpp.ener[i]); -+ } -+ -+ for (i = 0; (i < egNR); i++) -+ { -+ sfree(enerd->foreign_grpp.ener[i]); -+ } -+ -+ if (enerd->n_lambda) -+ { -+ sfree(enerd->enerpart_lambda); -+ } -+} -+ -+static real sum_v(int n, real v[]) -+{ -+ real t; -+ int i; -+ -+ t = 0.0; -+ for (i = 0; (i < n); i++) -+ { -+ t = t + v[i]; -+ } -+ -+ return t; -+} -+ -+void sum_epot(t_grpopts *opts, gmx_grppairener_t *grpp, real *epot) -+{ -+ int i; -+ -+ /* Accumulate energies */ -+ epot[F_COUL_SR] = sum_v(grpp->nener, grpp->ener[egCOULSR]); -+ epot[F_LJ] = sum_v(grpp->nener, grpp->ener[egLJSR]); -+ epot[F_LJ14] = sum_v(grpp->nener, grpp->ener[egLJ14]); -+ epot[F_COUL14] = sum_v(grpp->nener, grpp->ener[egCOUL14]); -+ epot[F_COUL_LR] = sum_v(grpp->nener, grpp->ener[egCOULLR]); -+ epot[F_LJ_LR] = sum_v(grpp->nener, grpp->ener[egLJLR]); -+ /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */ -+ epot[F_GBPOL] += sum_v(grpp->nener, grpp->ener[egGB]); -+ -+/* lattice part of LR doesnt belong to any group -+ * and has been added earlier -+ */ -+ epot[F_BHAM] = sum_v(grpp->nener, grpp->ener[egBHAMSR]); -+ epot[F_BHAM_LR] = sum_v(grpp->nener, grpp->ener[egBHAMLR]); -+ -+ epot[F_EPOT] = 0; -+ for (i = 0; (i < F_EPOT); i++) -+ { -+ if (i != F_DISRESVIOL && i != F_ORIRESDEV) -+ { -+ epot[F_EPOT] += epot[i]; -+ } -+ } -+} -+ -+void sum_dhdl(gmx_enerdata_t *enerd, real *lambda, t_lambda *fepvals) -+{ -+ int i, j, index; -+ double dlam; -+ -+ enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW]; /* include dispersion correction */ -+ enerd->term[F_DVDL] = 0.0; -+ for (i = 0; i < efptNR; i++) -+ { -+ if 
(fepvals->separate_dvdl[i]) -+ { -+ /* could this be done more readably/compactly? */ -+ switch (i) -+ { -+ case (efptMASS): -+ index = F_DKDL; -+ break; -+ case (efptCOUL): -+ index = F_DVDL_COUL; -+ break; -+ case (efptVDW): -+ index = F_DVDL_VDW; -+ break; -+ case (efptBONDED): -+ index = F_DVDL_BONDED; -+ break; -+ case (efptRESTRAINT): -+ index = F_DVDL_RESTRAINT; -+ break; -+ default: -+ index = F_DVDL; -+ break; -+ } -+ enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; -+ if (debug) -+ { -+ fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n", -+ efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); -+ } -+ } -+ else -+ { -+ enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; -+ if (debug) -+ { -+ fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n", -+ efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); -+ } -+ } -+ } -+ -+ /* Notes on the foreign lambda free energy difference evaluation: -+ * Adding the potential and ekin terms that depend linearly on lambda -+ * as delta lam * dvdl to the energy differences is exact. -+ * For the constraints this is not exact, but we have no other option -+ * without literally changing the lengths and reevaluating the energies at each step. -+ * (try to remedy this post 4.6 - MRS) -+ * For the non-bonded LR term we assume that the soft-core (if present) -+ * no longer affects the energy beyond the short-range cut-off, -+ * which is a very good approximation (except for exotic settings). -+ * (investigate how to overcome this post 4.6 - MRS) -+ */ -+ if (fepvals->separate_dvdl[efptBONDED]) -+ { -+ enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR]; -+ } -+ else -+ { -+ enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR]; -+ } -+ enerd->term[F_DVDL_CONSTR] = 0; -+ -+ for (i = 0; i < fepvals->n_lambda; i++) -+ { /* note we are iterating over fepvals here! -+ For the current lam, dlam = 0 automatically, -+ so we don't need to add anything to the -+ enerd->enerpart_lambda[0] */ -+ -+ /* we don't need to worry about dvdl_lin contributions to dE at -+ current lambda, because the contributions to the current -+ lambda are automatically zeroed */ -+ -+ for (j = 0; j < efptNR; j++) -+ { -+ /* Note that this loop is over all dhdl components, not just the separated ones */ -+ dlam = (fepvals->all_lambda[j][i]-lambda[j]); -+ enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j]; -+ if (debug) -+ { -+ fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n", -+ fepvals->all_lambda[j][i], efpt_names[j], -+ (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]), -+ dlam, enerd->dvdl_lin[j]); -+ } -+ } -+ } -+} -+ -+ -+void reset_foreign_enerdata(gmx_enerdata_t *enerd) -+{ -+ int i, j; -+ -+ /* First reset all foreign energy components. Foreign energies always called on -+ neighbor search steps */ -+ for (i = 0; (i < egNR); i++) -+ { -+ for (j = 0; (j < enerd->grpp.nener); j++) -+ { -+ enerd->foreign_grpp.ener[i][j] = 0.0; -+ } -+ } -+ -+ /* potential energy components */ -+ for (i = 0; (i <= F_EPOT); i++) -+ { -+ enerd->foreign_term[i] = 0.0; -+ } -+} -+ -+void reset_enerdata(t_grpopts *opts, -+ t_forcerec *fr, gmx_bool bNS, -+ gmx_enerdata_t *enerd, -+ gmx_bool bMaster) -+{ -+ gmx_bool bKeepLR; -+ int i, j; -+ -+ /* First reset all energy components, except for the long range terms -+ * on the master at non neighbor search steps, since the long range -+ * terms have already been summed at the last neighbor search step. 
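sum_dhdl above builds the free-energy differences to the foreign lambda points by linear extrapolation of the components that are linear in lambda; as its comment notes, this is exact for those terms, while constraints and the soft-core/long-range parts are only approximated. As a restatement (LaTeX, not part of the patch), with $\lambda_j$ the current value of component $j$ and $\lambda_j^{(i)}$ its $i$-th foreign value:

$$\Delta U_i \mathrel{+}= \sum_j \bigl(\lambda_j^{(i)} - \lambda_j\bigr)\,\frac{\partial U_{\mathrm{lin}}}{\partial \lambda_j},$$

which is exactly the dlam * dvdl_lin[j] contribution added to enerpart_lambda[i+1] in the loop above.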
-+ */ -+ bKeepLR = (fr->bTwinRange && !bNS); -+ for (i = 0; (i < egNR); i++) -+ { -+ if (!(bKeepLR && bMaster && (i == egCOULLR || i == egLJLR))) -+ { -+ for (j = 0; (j < enerd->grpp.nener); j++) -+ { -+ enerd->grpp.ener[i][j] = 0.0; -+ } -+ } -+ } -+ for (i = 0; i < efptNR; i++) -+ { -+ enerd->dvdl_lin[i] = 0.0; -+ enerd->dvdl_nonlin[i] = 0.0; -+ } -+ -+ /* Normal potential energy components */ -+ for (i = 0; (i <= F_EPOT); i++) -+ { -+ enerd->term[i] = 0.0; -+ } -+ /* Initialize the dVdlambda term with the long range contribution */ -+ /* Initialize the dvdl term with the long range contribution */ -+ enerd->term[F_DVDL] = 0.0; -+ enerd->term[F_DVDL_COUL] = 0.0; -+ enerd->term[F_DVDL_VDW] = 0.0; -+ enerd->term[F_DVDL_BONDED] = 0.0; -+ enerd->term[F_DVDL_RESTRAINT] = 0.0; -+ enerd->term[F_DKDL] = 0.0; -+ if (enerd->n_lambda > 0) -+ { -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ enerd->enerpart_lambda[i] = 0.0; -+ } -+ } -+ /* reset foreign energy data - separate function since we also call it elsewhere */ -+ reset_foreign_enerdata(enerd); -+} -diff --git a/src/mdlib/minimize.c b/src/mdlib/minimize.c -index 8afe436..15fd15a 100644 ---- a/src/mdlib/minimize.c -+++ b/src/mdlib/minimize.c -@@ -83,6 +83,12 @@ - #include "gmx_omp_nthreads.h" - #include "md_logging.h" - -+/* PLUMED */ -+#include "../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+extern void(*plumedcmd)(plumed,const char*,const void*); -+/* END PLUMED */ - - typedef struct { - t_state s; -@@ -459,6 +465,47 @@ void init_em(FILE *fplog, const char *title, - - clear_rvec(mu_tot); - calc_shifts(ems->s.box, fr->shift_vec); -+ -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(cr->ms && cr->ms->nsim>1) { -+ if(MASTER(cr)) (*plumedcmd) (plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all); -+ }else{ -+ (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); -+ } -+ } -+ (*plumedcmd) (plumedmain,"GREX init",NULL); -+ } -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); -+ }else{ -+ (*plumedcmd) (plumedmain,"setMPIComm",&cr->mpi_comm_mysim); -+ } -+ } -+ (*plumedcmd) (plumedmain,"setNatoms",&top_global->natoms); -+ (*plumedcmd) (plumedmain,"setMDEngine","gromacs"); -+ (*plumedcmd) (plumedmain,"setLog",fplog); -+ real real_delta_t; -+ real_delta_t=ir->delta_t; -+ (*plumedcmd) (plumedmain,"setTimestep",&real_delta_t); -+ (*plumedcmd) (plumedmain,"init",NULL); -+ -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ }else{ -+ (*plumedcmd) (plumedmain,"setAtomsNlocal",&mdatoms->homenr); -+ (*plumedcmd) (plumedmain,"setAtomsContiguous",&mdatoms->start); -+ } -+ } -+ } -+ /* END PLUMED */ -+ - } - - static void finish_em(FILE *fplog, t_commrec *cr, gmx_mdoutf_t *outf, -@@ -738,6 +785,12 @@ static void evaluate_energy(FILE *fplog, gmx_bool bVerbose, t_commrec *cr, - em_dd_partition_system(fplog, count, cr, top_global, inputrec, - ems, top, mdatoms, fr, vsite, constr, - nrnb, wcycle); -+ /* PLUMED */ -+ if(plumedswitch){ -+ (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ /* END PLUMED */ - } - } - -@@ -745,6 +798,22 @@ static void evaluate_energy(FILE *fplog, gmx_bool bVerbose, t_commrec *cr, - /* do_force always puts the charge 
groups in the box and shifts again - * We do not unshift, so molecules are always whole in congrad.c - */ -+ /* PLUMED */ -+ int plumedNeedsEnergy=0; -+ matrix plumed_vir; -+ if(plumedswitch){ -+ long int lstep=count; (*plumedcmd)(plumedmain,"setStepLong",&count); -+ (*plumedcmd) (plumedmain,"setPositions",&ems->s.x[mdatoms->start][0]); -+ (*plumedcmd) (plumedmain,"setMasses",&mdatoms->massT[mdatoms->start]); -+ (*plumedcmd) (plumedmain,"setCharges",&mdatoms->chargeA[mdatoms->start]); -+ (*plumedcmd) (plumedmain,"setBox",&ems->s.box[0][0]); -+ (*plumedcmd) (plumedmain,"prepareCalc",NULL); -+ (*plumedcmd) (plumedmain,"setForces",&ems->f[mdatoms->start][0]); -+ (*plumedcmd) (plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ clear_mat(plumed_vir); -+ (*plumedcmd) (plumedmain,"setVirial",&plumed_vir[0][0]); -+ } -+ /* END PLUMED */ - do_force(fplog, cr, inputrec, - count, nrnb, wcycle, top, top_global, &top_global->groups, - ems->s.box, ems->s.x, &ems->s.hist, -@@ -753,6 +822,19 @@ static void evaluate_energy(FILE *fplog, gmx_bool bVerbose, t_commrec *cr, - GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | - GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | - (bNS ? GMX_FORCE_NS | GMX_FORCE_DO_LR : 0)); -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(plumedNeedsEnergy) { -+ msmul(force_vir,2.0,plumed_vir); -+ (*plumedcmd) (plumedmain,"setEnergy",&enerd->term[F_EPOT]); -+ (*plumedcmd) (plumedmain,"performCalc",NULL); -+ msmul(plumed_vir,0.5,force_vir); -+ } else { -+ msmul(plumed_vir,0.5,plumed_vir); -+ m_add(force_vir,plumed_vir,force_vir); -+ } -+ } -+ /* END PLUMED */ - - /* Clear the unused shake virial and pressure */ - clear_mat(shake_vir); -diff --git a/src/mdlib/minimize.c.preplumed b/src/mdlib/minimize.c.preplumed -new file mode 100644 -index 0000000..8afe436 ---- /dev/null -+++ b/src/mdlib/minimize.c.preplumed -@@ -0,0 +1,2864 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team, -+ * check out http://www.gromacs.org for more information. -+ * Copyright (c) 2012,2013, by the GROMACS development team, led by -+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many -+ * others, as listed in the AUTHORS file in the top-level source -+ * directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. 
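The evaluate_energy changes earlier in this hunk show the per-step order in which the minimizers drive PLUMED: register the step, positions, masses, charges, box and force array, call "prepareCalc", ask whether the bias needs the total energy, register a cleared virial, run do_force, and only afterwards hand over the energy and call "performCalc" (when the energy is not needed, "performCalc" has already run inside the do_force_lowlevel hook). A condensed sketch of that sequence, with the command strings taken from the patch and x, f, box, masses, charges, step, epot as stand-in variables:

    /* Per-step PLUMED driving sequence (condensed sketch, not a literal excerpt). */
    int    plumedNeedsEnergy = 0;
    matrix plumed_vir;

    (*plumedcmd)(plumedmain, "setStepLong",    &step);
    (*plumedcmd)(plumedmain, "setPositions",   &x[0][0]);
    (*plumedcmd)(plumedmain, "setMasses",      masses);
    (*plumedcmd)(plumedmain, "setCharges",     charges);
    (*plumedcmd)(plumedmain, "setBox",         &box[0][0]);
    (*plumedcmd)(plumedmain, "prepareCalc",    NULL);
    (*plumedcmd)(plumedmain, "setForces",      &f[0][0]);
    (*plumedcmd)(plumedmain, "isEnergyNeeded", &plumedNeedsEnergy);
    clear_mat(plumed_vir);
    (*plumedcmd)(plumedmain, "setVirial",      &plumed_vir[0][0]);
    /* ... do_force(...) runs here ... */
    if (plumedNeedsEnergy)
    {
        (*plumedcmd)(plumedmain, "setEnergy",   &epot);
        (*plumedcmd)(plumedmain, "performCalc", NULL);
    }
    /* the patch then folds plumed_vir back into GROMACS' force_vir */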
Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include -+#include -+#include "sysstuff.h" -+#include "string2.h" -+#include "network.h" -+#include "confio.h" -+#include "copyrite.h" -+#include "smalloc.h" -+#include "nrnb.h" -+#include "main.h" -+#include "force.h" -+#include "macros.h" -+#include "random.h" -+#include "names.h" -+#include "gmx_fatal.h" -+#include "txtdump.h" -+#include "typedefs.h" -+#include "update.h" -+#include "constr.h" -+#include "vec.h" -+#include "statutil.h" -+#include "tgroup.h" -+#include "mdebin.h" -+#include "vsite.h" -+#include "force.h" -+#include "mdrun.h" -+#include "md_support.h" -+#include "sim_util.h" -+#include "domdec.h" -+#include "partdec.h" -+#include "trnio.h" -+#include "sparsematrix.h" -+#include "mtxio.h" -+#include "mdatoms.h" -+#include "ns.h" -+#include "gmx_wallcycle.h" -+#include "mtop_util.h" -+#include "gmxfio.h" -+#include "pme.h" -+#include "bondf.h" -+#include "gmx_omp_nthreads.h" -+#include "md_logging.h" -+ -+ -+typedef struct { -+ t_state s; -+ rvec *f; -+ real epot; -+ real fnorm; -+ real fmax; -+ int a_fmax; -+} em_state_t; -+ -+static em_state_t *init_em_state() -+{ -+ em_state_t *ems; -+ -+ snew(ems, 1); -+ -+ /* does this need to be here? Should the array be declared differently (staticaly)in the state definition? */ -+ snew(ems->s.lambda, efptNR); -+ -+ return ems; -+} -+ -+static void print_em_start(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime, -+ gmx_wallcycle_t wcycle, -+ const char *name) -+{ -+ runtime_start(runtime); -+ wallcycle_start(wcycle, ewcRUN); -+ print_start(fplog, cr, runtime, name); -+} -+ -+static void em_time_end(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime, -+ gmx_wallcycle_t wcycle) -+{ -+ wallcycle_stop(wcycle, ewcRUN); -+ -+ runtime_end(runtime); -+} -+ -+static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) -+{ -+ fprintf(out, "\n"); -+ fprintf(out, "%s:\n", minimizer); -+ fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); -+ fprintf(out, " Number of steps = %12d\n", nsteps); -+} -+ -+static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain) -+{ -+ char buffer[2048]; -+ if (bLastStep) -+ { -+ sprintf(buffer, -+ "\nEnergy minimization reached the maximum number " -+ "of steps before the forces reached the requested " -+ "precision Fmax < %g.\n", ftol); -+ } -+ else -+ { -+ sprintf(buffer, -+ "\nEnergy minimization has stopped, but the forces have " -+ "not converged to the requested precision Fmax < %g (which " -+ "may not be possible for your system). It stopped " -+ "because the algorithm tried to make a new step whose size " -+ "was too small, or there was no change in the energy since " -+ "last step. Either way, we regard the minimization as " -+ "converged to within the available machine precision, " -+ "given your starting configuration and EM parameters.\n%s%s", -+ ftol, -+ sizeof(real) < sizeof(double) ? -+ "\nDouble precision normally gives you higher accuracy, but " -+ "this is often not needed for preparing to run molecular " -+ "dynamics.\n" : -+ "", -+ bConstrain ? 
-+ "You might need to increase your constraint accuracy, or turn\n" -+ "off constraints altogether (set constraints = none in mdp file)\n" : -+ ""); -+ } -+ fputs(wrap_lines(buffer, 78, 0, FALSE), fp); -+} -+ -+ -+ -+static void print_converged(FILE *fp, const char *alg, real ftol, -+ gmx_large_int_t count, gmx_bool bDone, gmx_large_int_t nsteps, -+ real epot, real fmax, int nfmax, real fnorm) -+{ -+ char buf[STEPSTRSIZE]; -+ -+ if (bDone) -+ { -+ fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", -+ alg, ftol, gmx_step_str(count, buf)); -+ } -+ else if (count < nsteps) -+ { -+ fprintf(fp, "\n%s converged to machine precision in %s steps,\n" -+ "but did not reach the requested Fmax < %g.\n", -+ alg, gmx_step_str(count, buf), ftol); -+ } -+ else -+ { -+ fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", -+ alg, ftol, gmx_step_str(count, buf)); -+ } -+ -+#ifdef GMX_DOUBLE -+ fprintf(fp, "Potential Energy = %21.14e\n", epot); -+ fprintf(fp, "Maximum force = %21.14e on atom %d\n", fmax, nfmax+1); -+ fprintf(fp, "Norm of force = %21.14e\n", fnorm); -+#else -+ fprintf(fp, "Potential Energy = %14.7e\n", epot); -+ fprintf(fp, "Maximum force = %14.7e on atom %d\n", fmax, nfmax+1); -+ fprintf(fp, "Norm of force = %14.7e\n", fnorm); -+#endif -+} -+ -+static void get_f_norm_max(t_commrec *cr, -+ t_grpopts *opts, t_mdatoms *mdatoms, rvec *f, -+ real *fnorm, real *fmax, int *a_fmax) -+{ -+ double fnorm2, *sum; -+ real fmax2, fmax2_0, fam; -+ int la_max, a_max, start, end, i, m, gf; -+ -+ /* This routine finds the largest force and returns it. -+ * On parallel machines the global max is taken. -+ */ -+ fnorm2 = 0; -+ fmax2 = 0; -+ la_max = -1; -+ gf = 0; -+ start = mdatoms->start; -+ end = mdatoms->homenr + start; -+ if (mdatoms->cFREEZE) -+ { -+ for (i = start; i < end; i++) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ fam = 0; -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ fam += sqr(f[i][m]); -+ } -+ } -+ fnorm2 += fam; -+ if (fam > fmax2) -+ { -+ fmax2 = fam; -+ la_max = i; -+ } -+ } -+ } -+ else -+ { -+ for (i = start; i < end; i++) -+ { -+ fam = norm2(f[i]); -+ fnorm2 += fam; -+ if (fam > fmax2) -+ { -+ fmax2 = fam; -+ la_max = i; -+ } -+ } -+ } -+ -+ if (la_max >= 0 && DOMAINDECOMP(cr)) -+ { -+ a_max = cr->dd->gatindex[la_max]; -+ } -+ else -+ { -+ a_max = la_max; -+ } -+ if (PAR(cr)) -+ { -+ snew(sum, 2*cr->nnodes+1); -+ sum[2*cr->nodeid] = fmax2; -+ sum[2*cr->nodeid+1] = a_max; -+ sum[2*cr->nnodes] = fnorm2; -+ gmx_sumd(2*cr->nnodes+1, sum, cr); -+ fnorm2 = sum[2*cr->nnodes]; -+ /* Determine the global maximum */ -+ for (i = 0; i < cr->nnodes; i++) -+ { -+ if (sum[2*i] > fmax2) -+ { -+ fmax2 = sum[2*i]; -+ a_max = (int)(sum[2*i+1] + 0.5); -+ } -+ } -+ sfree(sum); -+ } -+ -+ if (fnorm) -+ { -+ *fnorm = sqrt(fnorm2); -+ } -+ if (fmax) -+ { -+ *fmax = sqrt(fmax2); -+ } -+ if (a_fmax) -+ { -+ *a_fmax = a_max; -+ } -+} -+ -+static void get_state_f_norm_max(t_commrec *cr, -+ t_grpopts *opts, t_mdatoms *mdatoms, -+ em_state_t *ems) -+{ -+ get_f_norm_max(cr, opts, mdatoms, ems->f, &ems->fnorm, &ems->fmax, &ems->a_fmax); -+} -+ -+void init_em(FILE *fplog, const char *title, -+ t_commrec *cr, t_inputrec *ir, -+ t_state *state_global, gmx_mtop_t *top_global, -+ em_state_t *ems, gmx_localtop_t **top, -+ rvec **f, rvec **f_global, -+ t_nrnb *nrnb, rvec mu_tot, -+ t_forcerec *fr, gmx_enerdata_t **enerd, -+ t_graph **graph, t_mdatoms *mdatoms, gmx_global_stat_t *gstat, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int nfile, const t_filenm fnm[], -+ gmx_mdoutf_t **outf, 
t_mdebin **mdebin) -+{ -+ int start, homenr, i; -+ real dvdl_constr; -+ -+ if (fplog) -+ { -+ fprintf(fplog, "Initiating %s\n", title); -+ } -+ -+ state_global->ngtc = 0; -+ -+ /* Initialize lambda variables */ -+ initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, NULL); -+ -+ init_nrnb(nrnb); -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ *top = dd_init_local_top(top_global); -+ -+ dd_init_local_state(cr->dd, state_global, &ems->s); -+ -+ *f = NULL; -+ -+ /* Distribute the charge groups over the nodes from the master node */ -+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ &ems->s, &ems->f, mdatoms, *top, -+ fr, vsite, NULL, constr, -+ nrnb, NULL, FALSE); -+ dd_store_state(cr->dd, &ems->s); -+ -+ if (ir->nstfout) -+ { -+ snew(*f_global, top_global->natoms); -+ } -+ else -+ { -+ *f_global = NULL; -+ } -+ *graph = NULL; -+ } -+ else -+ { -+ snew(*f, top_global->natoms); -+ -+ /* Just copy the state */ -+ ems->s = *state_global; -+ snew(ems->s.x, ems->s.nalloc); -+ snew(ems->f, ems->s.nalloc); -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(state_global->x[i], ems->s.x[i]); -+ } -+ copy_mat(state_global->box, ems->s.box); -+ -+ if (PAR(cr) && ir->eI != eiNM) -+ { -+ /* Initialize the particle decomposition and split the topology */ -+ *top = split_system(fplog, top_global, ir, cr); -+ -+ pd_cg_range(cr, &fr->cg0, &fr->hcg); -+ } -+ else -+ { -+ *top = gmx_mtop_generate_local_top(top_global, ir); -+ } -+ *f_global = *f; -+ -+ forcerec_set_excl_load(fr, *top, cr); -+ -+ setup_bonded_threading(fr, &(*top)->idef); -+ -+ if (ir->ePBC != epbcNONE && !fr->bMolPBC) -+ { -+ *graph = mk_graph(fplog, &((*top)->idef), 0, top_global->natoms, FALSE, FALSE); -+ } -+ else -+ { -+ *graph = NULL; -+ } -+ -+ if (PARTDECOMP(cr)) -+ { -+ pd_at_range(cr, &start, &homenr); -+ homenr -= start; -+ } -+ else -+ { -+ start = 0; -+ homenr = top_global->natoms; -+ } -+ atoms2md(top_global, ir, 0, NULL, start, homenr, mdatoms); -+ update_mdatoms(mdatoms, state_global->lambda[efptFEP]); -+ -+ if (vsite) -+ { -+ set_vsite_top(vsite, *top, mdatoms, cr); -+ } -+ } -+ -+ if (constr) -+ { -+ if (ir->eConstrAlg == econtSHAKE && -+ gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) -+ { -+ gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", -+ econstr_names[econtSHAKE], econstr_names[econtLINCS]); -+ } -+ -+ if (!DOMAINDECOMP(cr)) -+ { -+ set_constraints(constr, *top, ir, mdatoms, cr); -+ } -+ -+ if (!ir->bContinuation) -+ { -+ /* Constrain the starting coordinates */ -+ dvdl_constr = 0; -+ constrain(PAR(cr) ? 
NULL : fplog, TRUE, TRUE, constr, &(*top)->idef, -+ ir, NULL, cr, -1, 0, mdatoms, -+ ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box, -+ ems->s.lambda[efptFEP], &dvdl_constr, -+ NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); -+ } -+ } -+ -+ if (PAR(cr)) -+ { -+ *gstat = global_stat_init(ir); -+ } -+ -+ *outf = init_mdoutf(nfile, fnm, 0, cr, ir, NULL); -+ -+ snew(*enerd, 1); -+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, -+ *enerd); -+ -+ if (mdebin != NULL) -+ { -+ /* Init bin for energy stuff */ -+ *mdebin = init_mdebin((*outf)->fp_ene, top_global, ir, NULL); -+ } -+ -+ clear_rvec(mu_tot); -+ calc_shifts(ems->s.box, fr->shift_vec); -+} -+ -+static void finish_em(FILE *fplog, t_commrec *cr, gmx_mdoutf_t *outf, -+ gmx_runtime_t *runtime, gmx_wallcycle_t wcycle) -+{ -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell the PME only node to finish */ -+ gmx_pme_send_finish(cr); -+ } -+ -+ done_mdoutf(outf); -+ -+ em_time_end(fplog, cr, runtime, wcycle); -+} -+ -+static void swap_em_state(em_state_t *ems1, em_state_t *ems2) -+{ -+ em_state_t tmp; -+ -+ tmp = *ems1; -+ *ems1 = *ems2; -+ *ems2 = tmp; -+} -+ -+static void copy_em_coords(em_state_t *ems, t_state *state) -+{ -+ int i; -+ -+ for (i = 0; (i < state->natoms); i++) -+ { -+ copy_rvec(ems->s.x[i], state->x[i]); -+ } -+} -+ -+static void write_em_traj(FILE *fplog, t_commrec *cr, -+ gmx_mdoutf_t *outf, -+ gmx_bool bX, gmx_bool bF, const char *confout, -+ gmx_mtop_t *top_global, -+ t_inputrec *ir, gmx_large_int_t step, -+ em_state_t *state, -+ t_state *state_global, rvec *f_global) -+{ -+ int mdof_flags; -+ -+ if ((bX || bF || confout != NULL) && !DOMAINDECOMP(cr)) -+ { -+ copy_em_coords(state, state_global); -+ f_global = state->f; -+ } -+ -+ mdof_flags = 0; -+ if (bX) -+ { -+ mdof_flags |= MDOF_X; -+ } -+ if (bF) -+ { -+ mdof_flags |= MDOF_F; -+ } -+ write_traj(fplog, cr, outf, mdof_flags, -+ top_global, step, (double)step, -+ &state->s, state_global, state->f, f_global, NULL, NULL); -+ -+ if (confout != NULL && MASTER(cr)) -+ { -+ if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) -+ { -+ /* Make molecules whole only for confout writing */ -+ do_pbc_mtop(fplog, ir->ePBC, state_global->box, top_global, -+ state_global->x); -+ } -+ -+ write_sto_conf_mtop(confout, -+ *top_global->name, top_global, -+ state_global->x, NULL, ir->ePBC, state_global->box); -+ } -+} -+ -+static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, -+ gmx_bool bMolPBC, -+ em_state_t *ems1, real a, rvec *f, em_state_t *ems2, -+ gmx_constr_t constr, gmx_localtop_t *top, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_large_int_t count) -+ -+{ -+ t_state *s1, *s2; -+ int i; -+ int start, end; -+ rvec *x1, *x2; -+ real dvdl_constr; -+ -+ s1 = &ems1->s; -+ s2 = &ems2->s; -+ -+ if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) -+ { -+ gmx_incons("state mismatch in do_em_step"); -+ } -+ -+ s2->flags = s1->flags; -+ -+ if (s2->nalloc != s1->nalloc) -+ { -+ s2->nalloc = s1->nalloc; -+ srenew(s2->x, s1->nalloc); -+ srenew(ems2->f, s1->nalloc); -+ if (s2->flags & (1<cg_p, s1->nalloc); -+ } -+ } -+ -+ s2->natoms = s1->natoms; -+ copy_mat(s1->box, s2->box); -+ /* Copy free energy state */ -+ for (i = 0; i < efptNR; i++) -+ { -+ s2->lambda[i] = s1->lambda[i]; -+ } -+ copy_mat(s1->box, s2->box); -+ -+ start = md->start; -+ end = md->start + md->homenr; -+ -+ x1 = s1->x; -+ x2 = s2->x; -+ -+#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate)) -+ { -+ int gf, i, m; -+ -+ gf = 0; -+#pragma omp for 
schedule(static) nowait -+ for (i = start; i < end; i++) -+ { -+ if (md->cFREEZE) -+ { -+ gf = md->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (ir->opts.nFreeze[gf][m]) -+ { -+ x2[i][m] = x1[i][m]; -+ } -+ else -+ { -+ x2[i][m] = x1[i][m] + a*f[i][m]; -+ } -+ } -+ } -+ -+ if (s2->flags & (1<cg_p; -+ x2 = s2->cg_p; -+#pragma omp for schedule(static) nowait -+ for (i = start; i < end; i++) -+ { -+ copy_rvec(x1[i], x2[i]); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ s2->ddp_count = s1->ddp_count; -+ if (s2->cg_gl_nalloc < s1->cg_gl_nalloc) -+ { -+#pragma omp barrier -+ s2->cg_gl_nalloc = s1->cg_gl_nalloc; -+ srenew(s2->cg_gl, s2->cg_gl_nalloc); -+#pragma omp barrier -+ } -+ s2->ncg_gl = s1->ncg_gl; -+#pragma omp for schedule(static) nowait -+ for (i = 0; i < s2->ncg_gl; i++) -+ { -+ s2->cg_gl[i] = s1->cg_gl[i]; -+ } -+ s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; -+ } -+ } -+ -+ if (constr) -+ { -+ wallcycle_start(wcycle, ewcCONSTR); -+ dvdl_constr = 0; -+ constrain(NULL, TRUE, TRUE, constr, &top->idef, -+ ir, NULL, cr, count, 0, md, -+ s1->x, s2->x, NULL, bMolPBC, s2->box, -+ s2->lambda[efptBONDED], &dvdl_constr, -+ NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); -+ wallcycle_stop(wcycle, ewcCONSTR); -+ } -+} -+ -+static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr, -+ gmx_mtop_t *top_global, t_inputrec *ir, -+ em_state_t *ems, gmx_localtop_t *top, -+ t_mdatoms *mdatoms, t_forcerec *fr, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle) -+{ -+ /* Repartition the domain decomposition */ -+ wallcycle_start(wcycle, ewcDOMDEC); -+ dd_partition_system(fplog, step, cr, FALSE, 1, -+ NULL, top_global, ir, -+ &ems->s, &ems->f, -+ mdatoms, top, fr, vsite, NULL, constr, -+ nrnb, wcycle, FALSE); -+ dd_store_state(cr->dd, &ems->s); -+ wallcycle_stop(wcycle, ewcDOMDEC); -+} -+ -+static void evaluate_energy(FILE *fplog, gmx_bool bVerbose, t_commrec *cr, -+ t_state *state_global, gmx_mtop_t *top_global, -+ em_state_t *ems, gmx_localtop_t *top, -+ t_inputrec *inputrec, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_global_stat_t gstat, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ t_fcdata *fcd, -+ t_graph *graph, t_mdatoms *mdatoms, -+ t_forcerec *fr, rvec mu_tot, -+ gmx_enerdata_t *enerd, tensor vir, tensor pres, -+ gmx_large_int_t count, gmx_bool bFirst) -+{ -+ real t; -+ gmx_bool bNS; -+ int nabnsb; -+ tensor force_vir, shake_vir, ekin; -+ real dvdl_constr, prescorr, enercorr, dvdlcorr; -+ real terminate = 0; -+ -+ /* Set the time to the initial time, the time does not change during EM */ -+ t = inputrec->init_t; -+ -+ if (bFirst || -+ (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) -+ { -+ /* This the first state or an old state used before the last ns */ -+ bNS = TRUE; -+ } -+ else -+ { -+ bNS = FALSE; -+ if (inputrec->nstlist > 0) -+ { -+ bNS = TRUE; -+ } -+ else if (inputrec->nstlist == -1) -+ { -+ nabnsb = natoms_beyond_ns_buffer(inputrec, fr, &top->cgs, NULL, ems->s.x); -+ if (PAR(cr)) -+ { -+ gmx_sumi(1, &nabnsb, cr); -+ } -+ bNS = (nabnsb > 0); -+ } -+ } -+ -+ if (vsite) -+ { -+ construct_vsites(fplog, vsite, ems->s.x, nrnb, 1, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, graph, cr, ems->s.box); -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ if (bNS) -+ { -+ /* Repartition the domain decomposition */ -+ em_dd_partition_system(fplog, count, cr, top_global, inputrec, -+ ems, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ } -+ -+ /* Calc force & energy on new trial position */ -+ /* do_force 
always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole in congrad.c -+ */ -+ do_force(fplog, cr, inputrec, -+ count, nrnb, wcycle, top, top_global, &top_global->groups, -+ ems->s.box, ems->s.x, &ems->s.hist, -+ ems->f, force_vir, mdatoms, enerd, fcd, -+ ems->s.lambda, graph, fr, vsite, mu_tot, t, NULL, NULL, TRUE, -+ GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | -+ GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | -+ (bNS ? GMX_FORCE_NS | GMX_FORCE_DO_LR : 0)); -+ -+ /* Clear the unused shake virial and pressure */ -+ clear_mat(shake_vir); -+ clear_mat(pres); -+ -+ /* Communicate stuff when parallel */ -+ if (PAR(cr) && inputrec->eI != eiNM) -+ { -+ wallcycle_start(wcycle, ewcMoveE); -+ -+ global_stat(fplog, gstat, cr, enerd, force_vir, shake_vir, mu_tot, -+ inputrec, NULL, NULL, NULL, 1, &terminate, -+ top_global, &ems->s, FALSE, -+ CGLO_ENERGY | -+ CGLO_PRESSURE | -+ CGLO_CONSTRAINT | -+ CGLO_FIRSTITERATE); -+ -+ wallcycle_stop(wcycle, ewcMoveE); -+ } -+ -+ /* Calculate long range corrections to pressure and energy */ -+ calc_dispcorr(fplog, inputrec, fr, count, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW], -+ pres, force_vir, &prescorr, &enercorr, &dvdlcorr); -+ enerd->term[F_DISPCORR] = enercorr; -+ enerd->term[F_EPOT] += enercorr; -+ enerd->term[F_PRES] += prescorr; -+ enerd->term[F_DVDL] += dvdlcorr; -+ -+ ems->epot = enerd->term[F_EPOT]; -+ -+ if (constr) -+ { -+ /* Project out the constraint components of the force */ -+ wallcycle_start(wcycle, ewcCONSTR); -+ dvdl_constr = 0; -+ constrain(NULL, FALSE, FALSE, constr, &top->idef, -+ inputrec, NULL, cr, count, 0, mdatoms, -+ ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box, -+ ems->s.lambda[efptBONDED], &dvdl_constr, -+ NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0); -+ if (fr->bSepDVDL && fplog) -+ { -+ fprintf(fplog, sepdvdlformat, "Constraints", t, dvdl_constr); -+ } -+ enerd->term[F_DVDL_CONSTR] += dvdl_constr; -+ m_add(force_vir, shake_vir, vir); -+ wallcycle_stop(wcycle, ewcCONSTR); -+ } -+ else -+ { -+ copy_mat(force_vir, vir); -+ } -+ -+ clear_mat(ekin); -+ enerd->term[F_PRES] = -+ calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); -+ -+ sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); -+ -+ if (EI_ENERGY_MINIMIZATION(inputrec->eI)) -+ { -+ get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, ems); -+ } -+} -+ -+static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, -+ gmx_mtop_t *mtop, -+ em_state_t *s_min, em_state_t *s_b) -+{ -+ rvec *fm, *fb, *fmg; -+ t_block *cgs_gl; -+ int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; -+ double partsum; -+ unsigned char *grpnrFREEZE; -+ -+ if (debug) -+ { -+ fprintf(debug, "Doing reorder_partsum\n"); -+ } -+ -+ fm = s_min->f; -+ fb = s_b->f; -+ -+ cgs_gl = dd_charge_groups_global(cr->dd); -+ index = cgs_gl->index; -+ -+ /* Collect fm in a global vector fmg. -+ * This conflicts with the spirit of domain decomposition, -+ * but to fully optimize this a much more complicated algorithm is required. 
-+ */ -+ snew(fmg, mtop->natoms); -+ -+ ncg = s_min->s.ncg_gl; -+ cg_gl = s_min->s.cg_gl; -+ i = 0; -+ for (c = 0; c < ncg; c++) -+ { -+ cg = cg_gl[c]; -+ a0 = index[cg]; -+ a1 = index[cg+1]; -+ for (a = a0; a < a1; a++) -+ { -+ copy_rvec(fm[i], fmg[a]); -+ i++; -+ } -+ } -+ gmx_sum(mtop->natoms*3, fmg[0], cr); -+ -+ /* Now we will determine the part of the sum for the cgs in state s_b */ -+ ncg = s_b->s.ncg_gl; -+ cg_gl = s_b->s.cg_gl; -+ partsum = 0; -+ i = 0; -+ gf = 0; -+ grpnrFREEZE = mtop->groups.grpnr[egcFREEZE]; -+ for (c = 0; c < ncg; c++) -+ { -+ cg = cg_gl[c]; -+ a0 = index[cg]; -+ a1 = index[cg+1]; -+ for (a = a0; a < a1; a++) -+ { -+ if (mdatoms->cFREEZE && grpnrFREEZE) -+ { -+ gf = grpnrFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; -+ } -+ } -+ i++; -+ } -+ } -+ -+ sfree(fmg); -+ -+ return partsum; -+} -+ -+static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, -+ gmx_mtop_t *mtop, -+ em_state_t *s_min, em_state_t *s_b) -+{ -+ rvec *fm, *fb; -+ double sum; -+ int gf, i, m; -+ -+ /* This is just the classical Polak-Ribiere calculation of beta; -+ * it looks a bit complicated since we take freeze groups into account, -+ * and might have to sum it in parallel runs. -+ */ -+ -+ if (!DOMAINDECOMP(cr) || -+ (s_min->s.ddp_count == cr->dd->ddp_count && -+ s_b->s.ddp_count == cr->dd->ddp_count)) -+ { -+ fm = s_min->f; -+ fb = s_b->f; -+ sum = 0; -+ gf = 0; -+ /* This part of code can be incorrect with DD, -+ * since the atom ordering in s_b and s_min might differ. -+ */ -+ for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ sum += (fb[i][m] - fm[i][m])*fb[i][m]; -+ } -+ } -+ } -+ } -+ else -+ { -+ /* We need to reorder cgs while summing */ -+ sum = reorder_partsum(cr, opts, mdatoms, mtop, s_min, s_b); -+ } -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &sum, cr); -+ } -+ -+ return sum/sqr(s_min->fnorm); -+} -+ -+double do_cg(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, -+ int nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t ed, -+ t_forcerec *fr, -+ int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, -+ gmx_membed_t membed, -+ real cpt_period, real max_hours, -+ const char *deviceOptions, -+ unsigned long Flags, -+ gmx_runtime_t *runtime) -+{ -+ const char *CG = "Polak-Ribiere Conjugate Gradients"; -+ -+ em_state_t *s_min, *s_a, *s_b, *s_c; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ rvec *f_global, *p, *sf, *sfm; -+ double gpa, gpb, gpc, tmp, sum[2], minstep; -+ real fnormn; -+ real stepsize; -+ real a, b, c, beta = 0.0; -+ real epot_repl = 0; -+ real pnorm; -+ t_mdebin *mdebin; -+ gmx_bool converged, foundlower; -+ rvec mu_tot; -+ gmx_bool do_log = FALSE, do_ene = FALSE, do_x, do_f; -+ tensor vir, pres; -+ int number_steps, neval = 0, nstcg = inputrec->nstcgsteep; -+ gmx_mdoutf_t *outf; -+ int i, m, gf, step, nminstep; -+ real terminate = 0; -+ -+ step = 0; -+ -+ s_min = init_em_state(); -+ s_a = init_em_state(); -+ s_b = init_em_state(); -+ s_c = init_em_state(); -+ -+ /* Init em and store the 
local state in s_min */ -+ init_em(fplog, CG, cr, inputrec, -+ state_global, top_global, s_min, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin); -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, runtime, wcycle, CG); -+ -+ /* Max number of steps */ -+ number_steps = inputrec->nsteps; -+ -+ if (MASTER(cr)) -+ { -+ sp_header(stderr, CG, inputrec->em_tol, number_steps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, CG, inputrec->em_tol, number_steps); -+ } -+ -+ /* Call the force routine and some auxiliary (neighboursearching etc.) */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole in congrad.c -+ */ -+ evaluate_energy(fplog, bVerbose, cr, -+ state_global, top_global, s_min, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ where(); -+ -+ if (MASTER(cr)) -+ { -+ /* Copy stuff to the energy bin for easy printing etc. */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ print_ebin(outf->fp_ene, TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ where(); -+ -+ /* Estimate/guess the initial stepsize */ -+ stepsize = inputrec->em_stepsize/s_min->fnorm; -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, " F-max = %12.5e on atom %d\n", -+ s_min->fmax, s_min->a_fmax+1); -+ fprintf(stderr, " F-Norm = %12.5e\n", -+ s_min->fnorm/sqrt(state_global->natoms)); -+ fprintf(stderr, "\n"); -+ /* and copy to the log file too... */ -+ fprintf(fplog, " F-max = %12.5e on atom %d\n", -+ s_min->fmax, s_min->a_fmax+1); -+ fprintf(fplog, " F-Norm = %12.5e\n", -+ s_min->fnorm/sqrt(state_global->natoms)); -+ fprintf(fplog, "\n"); -+ } -+ /* Start the loop over CG steps. -+ * Each successful step is counted, and we continue until -+ * we either converge or reach the max number of steps. -+ */ -+ converged = FALSE; -+ for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) -+ { -+ -+ /* start taking steps in a new direction -+ * First time we enter the routine, beta=0, and the direction is -+ * simply the negative gradient. -+ */ -+ -+ /* Calculate the new direction in p, and the gradient in this direction, gpa */ -+ p = s_min->s.cg_p; -+ sf = s_min->f; -+ gpa = 0; -+ gf = 0; -+ for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!inputrec->opts.nFreeze[gf][m]) -+ { -+ p[i][m] = sf[i][m] + beta*p[i][m]; -+ gpa -= p[i][m]*sf[i][m]; -+ /* f is negative gradient, thus the sign */ -+ } -+ else -+ { -+ p[i][m] = 0; -+ } -+ } -+ } -+ -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpa, cr); -+ } -+ -+ /* Calculate the norm of the search vector */ -+ get_f_norm_max(cr, &(inputrec->opts), mdatoms, p, &pnorm, NULL, NULL); -+ -+ /* Just in case stepsize reaches zero due to numerical precision... */ -+ if (stepsize <= 0) -+ { -+ stepsize = inputrec->em_stepsize/pnorm; -+ } -+ -+ /* -+ * Double check the value of the derivative in the search direction. 
-+ * If it is positive it must be due to the old information in the -+ * CG formula, so just remove that and start over with beta=0. -+ * This corresponds to a steepest descent step. -+ */ -+ if (gpa > 0) -+ { -+ beta = 0; -+ step--; /* Don't count this step since we are restarting */ -+ continue; /* Go back to the beginning of the big for-loop */ -+ } -+ -+ /* Calculate minimum allowed stepsize, before the average (norm) -+ * relative change in coordinate is smaller than precision -+ */ -+ minstep = 0; -+ for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ tmp = fabs(s_min->s.x[i][m]); -+ if (tmp < 1.0) -+ { -+ tmp = 1.0; -+ } -+ tmp = p[i][m]/tmp; -+ minstep += tmp*tmp; -+ } -+ } -+ /* Add up from all CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &minstep, cr); -+ } -+ -+ minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms)); -+ -+ if (stepsize < minstep) -+ { -+ converged = TRUE; -+ break; -+ } -+ -+ /* Write coordinates if necessary */ -+ do_x = do_per_step(step, inputrec->nstxout); -+ do_f = do_per_step(step, inputrec->nstfout); -+ -+ write_em_traj(fplog, cr, outf, do_x, do_f, NULL, -+ top_global, inputrec, step, -+ s_min, state_global, f_global); -+ -+ /* Take a step downhill. -+ * In theory, we should minimize the function along this direction. -+ * That is quite possible, but it turns out to take 5-10 function evaluations -+ * for each line. However, we dont really need to find the exact minimum - -+ * it is much better to start a new CG step in a modified direction as soon -+ * as we are close to it. This will save a lot of energy evaluations. -+ * -+ * In practice, we just try to take a single step. -+ * If it worked (i.e. lowered the energy), we increase the stepsize but -+ * the continue straight to the next CG step without trying to find any minimum. -+ * If it didn't work (higher energy), there must be a minimum somewhere between -+ * the old position and the new one. -+ * -+ * Due to the finite numerical accuracy, it turns out that it is a good idea -+ * to even accept a SMALL increase in energy, if the derivative is still downhill. -+ * This leads to lower final energies in the tests I've done. 
/ Erik -+ */ -+ s_a->epot = s_min->epot; -+ a = 0.0; -+ c = a + stepsize; /* reference position along line is zero */ -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) -+ { -+ em_dd_partition_system(fplog, step, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Take a trial step (new coords in s_c) */ -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, s_min->s.cg_p, s_c, -+ constr, top, nrnb, wcycle, -1); -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ evaluate_energy(fplog, bVerbose, cr, -+ state_global, top_global, s_c, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, FALSE); -+ -+ /* Calc derivative along line */ -+ p = s_c->s.cg_p; -+ sf = s_c->f; -+ gpc = 0; -+ for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ gpc -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ -+ } -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpc, cr); -+ } -+ -+ /* This is the max amount of increase in energy we tolerate */ -+ tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot); -+ -+ /* Accept the step if the energy is lower, or if it is not significantly higher -+ * and the line derivative is still negative. -+ */ -+ if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) -+ { -+ foundlower = TRUE; -+ /* Great, we found a better energy. Increase step for next iteration -+ * if we are still going down, decrease it otherwise -+ */ -+ if (gpc < 0) -+ { -+ stepsize *= 1.618034; /* The golden section */ -+ } -+ else -+ { -+ stepsize *= 0.618034; /* 1/golden section */ -+ } -+ } -+ else -+ { -+ /* New energy is the same or higher. We will have to do some work -+ * to find a smaller value in the interval. Take smaller step next time! -+ */ -+ foundlower = FALSE; -+ stepsize *= 0.618034; -+ } -+ -+ -+ -+ -+ /* OK, if we didn't find a lower value we will have to locate one now - there must -+ * be one in the interval [a=0,c]. -+ * The same thing is valid here, though: Don't spend dozens of iterations to find -+ * the line minimum. We try to interpolate based on the derivative at the endpoints, -+ * and only continue until we find a lower value. In most cases this means 1-2 iterations. -+ * -+ * I also have a safeguard for potentially really patological functions so we never -+ * take more than 20 steps before we give up ... -+ * -+ * If we already found a lower value we just skip this step and continue to the update. -+ */ -+ if (!foundlower) -+ { -+ nminstep = 0; -+ -+ do -+ { -+ /* Select a new trial point. -+ * If the derivatives at points a & c have different sign we interpolate to zero, -+ * otherwise just do a bisection. 
-+ */ -+ if (gpa < 0 && gpc > 0) -+ { -+ b = a + gpa*(a-c)/(gpc-gpa); -+ } -+ else -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* safeguard if interpolation close to machine accuracy causes errors: -+ * never go outside the interval -+ */ -+ if (b <= a || b >= c) -+ { -+ b = 0.5*(a+c); -+ } -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) -+ { -+ /* Reload the old state */ -+ em_dd_partition_system(fplog, -1, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Take a trial step to this new point - new coords in s_b */ -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, s_min->s.cg_p, s_b, -+ constr, top, nrnb, wcycle, -1); -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ evaluate_energy(fplog, bVerbose, cr, -+ state_global, top_global, s_b, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, FALSE); -+ -+ /* p does not change within a step, but since the domain decomposition -+ * might change, we have to use cg_p of s_b here. -+ */ -+ p = s_b->s.cg_p; -+ sf = s_b->f; -+ gpb = 0; -+ for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ gpb -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ -+ } -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpb, cr); -+ } -+ -+ if (debug) -+ { -+ fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", -+ s_a->epot, s_b->epot, s_c->epot, gpb); -+ } -+ -+ epot_repl = s_b->epot; -+ -+ /* Keep one of the intervals based on the value of the derivative at the new point */ -+ if (gpb > 0) -+ { -+ /* Replace c endpoint with b */ -+ swap_em_state(s_b, s_c); -+ c = b; -+ gpc = gpb; -+ } -+ else -+ { -+ /* Replace a endpoint with b */ -+ swap_em_state(s_b, s_a); -+ a = b; -+ gpa = gpb; -+ } -+ -+ /* -+ * Stop search as soon as we find a value smaller than the endpoints. -+ * Never run more than 20 steps, no matter what. -+ */ -+ nminstep++; -+ } -+ while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && -+ (nminstep < 20)); -+ -+ if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || -+ nminstep >= 20) -+ { -+ /* OK. We couldn't find a significantly lower energy. -+ * If beta==0 this was steepest descent, and then we give up. -+ * If not, set beta=0 and restart with steepest descent before quitting. -+ */ -+ if (beta == 0.0) -+ { -+ /* Converged */ -+ converged = TRUE; -+ break; -+ } -+ else -+ { -+ /* Reset memory before giving up */ -+ beta = 0.0; -+ continue; -+ } -+ } -+ -+ /* Select min energy state of A & C, put the best in B. -+ */ -+ if (s_c->epot < s_a->epot) -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", -+ s_c->epot, s_a->epot); -+ } -+ swap_em_state(s_b, s_c); -+ gpb = gpc; -+ b = c; -+ } -+ else -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", -+ s_a->epot, s_c->epot); -+ } -+ swap_em_state(s_b, s_a); -+ gpb = gpa; -+ b = a; -+ } -+ -+ } -+ else -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", -+ s_c->epot); -+ } -+ swap_em_state(s_b, s_c); -+ gpb = gpc; -+ b = c; -+ } -+ -+ /* new search direction */ -+ /* beta = 0 means forget all memory and restart with steepest descents. */ -+ if (nstcg && ((step % nstcg) == 0)) -+ { -+ beta = 0.0; -+ } -+ else -+ { -+ /* s_min->fnorm cannot be zero, because then we would have converged -+ * and broken out. 
-+ */ -+ -+ /* Polak-Ribiere update. -+ * Change to fnorm2/fnorm2_old for Fletcher-Reeves -+ */ -+ beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); -+ } -+ /* Limit beta to prevent oscillations */ -+ if (fabs(beta) > 5.0) -+ { -+ beta = 0.0; -+ } -+ -+ -+ /* update positions */ -+ swap_em_state(s_min, s_b); -+ gpa = gpb; -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", -+ step, s_min->epot, s_min->fnorm/sqrt(state_global->natoms), -+ s_min->fmax, s_min->a_fmax+1); -+ } -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ do_log = do_per_step(step, inputrec->nstlog); -+ do_ene = do_per_step(step, inputrec->nstenergy); -+ if (do_log) -+ { -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ } -+ print_ebin(outf->fp_ene, do_ene, FALSE, FALSE, -+ do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Stop when the maximum force lies below tolerance. -+ * If we have reached machine precision, converged is already set to true. -+ */ -+ converged = converged || (s_min->fmax < inputrec->em_tol); -+ -+ } /* End of the loop */ -+ -+ if (converged) -+ { -+ step--; /* we never took that last step in this case */ -+ -+ } -+ if (s_min->fmax > inputrec->em_tol) -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); -+ warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); -+ } -+ converged = FALSE; -+ } -+ -+ if (MASTER(cr)) -+ { -+ /* If we printed energy and/or logfile last step (which was the last step) -+ * we don't have to do it again, but otherwise print the final values. -+ */ -+ if (!do_log) -+ { -+ /* Write final value to log since we didn't do anything the last step */ -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ } -+ if (!do_ene || !do_log) -+ { -+ /* Write final energy file entries */ -+ print_ebin(outf->fp_ene, !do_ene, FALSE, FALSE, -+ !do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ } -+ -+ /* Print some stuff... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ -+ /* IMPORTANT! -+ * For accurate normal mode calculation it is imperative that we -+ * store the last conformation into the full precision binary trajectory. -+ * -+ * However, we should only do it if we did NOT already write this step -+ * above (which we did if do_x or do_f was true). 
-+ */ -+ do_x = !do_per_step(step, inputrec->nstxout); -+ do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); -+ -+ write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, step, -+ s_min, state_global, f_global); -+ -+ fnormn = s_min->fnorm/sqrt(state_global->natoms); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ -+ fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); -+ } -+ -+ finish_em(fplog, cr, outf, runtime, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ runtime->nsteps_done = step; -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_lbfgs(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, -+ int nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t ed, -+ t_forcerec *fr, -+ int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, -+ gmx_membed_t membed, -+ real cpt_period, real max_hours, -+ const char *deviceOptions, -+ unsigned long Flags, -+ gmx_runtime_t *runtime) -+{ -+ static const char *LBFGS = "Low-Memory BFGS Minimizer"; -+ em_state_t ems; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ rvec *f_global; -+ int ncorr, nmaxcorr, point, cp, neval, nminstep; -+ double stepsize, gpa, gpb, gpc, tmp, minstep; -+ real *rho, *alpha, *ff, *xx, *p, *s, *lastx, *lastf, **dx, **dg; -+ real *xa, *xb, *xc, *fa, *fb, *fc, *xtmp, *ftmp; -+ real a, b, c, maxdelta, delta; -+ real diag, Epot0, Epot, EpotA, EpotB, EpotC; -+ real dgdx, dgdg, sq, yr, beta; -+ t_mdebin *mdebin; -+ gmx_bool converged, first; -+ rvec mu_tot; -+ real fnorm, fmax; -+ gmx_bool do_log, do_ene, do_x, do_f, foundlower, *frozen; -+ tensor vir, pres; -+ int start, end, number_steps; -+ gmx_mdoutf_t *outf; -+ int i, k, m, n, nfmax, gf, step; -+ int mdof_flags; -+ /* not used */ -+ real terminate; -+ -+ if (PAR(cr)) -+ { -+ gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n"); -+ } -+ -+ if (NULL != constr) -+ { -+ gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent)."); -+ } -+ -+ n = 3*state->natoms; -+ nmaxcorr = inputrec->nbfgscorr; -+ -+ /* Allocate memory */ -+ /* Use pointers to real so we dont have to loop over both atoms and -+ * dimensions all the time... -+ * x/f are allocated as rvec *, so make new x0/f0 pointers-to-real -+ * that point to the same memory. 
-+ */ -+ snew(xa, n); -+ snew(xb, n); -+ snew(xc, n); -+ snew(fa, n); -+ snew(fb, n); -+ snew(fc, n); -+ snew(frozen, n); -+ -+ snew(p, n); -+ snew(lastx, n); -+ snew(lastf, n); -+ snew(rho, nmaxcorr); -+ snew(alpha, nmaxcorr); -+ -+ snew(dx, nmaxcorr); -+ for (i = 0; i < nmaxcorr; i++) -+ { -+ snew(dx[i], n); -+ } -+ -+ snew(dg, nmaxcorr); -+ for (i = 0; i < nmaxcorr; i++) -+ { -+ snew(dg[i], n); -+ } -+ -+ step = 0; -+ neval = 0; -+ -+ /* Init em */ -+ init_em(fplog, LBFGS, cr, inputrec, -+ state, top_global, &ems, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin); -+ /* Do_lbfgs is not completely updated like do_steep and do_cg, -+ * so we free some memory again. -+ */ -+ sfree(ems.s.x); -+ sfree(ems.f); -+ -+ xx = (real *)state->x; -+ ff = (real *)f; -+ -+ start = mdatoms->start; -+ end = mdatoms->homenr + start; -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, runtime, wcycle, LBFGS); -+ -+ do_log = do_ene = do_x = do_f = TRUE; -+ -+ /* Max number of steps */ -+ number_steps = inputrec->nsteps; -+ -+ /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ -+ gf = 0; -+ for (i = start; i < end; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ frozen[3*i+m] = inputrec->opts.nFreeze[gf][m]; -+ } -+ } -+ if (MASTER(cr)) -+ { -+ sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); -+ } -+ -+ if (vsite) -+ { -+ construct_vsites(fplog, vsite, state->x, nrnb, 1, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, graph, cr, state->box); -+ } -+ -+ /* Call the force routine and some auxiliary (neighboursearching etc.) */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole -+ */ -+ neval++; -+ ems.s.x = state->x; -+ ems.f = f; -+ evaluate_energy(fplog, bVerbose, cr, -+ state, top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ where(); -+ -+ if (MASTER(cr)) -+ { -+ /* Copy stuff to the energy bin for easy printing etc. */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ print_ebin(outf->fp_ene, TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ where(); -+ -+ /* This is the starting energy */ -+ Epot = enerd->term[F_EPOT]; -+ -+ fnorm = ems.fnorm; -+ fmax = ems.fmax; -+ nfmax = ems.a_fmax; -+ -+ /* Set the initial step. -+ * since it will be multiplied by the non-normalized search direction -+ * vector (force vector the first time), we scale it by the -+ * norm of the force. -+ */ -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); -+ fprintf(stderr, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); -+ fprintf(stderr, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); -+ fprintf(stderr, "\n"); -+ /* and copy to the log file too... 
*/ -+ fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr); -+ fprintf(fplog, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); -+ fprintf(fplog, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); -+ fprintf(fplog, "\n"); -+ } -+ -+ point = 0; -+ for (i = 0; i < n; i++) -+ { -+ if (!frozen[i]) -+ { -+ dx[point][i] = ff[i]; /* Initial search direction */ -+ } -+ else -+ { -+ dx[point][i] = 0; -+ } -+ } -+ -+ stepsize = 1.0/fnorm; -+ converged = FALSE; -+ -+ /* Start the loop over BFGS steps. -+ * Each successful step is counted, and we continue until -+ * we either converge or reach the max number of steps. -+ */ -+ -+ ncorr = 0; -+ -+ /* Set the gradient from the force */ -+ converged = FALSE; -+ for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) -+ { -+ -+ /* Write coordinates if necessary */ -+ do_x = do_per_step(step, inputrec->nstxout); -+ do_f = do_per_step(step, inputrec->nstfout); -+ -+ mdof_flags = 0; -+ if (do_x) -+ { -+ mdof_flags |= MDOF_X; -+ } -+ -+ if (do_f) -+ { -+ mdof_flags |= MDOF_F; -+ } -+ -+ write_traj(fplog, cr, outf, mdof_flags, -+ top_global, step, (real)step, state, state, f, f, NULL, NULL); -+ -+ /* Do the linesearching in the direction dx[point][0..(n-1)] */ -+ -+ /* pointer to current direction - point=0 first time here */ -+ s = dx[point]; -+ -+ /* calculate line gradient */ -+ for (gpa = 0, i = 0; i < n; i++) -+ { -+ gpa -= s[i]*ff[i]; -+ } -+ -+ /* Calculate minimum allowed stepsize, before the average (norm) -+ * relative change in coordinate is smaller than precision -+ */ -+ for (minstep = 0, i = 0; i < n; i++) -+ { -+ tmp = fabs(xx[i]); -+ if (tmp < 1.0) -+ { -+ tmp = 1.0; -+ } -+ tmp = s[i]/tmp; -+ minstep += tmp*tmp; -+ } -+ minstep = GMX_REAL_EPS/sqrt(minstep/n); -+ -+ if (stepsize < minstep) -+ { -+ converged = TRUE; -+ break; -+ } -+ -+ /* Store old forces and coordinates */ -+ for (i = 0; i < n; i++) -+ { -+ lastx[i] = xx[i]; -+ lastf[i] = ff[i]; -+ } -+ Epot0 = Epot; -+ -+ first = TRUE; -+ -+ for (i = 0; i < n; i++) -+ { -+ xa[i] = xx[i]; -+ } -+ -+ /* Take a step downhill. -+ * In theory, we should minimize the function along this direction. -+ * That is quite possible, but it turns out to take 5-10 function evaluations -+ * for each line. However, we dont really need to find the exact minimum - -+ * it is much better to start a new BFGS step in a modified direction as soon -+ * as we are close to it. This will save a lot of energy evaluations. -+ * -+ * In practice, we just try to take a single step. -+ * If it worked (i.e. lowered the energy), we increase the stepsize but -+ * the continue straight to the next BFGS step without trying to find any minimum. -+ * If it didn't work (higher energy), there must be a minimum somewhere between -+ * the old position and the new one. -+ * -+ * Due to the finite numerical accuracy, it turns out that it is a good idea -+ * to even accept a SMALL increase in energy, if the derivative is still downhill. -+ * This leads to lower final energies in the tests I've done. / Erik -+ */ -+ foundlower = FALSE; -+ EpotA = Epot0; -+ a = 0.0; -+ c = a + stepsize; /* reference position along line is zero */ -+ -+ /* Check stepsize first. We do not allow displacements -+ * larger than emstep. 
-+ */ -+ do -+ { -+ c = a + stepsize; -+ maxdelta = 0; -+ for (i = 0; i < n; i++) -+ { -+ delta = c*s[i]; -+ if (delta > maxdelta) -+ { -+ maxdelta = delta; -+ } -+ } -+ if (maxdelta > inputrec->em_stepsize) -+ { -+ stepsize *= 0.1; -+ } -+ } -+ while (maxdelta > inputrec->em_stepsize); -+ -+ /* Take a trial step */ -+ for (i = 0; i < n; i++) -+ { -+ xc[i] = lastx[i] + c*s[i]; -+ } -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ ems.s.x = (rvec *)xc; -+ ems.f = (rvec *)fc; -+ evaluate_energy(fplog, bVerbose, cr, -+ state, top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, step, FALSE); -+ EpotC = ems.epot; -+ -+ /* Calc derivative along line */ -+ for (gpc = 0, i = 0; i < n; i++) -+ { -+ gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */ -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpc, cr); -+ } -+ -+ /* This is the max amount of increase in energy we tolerate */ -+ tmp = sqrt(GMX_REAL_EPS)*fabs(EpotA); -+ -+ /* Accept the step if the energy is lower, or if it is not significantly higher -+ * and the line derivative is still negative. -+ */ -+ if (EpotC < EpotA || (gpc < 0 && EpotC < (EpotA+tmp))) -+ { -+ foundlower = TRUE; -+ /* Great, we found a better energy. Increase step for next iteration -+ * if we are still going down, decrease it otherwise -+ */ -+ if (gpc < 0) -+ { -+ stepsize *= 1.618034; /* The golden section */ -+ } -+ else -+ { -+ stepsize *= 0.618034; /* 1/golden section */ -+ } -+ } -+ else -+ { -+ /* New energy is the same or higher. We will have to do some work -+ * to find a smaller value in the interval. Take smaller step next time! -+ */ -+ foundlower = FALSE; -+ stepsize *= 0.618034; -+ } -+ -+ /* OK, if we didn't find a lower value we will have to locate one now - there must -+ * be one in the interval [a=0,c]. -+ * The same thing is valid here, though: Don't spend dozens of iterations to find -+ * the line minimum. We try to interpolate based on the derivative at the endpoints, -+ * and only continue until we find a lower value. In most cases this means 1-2 iterations. -+ * -+ * I also have a safeguard for potentially really patological functions so we never -+ * take more than 20 steps before we give up ... -+ * -+ * If we already found a lower value we just skip this step and continue to the update. -+ */ -+ -+ if (!foundlower) -+ { -+ -+ nminstep = 0; -+ do -+ { -+ /* Select a new trial point. -+ * If the derivatives at points a & c have different sign we interpolate to zero, -+ * otherwise just do a bisection. 
-+ */ -+ -+ if (gpa < 0 && gpc > 0) -+ { -+ b = a + gpa*(a-c)/(gpc-gpa); -+ } -+ else -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* safeguard if interpolation close to machine accuracy causes errors: -+ * never go outside the interval -+ */ -+ if (b <= a || b >= c) -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* Take a trial step */ -+ for (i = 0; i < n; i++) -+ { -+ xb[i] = lastx[i] + b*s[i]; -+ } -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ ems.s.x = (rvec *)xb; -+ ems.f = (rvec *)fb; -+ evaluate_energy(fplog, bVerbose, cr, -+ state, top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, step, FALSE); -+ EpotB = ems.epot; -+ -+ fnorm = ems.fnorm; -+ -+ for (gpb = 0, i = 0; i < n; i++) -+ { -+ gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ -+ -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpb, cr); -+ } -+ -+ /* Keep one of the intervals based on the value of the derivative at the new point */ -+ if (gpb > 0) -+ { -+ /* Replace c endpoint with b */ -+ EpotC = EpotB; -+ c = b; -+ gpc = gpb; -+ /* swap coord pointers b/c */ -+ xtmp = xb; -+ ftmp = fb; -+ xb = xc; -+ fb = fc; -+ xc = xtmp; -+ fc = ftmp; -+ } -+ else -+ { -+ /* Replace a endpoint with b */ -+ EpotA = EpotB; -+ a = b; -+ gpa = gpb; -+ /* swap coord pointers a/b */ -+ xtmp = xb; -+ ftmp = fb; -+ xb = xa; -+ fb = fa; -+ xa = xtmp; -+ fa = ftmp; -+ } -+ -+ /* -+ * Stop search as soon as we find a value smaller than the endpoints, -+ * or if the tolerance is below machine precision. -+ * Never run more than 20 steps, no matter what. -+ */ -+ nminstep++; -+ } -+ while ((EpotB > EpotA || EpotB > EpotC) && (nminstep < 20)); -+ -+ if (fabs(EpotB-Epot0) < GMX_REAL_EPS || nminstep >= 20) -+ { -+ /* OK. We couldn't find a significantly lower energy. -+ * If ncorr==0 this was steepest descent, and then we give up. -+ * If not, reset memory to restart as steepest descent before quitting. -+ */ -+ if (ncorr == 0) -+ { -+ /* Converged */ -+ converged = TRUE; -+ break; -+ } -+ else -+ { -+ /* Reset memory */ -+ ncorr = 0; -+ /* Search in gradient direction */ -+ for (i = 0; i < n; i++) -+ { -+ dx[point][i] = ff[i]; -+ } -+ /* Reset stepsize */ -+ stepsize = 1.0/fnorm; -+ continue; -+ } -+ } -+ -+ /* Select min energy state of A & C, put the best in xx/ff/Epot -+ */ -+ if (EpotC < EpotA) -+ { -+ Epot = EpotC; -+ /* Use state C */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xc[i]; -+ ff[i] = fc[i]; -+ } -+ stepsize = c; -+ } -+ else -+ { -+ Epot = EpotA; -+ /* Use state A */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xa[i]; -+ ff[i] = fa[i]; -+ } -+ stepsize = a; -+ } -+ -+ } -+ else -+ { -+ /* found lower */ -+ Epot = EpotC; -+ /* Use state C */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xc[i]; -+ ff[i] = fc[i]; -+ } -+ stepsize = c; -+ } -+ -+ /* Update the memory information, and calculate a new -+ * approximation of the inverse hessian -+ */ -+ -+ /* Have new data in Epot, xx, ff */ -+ if (ncorr < nmaxcorr) -+ { -+ ncorr++; -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ dg[point][i] = lastf[i]-ff[i]; -+ dx[point][i] *= stepsize; -+ } -+ -+ dgdg = 0; -+ dgdx = 0; -+ for (i = 0; i < n; i++) -+ { -+ dgdg += dg[point][i]*dg[point][i]; -+ dgdx += dg[point][i]*dx[point][i]; -+ } -+ -+ diag = dgdx/dgdg; -+ -+ rho[point] = 1.0/dgdx; -+ point++; -+ -+ if (point >= nmaxcorr) -+ { -+ point = 0; -+ } -+ -+ /* Update */ -+ for (i = 0; i < n; i++) -+ { -+ p[i] = ff[i]; -+ } -+ -+ cp = point; -+ -+ /* Recursive update. 
First go back over the memory points */ -+ for (k = 0; k < ncorr; k++) -+ { -+ cp--; -+ if (cp < 0) -+ { -+ cp = ncorr-1; -+ } -+ -+ sq = 0; -+ for (i = 0; i < n; i++) -+ { -+ sq += dx[cp][i]*p[i]; -+ } -+ -+ alpha[cp] = rho[cp]*sq; -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] -= alpha[cp]*dg[cp][i]; -+ } -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] *= diag; -+ } -+ -+ /* And then go forward again */ -+ for (k = 0; k < ncorr; k++) -+ { -+ yr = 0; -+ for (i = 0; i < n; i++) -+ { -+ yr += p[i]*dg[cp][i]; -+ } -+ -+ beta = rho[cp]*yr; -+ beta = alpha[cp]-beta; -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] += beta*dx[cp][i]; -+ } -+ -+ cp++; -+ if (cp >= ncorr) -+ { -+ cp = 0; -+ } -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ if (!frozen[i]) -+ { -+ dx[point][i] = p[i]; -+ } -+ else -+ { -+ dx[point][i] = 0; -+ } -+ } -+ -+ stepsize = 1.0; -+ -+ /* Test whether the convergence criterion is met */ -+ get_f_norm_max(cr, &(inputrec->opts), mdatoms, f, &fnorm, &fmax, &nfmax); -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", -+ step, Epot, fnorm/sqrt(state->natoms), fmax, nfmax+1); -+ } -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ do_log = do_per_step(step, inputrec->nstlog); -+ do_ene = do_per_step(step, inputrec->nstenergy); -+ if (do_log) -+ { -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ } -+ print_ebin(outf->fp_ene, do_ene, FALSE, FALSE, -+ do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Stop when the maximum force lies below tolerance. -+ * If we have reached machine precision, converged is already set to true. -+ */ -+ -+ converged = converged || (fmax < inputrec->em_tol); -+ -+ } /* End of the loop */ -+ -+ if (converged) -+ { -+ step--; /* we never took that last step in this case */ -+ -+ } -+ if (fmax > inputrec->em_tol) -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); -+ warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); -+ } -+ converged = FALSE; -+ } -+ -+ /* If we printed energy and/or logfile last step (which was the last step) -+ * we don't have to do it again, but otherwise print the final values. -+ */ -+ if (!do_log) /* Write final value to log since we didn't do anythin last step */ -+ { -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ } -+ if (!do_ene || !do_log) /* Write final energy file entries */ -+ { -+ print_ebin(outf->fp_ene, !do_ene, FALSE, FALSE, -+ !do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Print some stuff... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ -+ /* IMPORTANT! -+ * For accurate normal mode calculation it is imperative that we -+ * store the last conformation into the full precision binary trajectory. -+ * -+ * However, we should only do it if we did NOT already write this step -+ * above (which we did if do_x or do_f was true). 
-+ */ -+ do_x = !do_per_step(step, inputrec->nstxout); -+ do_f = !do_per_step(step, inputrec->nstfout); -+ write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, step, -+ &ems, state, f); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, LBFGS, inputrec->em_tol, step, converged, -+ number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); -+ print_converged(fplog, LBFGS, inputrec->em_tol, step, converged, -+ number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); -+ -+ fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); -+ } -+ -+ finish_em(fplog, cr, outf, runtime, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ runtime->nsteps_done = step; -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_steep(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, -+ int nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t ed, -+ t_forcerec *fr, -+ int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, -+ gmx_membed_t membed, -+ real cpt_period, real max_hours, -+ const char *deviceOptions, -+ unsigned long Flags, -+ gmx_runtime_t *runtime) -+{ -+ const char *SD = "Steepest Descents"; -+ em_state_t *s_min, *s_try; -+ rvec *f_global; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ real stepsize, constepsize; -+ real ustep, fnormn; -+ gmx_mdoutf_t *outf; -+ t_mdebin *mdebin; -+ gmx_bool bDone, bAbort, do_x, do_f; -+ tensor vir, pres; -+ rvec mu_tot; -+ int nsteps; -+ int count = 0; -+ int steps_accepted = 0; -+ /* not used */ -+ real terminate = 0; -+ -+ s_min = init_em_state(); -+ s_try = init_em_state(); -+ -+ /* Init em and store the local state in s_try */ -+ init_em(fplog, SD, cr, inputrec, -+ state_global, top_global, s_try, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin); -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, runtime, wcycle, SD); -+ -+ /* Set variables for stepsize (in nm). This is the largest -+ * step that we are going to make in any direction. 
-+ */ -+ ustep = inputrec->em_stepsize; -+ stepsize = 0; -+ -+ /* Max number of steps */ -+ nsteps = inputrec->nsteps; -+ -+ if (MASTER(cr)) -+ { -+ /* Print to the screen */ -+ sp_header(stderr, SD, inputrec->em_tol, nsteps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, SD, inputrec->em_tol, nsteps); -+ } -+ -+ /**** HERE STARTS THE LOOP **** -+ * count is the counter for the number of steps -+ * bDone will be TRUE when the minimization has converged -+ * bAbort will be TRUE when nsteps steps have been performed or when -+ * the stepsize becomes smaller than is reasonable for machine precision -+ */ -+ count = 0; -+ bDone = FALSE; -+ bAbort = FALSE; -+ while (!bDone && !bAbort) -+ { -+ bAbort = (nsteps >= 0) && (count == nsteps); -+ -+ /* set new coordinates, except for first step */ -+ if (count > 0) -+ { -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, -+ s_min, stepsize, s_min->f, s_try, -+ constr, top, nrnb, wcycle, count); -+ } -+ -+ evaluate_energy(fplog, bVerbose, cr, -+ state_global, top_global, s_try, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, count, count == 0); -+ -+ if (MASTER(cr)) -+ { -+ print_ebin_header(fplog, count, count, s_try->s.lambda[efptFEP]); -+ } -+ -+ if (count == 0) -+ { -+ s_min->epot = s_try->epot + 1; -+ } -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", -+ count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, -+ (s_try->epot < s_min->epot) ? '\n' : '\r'); -+ } -+ -+ if (s_try->epot < s_min->epot) -+ { -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)count, -+ mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals, -+ s_try->s.box, NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ print_ebin(outf->fp_ene, TRUE, -+ do_per_step(steps_accepted, inputrec->nstdisreout), -+ do_per_step(steps_accepted, inputrec->nstorireout), -+ fplog, count, count, eprNORMAL, TRUE, -+ mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ fflush(fplog); -+ } -+ } -+ -+ /* Now if the new energy is smaller than the previous... -+ * or if this is the first step! -+ * or if we did random steps! -+ */ -+ -+ if ( (count == 0) || (s_try->epot < s_min->epot) ) -+ { -+ steps_accepted++; -+ -+ /* Test whether the convergence criterion is met... */ -+ bDone = (s_try->fmax < inputrec->em_tol); -+ -+ /* Copy the arrays for force, positions and energy */ -+ /* The 'Min' array always holds the coords and forces of the minimal -+ sampled energy */ -+ swap_em_state(s_min, s_try); -+ if (count > 0) -+ { -+ ustep *= 1.2; -+ } -+ -+ /* Write to trn, if necessary */ -+ do_x = do_per_step(steps_accepted, inputrec->nstxout); -+ do_f = do_per_step(steps_accepted, inputrec->nstfout); -+ write_em_traj(fplog, cr, outf, do_x, do_f, NULL, -+ top_global, inputrec, count, -+ s_min, state_global, f_global); -+ } -+ else -+ { -+ /* If energy is not smaller make the step smaller... 
*/ -+ ustep *= 0.5; -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) -+ { -+ /* Reload the old state */ -+ em_dd_partition_system(fplog, count, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ } -+ -+ /* Determine new step */ -+ stepsize = ustep/s_min->fmax; -+ -+ /* Check if stepsize is too small, with 1 nm as a characteristic length */ -+#ifdef GMX_DOUBLE -+ if (count == nsteps || ustep < 1e-12) -+#else -+ if (count == nsteps || ustep < 1e-6) -+#endif -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, count == nsteps, constr != NULL); -+ warn_step(fplog, inputrec->em_tol, count == nsteps, constr != NULL); -+ } -+ bAbort = TRUE; -+ } -+ -+ count++; -+ } /* End of the loop */ -+ -+ /* Print some shit... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, count, -+ s_min, state_global, f_global); -+ -+ fnormn = s_min->fnorm/sqrt(state_global->natoms); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ } -+ -+ finish_em(fplog, cr, outf, runtime, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ inputrec->nsteps = count; -+ -+ runtime->nsteps_done = count; -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_nm(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, -+ int nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t ed, -+ t_forcerec *fr, -+ int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, -+ gmx_membed_t membed, -+ real cpt_period, real max_hours, -+ const char *deviceOptions, -+ unsigned long Flags, -+ gmx_runtime_t *runtime) -+{ -+ const char *NM = "Normal Mode Analysis"; -+ gmx_mdoutf_t *outf; -+ int natoms, atom, d; -+ int nnodes, node; -+ rvec *f_global; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ real t, t0, lambda, lam0; -+ gmx_bool bNS; -+ tensor vir, pres; -+ rvec mu_tot; -+ rvec *fneg, *dfdx; -+ gmx_bool bSparse; /* use sparse matrix storage format */ -+ size_t sz=0; -+ gmx_sparsematrix_t * sparse_matrix = NULL; -+ real * full_matrix = NULL; -+ em_state_t * state_work; -+ -+ /* added with respect to mdrun */ -+ int i, j, k, row, col; -+ real der_range = 10.0*sqrt(GMX_REAL_EPS); -+ real x_min; -+ real fnorm, fmax; -+ -+ if (constr != NULL) -+ { -+ gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported"); -+ } -+ -+ state_work = init_em_state(); -+ -+ /* Init em and store the local state in state_minimum */ -+ init_em(fplog, NM, cr, inputrec, -+ state_global, top_global, state_work, &top, -+ &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, NULL); -+ -+ natoms = top_global->natoms; -+ snew(fneg, natoms); -+ snew(dfdx, natoms); -+ -+#ifndef GMX_DOUBLE -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, -+ "NOTE: This version of Gromacs has been compiled in 
single precision,\n" -+ " which MIGHT not be accurate enough for normal mode analysis.\n" -+ " Gromacs now uses sparse matrix storage, so the memory requirements\n" -+ " are fairly modest even if you recompile in double precision.\n\n"); -+ } -+#endif -+ -+ /* Check if we can/should use sparse storage format. -+ * -+ * Sparse format is only useful when the Hessian itself is sparse, which it -+ * will be when we use a cutoff. -+ * For small systems (n<1000) it is easier to always use full matrix format, though. -+ */ -+ if (EEL_FULL(fr->eeltype) || fr->rlist == 0.0) -+ { -+ md_print_info(cr, fplog, "Non-cutoff electrostatics used, forcing full Hessian format.\n"); -+ bSparse = FALSE; -+ } -+ else if (top_global->natoms < 1000) -+ { -+ md_print_info(cr, fplog, "Small system size (N=%d), using full Hessian format.\n", top_global->natoms); -+ bSparse = FALSE; -+ } -+ else -+ { -+ md_print_info(cr, fplog, "Using compressed symmetric sparse Hessian format.\n"); -+ bSparse = TRUE; -+ } -+ -+ if (MASTER(cr)) -+ { -+ sz = DIM*top_global->natoms; -+ -+ fprintf(stderr, "Allocating Hessian memory...\n\n"); -+ -+ if (bSparse) -+ { -+ sparse_matrix = gmx_sparsematrix_init(sz); -+ sparse_matrix->compressed_symmetric = TRUE; -+ } -+ else -+ { -+ snew(full_matrix, sz*sz); -+ } -+ } -+ -+ /* Initial values */ -+ t0 = inputrec->init_t; -+ lam0 = inputrec->fepvals->init_lambda; -+ t = t0; -+ lambda = lam0; -+ -+ init_nrnb(nrnb); -+ -+ where(); -+ -+ /* Write start time and temperature */ -+ print_em_start(fplog, cr, runtime, wcycle, NM); -+ -+ /* fudge nr of steps to nr of atoms */ -+ inputrec->nsteps = natoms*2; -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n", -+ *(top_global->name), (int)inputrec->nsteps); -+ } -+ -+ nnodes = cr->nnodes; -+ -+ /* Make evaluate_energy do a single node force calculation */ -+ cr->nnodes = 1; -+ evaluate_energy(fplog, bVerbose, cr, -+ state_global, top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ cr->nnodes = nnodes; -+ -+ /* if forces are not small, warn user */ -+ get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, state_work); -+ -+ md_print_info(cr, fplog, "Maximum force:%12.5e\n", state_work->fmax); -+ if (state_work->fmax > 1.0e-3) -+ { -+ md_print_info(cr, fplog, -+ "The force is probably not small enough to " -+ "ensure that you are at a minimum.\n" -+ "Be aware that negative eigenvalues may occur\n" -+ "when the resulting matrix is diagonalized.\n\n"); -+ } -+ -+ /*********************************************************** -+ * -+ * Loop over all pairs in matrix -+ * -+ * do_force called twice. 
Once with positive and -+ * once with negative displacement -+ * -+ ************************************************************/ -+ -+ /* Steps are divided one by one over the nodes */ -+ for (atom = cr->nodeid; atom < natoms; atom += nnodes) -+ { -+ -+ for (d = 0; d < DIM; d++) -+ { -+ x_min = state_work->s.x[atom][d]; -+ -+ state_work->s.x[atom][d] = x_min - der_range; -+ -+ /* Make evaluate_energy do a single node force calculation */ -+ cr->nnodes = 1; -+ evaluate_energy(fplog, bVerbose, cr, -+ state_global, top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, atom*2, FALSE); -+ -+ for (i = 0; i < natoms; i++) -+ { -+ copy_rvec(state_work->f[i], fneg[i]); -+ } -+ -+ state_work->s.x[atom][d] = x_min + der_range; -+ -+ evaluate_energy(fplog, bVerbose, cr, -+ state_global, top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, atom*2+1, FALSE); -+ cr->nnodes = nnodes; -+ -+ /* x is restored to original */ -+ state_work->s.x[atom][d] = x_min; -+ -+ for (j = 0; j < natoms; j++) -+ { -+ for (k = 0; (k < DIM); k++) -+ { -+ dfdx[j][k] = -+ -(state_work->f[j][k] - fneg[j][k])/(2*der_range); -+ } -+ } -+ -+ if (!MASTER(cr)) -+ { -+#ifdef GMX_MPI -+#ifdef GMX_DOUBLE -+#define mpi_type MPI_DOUBLE -+#else -+#define mpi_type MPI_FLOAT -+#endif -+ MPI_Send(dfdx[0], natoms*DIM, mpi_type, MASTERNODE(cr), cr->nodeid, -+ cr->mpi_comm_mygroup); -+#endif -+ } -+ else -+ { -+ for (node = 0; (node < nnodes && atom+node < natoms); node++) -+ { -+ if (node > 0) -+ { -+#ifdef GMX_MPI -+ MPI_Status stat; -+ MPI_Recv(dfdx[0], natoms*DIM, mpi_type, node, node, -+ cr->mpi_comm_mygroup, &stat); -+#undef mpi_type -+#endif -+ } -+ -+ row = (atom + node)*DIM + d; -+ -+ for (j = 0; j < natoms; j++) -+ { -+ for (k = 0; k < DIM; k++) -+ { -+ col = j*DIM + k; -+ -+ if (bSparse) -+ { -+ if (col >= row && dfdx[j][k] != 0.0) -+ { -+ gmx_sparsematrix_increment_value(sparse_matrix, -+ row, col, dfdx[j][k]); -+ } -+ } -+ else -+ { -+ full_matrix[row*sz+col] = dfdx[j][k]; -+ } -+ } -+ } -+ } -+ } -+ -+ if (bVerbose && fplog) -+ { -+ fflush(fplog); -+ } -+ } -+ /* write progress */ -+ if (MASTER(cr) && bVerbose) -+ { -+ fprintf(stderr, "\rFinished step %d out of %d", -+ min(atom+nnodes, natoms), natoms); -+ fflush(stderr); -+ } -+ } -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\n\nWriting Hessian...\n"); -+ gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); -+ } -+ -+ finish_em(fplog, cr, outf, runtime, wcycle); -+ -+ runtime->nsteps_done = natoms*2; -+ -+ return 0; -+} diff --git a/g/GROMACS/gromacs-5.0.4-plumed-2.1.3-mpi.patch b/g/GROMACS/gromacs-5.0.4-plumed-2.1.3-mpi.patch deleted file mode 100644 index 50df9639..00000000 --- a/g/GROMACS/gromacs-5.0.4-plumed-2.1.3-mpi.patch +++ /dev/null @@ -1,9575 +0,0 @@ -diff --git a/Plumed.cmake b/Plumed.cmake -new file mode 100644 -index 0000000..01472f0 ---- /dev/null -+++ b/Plumed.cmake -@@ -0,0 +1,3 @@ -+# PLUMED: shared installation -+set(PLUMED_LOAD /apps/all/PLUMED/2.1.3-foss-2015g/lib/plumed///src/lib/libplumed.so -ldl ) -+set(PLUMED_DEPENDENCIES /apps/all/PLUMED/2.1.3-foss-2015g/lib/plumed///src/lib/libplumed.so) -diff --git a/Plumed.h b/Plumed.h -new file mode 100644 -index 0000000..16da74a ---- /dev/null -+++ b/Plumed.h -@@ -0,0 +1,494 @@ -+/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -+ Copyright (c) 2011-2014 The plumed team -+ (see the PEOPLE file at the 
root of the distribution for a list of names) -+ -+ See http://www.plumed-code.org for more information. -+ -+ This file is part of plumed, version 2. -+ -+ plumed is free software: you can redistribute it and/or modify -+ it under the terms of the GNU Lesser General Public License as published by -+ the Free Software Foundation, either version 3 of the License, or -+ (at your option) any later version. -+ -+ plumed is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public License -+ along with plumed. If not, see . -++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ -+#ifndef __PLUMED_wrapper_Plumed_h -+#define __PLUMED_wrapper_Plumed_h -+ -+/** -+\page ReferencePlumedH Reference for interfacing MD codes with PLUMED -+ -+ Plumed.h and Plumed.c contain the external plumed interface, which is used to -+ integrate it with MD engines. This interface is very general, and is expected -+ not to change across plumed versions. Plumed.c also implements a dummy version -+ of the interface, so as to allow a code to be fully linked even if the plumed -+ library is not available yet. These files could be directly included in the official -+ host MD distribution. In this manner, it will be sufficient to link the plumed -+ library at link time (on all systems) or directly at runtime (on system where -+ dynamic loading is enabled) to include plumed features. -+ -+ Why is Plumed.c written in C and not C++? The reason is that the resulting Plumed.o -+ needs to be linked with the host MD code immediately (whereas the rest of plumed -+ could be linked a posteriori). Imagine the MD code is written in FORTRAN: when we -+ link the Plumed.o file we would like not to need any C++ library linked. In this -+ manner, we do not need to know which C++ compiler will be used to compile plumed. -+ The C++ library is only linked to the "rest" of plumed, which actually use it. -+ Anyway, Plumed.c is written in such a manner to allow its compilation also in C++ -+ (C++ is a bit stricter than C; compatibility is checked when PlumedStatic.cpp, -+ which basically includes Plumed.c, is compiled with the C++ compiler). This will -+ allow e.g. MD codes written in C++ to just incorporate Plumed.c (maybe renamed into -+ Plumed.cpp), without the need of configuring a plain C compiler. -+ -+ Plumed interface can be used from C, C++ and FORTRAN. Everything concerning plumed -+ is hidden inside a single object type, which is described in C by a structure -+ (struct \ref plumed), in C++ by a class (PLMD::Plumed) and in FORTRAN by a -+ fixed-length string (CHARACTER(LEN=32)). Obviously C++ can use both struct -+ and class interfaces, but the first should be preferred. The reference interface -+ is the C one, whereas FORTRAN and C++ interfaces are implemented as wrappers -+ around it. -+ -+ In the C++ interface, all the routines are implemented as methods of PLMD::Plumed. -+ In the C and FORTRAN interfaces, all the routines are named plumed_*, to -+ avoid potential name clashes. Notice that the entire plumed library -+ is implemented in C++, and it is hidden inside the PLMD namespace. -+ -+ Handlers to the plumed object can be converted among different representations, -+ to allow inter-operability among languages. 
In C, there are tools to convert -+ to/from FORTRAN, whereas in C++ there are tools to convert to/from FORTRAN and C. -+ -+ These handlers only contain a pointer to the real structure, so that -+ when a plumed object is brought from one language to another, -+ it brings a reference to the same environment. -+ -+ Moreover, to simplify life in all cases where a single Plumed object is -+ required for the entire simulation (which covers most of the practical -+ applications with conventional MD codes) it is possible to take advantage -+ of a global interface, which is implicitly referring to a unique global instance. -+ The global object should still be initialized and finalized properly. -+ -+ The basic method to send a message to plumed is -+\verbatim -+ (C) plumed_cmd -+ (C++) PLMD::Plumed::cmd -+ (FORTRAN) PLUMED_F_CMD -+\endverbatim -+ -+ To initialize a plumed object, use: -+\verbatim -+ (C) plumed_create -+ (C++) (constructor of PLMD::Plumed) -+ (FORTRAN) PLUMED_F_CREATE -+\endverbatim -+ -+ To finalize it, use -+\verbatim -+ (C) plumed_finalize -+ (C++) (destructor of PLMD::Plumed) -+ (FORTRAN) PLUMED_F_FINALIZE -+\endverbatim -+ -+ To access to the global-object, use -+\verbatim -+ (C) plumed_gcreate, plumed_gfinalize, plumed_gcmd -+ (C++) PLMD::Plumed::gcreate, PLMD::Plumed::gfinalize, PLMD::Plumed::gcmd -+ (FORTRAN) PLUMED_F_GCREATE, PLUMED_F_GFINALIZE, PLUMED_F_GCMD -+\endverbatim -+ -+ To check if the global object has been initialized, use -+\verbatim -+ (C) plumed_ginitialized -+ (C++) PLMD::Plumed::ginitialized -+ (FORTRAN) PLUMED_F_GINITIALIZED -+\endverbatim -+ -+ To check if plumed library is available (this is useful for runtime linking), use -+\verbatim -+ (C) plumed_installed -+ (C++) PLMD::Plumed::installed -+ (FORTRAN) PLUMED_F_INSTALLED -+\endverbatim -+ -+ To convert handlers use -+\verbatim -+ (C) plumed_c2f (C to FORTRAN) -+ (C) plumed_f2c (FORTRAN to C) -+ (C++) Plumed(plumed) constructor (C to C++) -+ (C++) operator plumed() cast (C++ to C) -+ (C++) Plumed(char*) constructor (FORTRAN to C++) -+ (C++) toFortran(char*) (C++ to FORTRAN) -+\endverbatim -+ -+\verbatim -+ FORTRAN interface -+ SUBROUTINE PLUMED_F_INSTALLED(i) -+ INTEGER, INTENT(OUT) :: i -+ SUBROUTINE PLUMED_F_GINITIALIZED(i) -+ INTEGER, INTENT(OUT) :: i -+ SUBROUTINE PLUMED_F_GCREATE() -+ SUBROUTINE PLUMED_F_GCMD(key,val) -+ CHARACTER(LEN=*), INTENT(IN) :: key -+ UNSPECIFIED_TYPE, INTENT(INOUT) :: val(*) -+ SUBROUTINE PLUMED_F_GFINALIZE() -+ SUBROUTINE PLUMED_F_GLOBAL(p) -+ CHARACTER(LEN=32), INTENT(OUT) :: p -+ SUBROUTINE PLUMED_F_CREATE(p) -+ CHARACTER(LEN=32), INTENT(OUT) :: p -+ SUBROUTINE PLUMED_F_CMD(p,key,val) -+ CHARACTER(LEN=32), INTENT(IN) :: p -+ CHARACTER(LEN=*), INTENT(IN) :: key -+ UNSPECIFIED_TYPE, INTENT(INOUT) :: val(*) -+ SUBROUTINE PLUMED_F_FINALIZE(p) -+ CHARACTER(LEN=32), INTENT(IN) :: p -+\endverbatim -+ -+ The main routine is "cmd", which accepts two arguments: -+ key is a string containing the name of the command -+ val is the argument. it is declared const so as to use allow passing const objects, but in practice plumed -+ is going to modify val in several cases (using a const_cast). -+ In some cases val can be omitted: just pass a NULL pointer (in C++, val is optional and can be omitted). -+ The set of possible keys is the real API of the plumed library, and will be expanded with time. -+ New commands will be added, but backward compatibility will be retained as long as possible. 
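To make the call sequence described above concrete, here is a minimal C driver in the style this header expects. It is an illustration only, not part of the patch: the command keys used ("setNatoms", "setMDEngine", "setTimestep", "init", "setStep", "setPositions", "setMasses", "setBox", "setForces", "setVirial", "calc") are the ones commonly documented for the PLUMED 2 interface and should be checked against the manual of the PLUMED version actually linked.

    #include "Plumed.h"   /* the wrapper header added by this hunk */

    /* Drive PLUMED for a few MD steps; all arrays are owned by the host code
     * and f is expected to already hold the plain MD forces, to which PLUMED
     * adds its bias contribution in place. */
    static void plumed_driver_sketch(int natoms, double dt,
                                     double *x, double *f, double *masses,
                                     double box[9], double virial[9])
    {
        plumed p = plumed_create();              /* (C) constructor */

        plumed_cmd(p, "setNatoms", &natoms);     /* must precede "init" */
        plumed_cmd(p, "setMDEngine", "sketch");
        plumed_cmd(p, "setTimestep", &dt);
        plumed_cmd(p, "init", NULL);             /* parses the PLUMED input file */

        for (int step = 0; step < 100; step++)
        {
            plumed_cmd(p, "setStep", &step);
            plumed_cmd(p, "setPositions", x);
            plumed_cmd(p, "setMasses", masses);
            plumed_cmd(p, "setBox", box);
            plumed_cmd(p, "setForces", f);
            plumed_cmd(p, "setVirial", virial);
            plumed_cmd(p, "calc", NULL);         /* bias forces are accumulated into f */
        }

        plumed_finalize(p);                      /* (C) destructor */
    }

The same sequence maps one-to-one onto the C++ (PLMD::Plumed::cmd) and FORTRAN (PLUMED_F_CMD) wrappers described below.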
-+ -+ To pass plumed a callback function use the following syntax (not available in FORTRAN yet) -+\verbatim -+ plumed_function_holder ff; -+ ff.p=your_function; -+ plumed_cmd(plumed,"xxxx",&ff); -+\endverbatim -+ (this is passing the your_function() function to the "xxxx" command) -+*/ -+ -+#ifdef __cplusplus -+ extern "C" { -+#endif -+ -+/* Generic function pointer */ -+typedef void (*plumed_function_pointer)(void); -+ -+/** -+ \brief Holder for function pointer. -+ -+ To pass plumed a callback function use the following syntax: -+\verbatim -+ plumed_function_holder ff; -+ ff.p=your_function; -+ plumed_cmd(plumed,"xxxx",&ff); -+\endverbatim -+ (this is going to pass the your_function() function to the "xxxx" command) -+*/ -+ -+typedef struct { -+ plumed_function_pointer p; -+} plumed_function_holder; -+ -+/** -+ \brief Main plumed object -+ -+ This is an object containing a Plumed instance, which should be used in -+ the MD engine. It should first be initialized with plumed_create(), -+ then it communicates with the MD engine using plumed_cmd(). Finally, -+ before the termination, it should be deallocated with plumed_finalize(). -+ Its interface is very simple and general, and is expected -+ not to change across plumed versions. See \ref ReferencePlumedH. -+*/ -+typedef struct { -+/** -+ \private -+ \brief Void pointer holding the real PlumedMain structure -+*/ -+ void*p; -+} plumed; -+ -+/** \relates plumed -+ \brief Constructor -+ -+ \return The constructed plumed object -+*/ -+plumed plumed_create(void); -+ -+/** \relates plumed -+ \brief Tells p to execute a command -+ -+ \param p The plumed object on which command is acting -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like plumed_cmd(p,"A","B"), -+ but for some choice of key it can change the content -+*/ -+void plumed_cmd(plumed p,const char*key,const void*val); -+ -+/** \relates plumed -+ \brief Destructor -+ -+ \param p The plumed object to be deallocated -+*/ -+void plumed_finalize(plumed p); -+ -+/** \relates plumed -+ \brief Check if plumed is installed (for runtime binding) -+ -+ \return 1 if plumed is installed, to 0 otherwise -+*/ -+int plumed_installed(void); -+ -+/** \relates plumed -+ \brief Retrieves an handler to the global structure. -+*/ -+plumed plumed_global(void); -+ -+/** \relates plumed -+ \brief Check if the global interface has been initialized -+ -+ \return 1 if plumed has been initialized, 0 otherwise -+*/ -+int plumed_ginitialized(void); -+ -+/* global C interface, working on a global object */ -+ -+/** \relates plumed -+ \brief Constructor for the global interface. -+ -+ \note Equivalent to plumed_create(), but initialize a static global plumed object -+*/ -+void plumed_gcreate(void); -+ -+/** \relates plumed -+ \brief Tells to the global interface to execute a command. -+ -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like plumed_gcmd("A","B"), -+ but for some choice of key it can change the content -+ -+ \note Equivalent to plumed_cmd(), but skipping the plumed argument -+*/ -+void plumed_gcmd(const char* key,const void* val); -+ -+/** \relates plumed -+ \brief Destructor for the global interface. 
-+ -+ \note Equivalent to plumed_finalize(), but skipping the plumed argument -+*/ -+void plumed_gfinalize(void); -+ -+/* routines to convert char handler from/to plumed objects */ -+ -+/** \related plumed -+ \brief Converts a C handler to a FORTRAN handler -+ -+ \param p The C handler -+ \param c The FORTRAN handler (a char[32]) -+*/ -+void plumed_c2f(plumed p,char* c); -+ -+/** \related plumed -+ \brief Converts a FORTRAN handler to a C handler -+ \param c The FORTRAN handler (a char[32]) -+ \return The C handler -+*/ -+plumed plumed_f2c(const char* c); -+ -+#ifdef __cplusplus -+ } -+#endif -+ -+#ifdef __cplusplus -+ -+/* this is to include the NULL pointer */ -+#include -+ -+/* C++ interface is hidden in PLMD namespace (same as plumed library) */ -+namespace PLMD { -+ -+/** -+ C++ wrapper for \ref plumed. -+ -+ This class provides a C++ interface to PLUMED. -+*/ -+ -+class Plumed{ -+ plumed main; -+/** -+ keeps track if the object was created from scratch using -+ the defaults destructor (cloned=false) or if it was imported -+ from C or FORTRAN (cloned-true). In the latter case, the -+ plumed_finalize() method is not called when destructing the object, -+ since it is expected to be finalized in the C/FORTRAN code -+*/ -+ bool cloned; -+public: -+/** -+ Check if plumed is installed (for runtime binding) -+ \return true if plumed is installed, false otherwise -+*/ -+ static bool installed(); -+/** -+ Check if global-plumed has been initialized -+ \return true if global plumed object (see global()) is initialized (i.e. if gcreate() has been -+ called), false otherwise. -+*/ -+ static bool ginitialized(); -+/** -+ Initialize global-plumed -+*/ -+ static void gcreate(); -+/** -+ Send a command to global-plumed -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like gcmd("A","B"), -+ but for some choice of key it can change the content -+*/ -+ static void gcmd(const char* key,const void* val); -+/** -+ Finalize global-plumed -+*/ -+ static void gfinalize(); -+/** -+ Returns the Plumed global object -+ \return The Plumed global object -+*/ -+ static Plumed global(); -+/** -+ Constructor -+*/ -+ Plumed(); -+/** -+ Clone a Plumed object from a FORTRAN char* handler -+ \param c The FORTRAN handler (a char[32]). -+ -+ \attention The Plumed object created in this manner -+ will not finalize the corresponding plumed structure. -+ It is expected that the FORTRAN code calls plumed_c_finalize for it -+*/ -+ Plumed(const char*c); -+/** -+ Clone a Plumed object from a C plumed structure -+ \param p The C plumed structure. -+ -+ \attention The Plumed object created in this manner -+ will not finalize the corresponding plumed structure. -+ It is expected that the C code calls plumed_finalize for it -+*/ -+ Plumed(plumed p); -+private: -+/** Copy constructor is disabled (private and unimplemented) -+ The problem here is that after copying it will not be clear who is -+ going to finalize the corresponding plumed structure. -+*/ -+ Plumed(const Plumed&); -+/** Assignment operator is disabled (private and unimplemented) -+ The problem here is that after copying it will not be clear who is -+ going to finalize the corresponding plumed structure. -+*/ -+ Plumed&operator=(const Plumed&); -+public: -+/** -+ Retrieve the C plumed structure for this object -+*/ -+ operator plumed()const; -+/** -+ Retrieve a FORTRAN handler for this object -+ \param c The FORTRAN handler (a char[32]). 
-+*/ -+ void toFortran(char*c)const; -+/** -+ Send a command to this plumed object -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like p.cmd("A","B"), -+ but for some choice of key it can change the content -+*/ -+ void cmd(const char*key,const void*val=NULL); -+/** -+ Destructor -+ -+ Destructor is virtual so as to allow correct inheritance from Plumed object. -+ To avoid linking problems with g++, I specify "inline" also here (in principle -+ it should be enough to specify it down in the definition of the function, but -+ for some reason that I do not understand g++ does not inline it properly in that -+ case and complains when Plumed.h is included but Plumed.o is not linked. Anyway, the -+ way it is done here seems to work properly). -+*/ -+ inline virtual ~Plumed(); -+}; -+ -+/* All methods are inlined so as to avoid the compilation of an extra c++ file */ -+ -+inline -+bool Plumed::installed(){ -+ return plumed_installed(); -+} -+ -+inline -+Plumed::Plumed(): -+ main(plumed_create()), -+ cloned(false) -+{} -+ -+inline -+Plumed::Plumed(const char*c): -+ main(plumed_f2c(c)), -+ cloned(true) -+{} -+ -+inline -+Plumed::Plumed(plumed p): -+ main(p), -+ cloned(true) -+{} -+ -+inline -+Plumed::operator plumed()const{ -+ return main; -+} -+ -+inline -+void Plumed::toFortran(char*c)const{ -+ plumed_c2f(main,c); -+} -+ -+inline -+void Plumed::cmd(const char*key,const void*val){ -+ plumed_cmd(main,key,val); -+} -+ -+inline -+Plumed::~Plumed(){ -+ if(!cloned)plumed_finalize(main); -+} -+ -+inline -+bool Plumed::ginitialized(){ -+ return plumed_ginitialized(); -+} -+ -+inline -+void Plumed::gcreate(){ -+ plumed_gcreate(); -+} -+ -+inline -+void Plumed::gcmd(const char* key,const void* val){ -+ plumed_gcmd(key,val); -+} -+ -+inline -+void Plumed::gfinalize(){ -+ plumed_gfinalize(); -+} -+ -+inline -+Plumed Plumed::global(){ -+ return plumed_global(); -+} -+ -+} -+ -+#endif -+ -+ -+#endif -diff --git a/Plumed.inc b/Plumed.inc -new file mode 100644 -index 0000000..e1e29a7 ---- /dev/null -+++ b/Plumed.inc -@@ -0,0 +1,3 @@ -+# PLUMED: shared installation -+PLUMED_LOAD= /apps/all/PLUMED/2.1.3-foss-2015g/lib/plumed///src/lib/libplumed.so -ldl -+PLUMED_DEPENDENCIES= /apps/all/PLUMED/2.1.3-foss-2015g/lib/plumed///src/lib/libplumed.so -diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt -index 6db37e2..cc97aa8 100644 ---- a/src/gromacs/CMakeLists.txt -+++ b/src/gromacs/CMakeLists.txt -@@ -32,6 +32,8 @@ - # To help us fund GROMACS development, we humbly ask that you cite - # the research papers on the package. Check out http://www.gromacs.org. - -+include(${CMAKE_SOURCE_DIR}/Plumed.cmake) -+ - set(LIBGROMACS_SOURCES) - - function (gmx_install_headers DESTINATION) -@@ -189,7 +191,7 @@ target_link_libraries(libgromacs - ${TNG_IO_LIBRARIES} - ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} - ${XML_LIBRARIES} -- ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}) -+ ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${PLUMED_LOAD}) - set_target_properties(libgromacs PROPERTIES - OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" - SOVERSION ${LIBRARY_SOVERSION} -diff --git a/src/gromacs/CMakeLists.txt.preplumed b/src/gromacs/CMakeLists.txt.preplumed -new file mode 100644 -index 0000000..6db37e2 ---- /dev/null -+++ b/src/gromacs/CMakeLists.txt.preplumed -@@ -0,0 +1,232 @@ -+# -+# This file is part of the GROMACS molecular simulation package. 
-+# -+# Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by -+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+# and including many others, as listed in the AUTHORS file in the -+# top-level source directory and at http://www.gromacs.org. -+# -+# GROMACS is free software; you can redistribute it and/or -+# modify it under the terms of the GNU Lesser General Public License -+# as published by the Free Software Foundation; either version 2.1 -+# of the License, or (at your option) any later version. -+# -+# GROMACS is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+# Lesser General Public License for more details. -+# -+# You should have received a copy of the GNU Lesser General Public -+# License along with GROMACS; if not, see -+# http://www.gnu.org/licenses, or write to the Free Software Foundation, -+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+# -+# If you want to redistribute modifications to GROMACS, please -+# consider that scientific software is very special. Version -+# control is crucial - bugs must be traceable. We will be happy to -+# consider code for inclusion in the official distribution, but -+# derived work must not be called official GROMACS. Details are found -+# in the README & COPYING files - if they are missing, get the -+# official version at http://www.gromacs.org. -+# -+# To help us fund GROMACS development, we humbly ask that you cite -+# the research papers on the package. Check out http://www.gromacs.org. -+ -+set(LIBGROMACS_SOURCES) -+ -+function (gmx_install_headers DESTINATION) -+ if (NOT GMX_BUILD_MDRUN_ONLY) -+ if (DESTINATION) -+ set(DESTINATION ${INCL_INSTALL_DIR}/gromacs/${DESTINATION}) -+ else() -+ set(DESTINATION ${INCL_INSTALL_DIR}/gromacs) -+ endif() -+ install(FILES ${ARGN} DESTINATION ${DESTINATION} COMPONENT development) -+ endif() -+endfunction () -+ -+if(GMX_USE_TNG) -+ option(GMX_EXTERNAL_TNG "Use external TNG instead of compiling the version shipped with GROMACS." -+ OFF) -+ # Detect TNG if GMX_EXTERNAL_TNG is explicitly ON -+ if(GMX_EXTERNAL_TNG) -+ find_package(TNG_IO 1.6.0) -+ if(NOT TNG_IO_FOUND) -+ message(FATAL_ERROR -+ "TNG >= 1.6.0 not found. " -+ "You can set GMX_EXTERNAL_TNG=OFF to compile TNG.") -+ endif() -+ include_directories(${TNG_IO_INCLUDE_DIRS}) -+ endif() -+ if(NOT GMX_EXTERNAL_TNG) -+ include(${CMAKE_SOURCE_DIR}/src/external/tng_io/BuildTNG.cmake) -+ tng_get_source_list(TNG_SOURCES TNG_IO_DEFINITIONS) -+ list(APPEND LIBGROMACS_SOURCES ${TNG_SOURCES}) -+ tng_set_source_properties(WITH_ZLIB ${HAVE_ZLIB}) -+ -+ if (HAVE_ZLIB) -+ list(APPEND GMX_EXTRA_LIBRARIES ${ZLIB_LIBRARIES}) -+ include_directories(${ZLIB_INCLUDE_DIRS}) -+ endif() -+ endif() -+else() -+ # We still need to get tng/tng_io_fwd.h from somewhere! 
-+ include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/tng_io/include) -+endif() -+ -+add_subdirectory(gmxlib) -+add_subdirectory(mdlib) -+add_subdirectory(gmxpreprocess) -+add_subdirectory(commandline) -+add_subdirectory(fft) -+add_subdirectory(linearalgebra) -+add_subdirectory(math) -+add_subdirectory(random) -+add_subdirectory(onlinehelp) -+add_subdirectory(options) -+add_subdirectory(timing) -+add_subdirectory(utility) -+add_subdirectory(fileio) -+add_subdirectory(swap) -+add_subdirectory(essentialdynamics) -+add_subdirectory(pulling) -+add_subdirectory(simd) -+add_subdirectory(imd) -+if (NOT GMX_BUILD_MDRUN_ONLY) -+ add_subdirectory(legacyheaders) -+ add_subdirectory(gmxana) -+ add_subdirectory(statistics) -+ add_subdirectory(analysisdata) -+ add_subdirectory(selection) -+ add_subdirectory(trajectoryanalysis) -+ add_subdirectory(tools) -+endif() -+ -+list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES}) -+ -+# This would be the standard way to include thread_mpi, but -+# we want libgromacs to link the functions directly -+#if(GMX_THREAD_MPI) -+# add_subdirectory(thread_mpi) -+#endif() -+#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB}) -+ -+tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src) -+list(APPEND LIBGROMACS_SOURCES ${THREAD_MPI_SOURCES}) -+ -+file(GLOB LIBGROMACS_HEADERS *.h) -+configure_file(version.h.cmakein version.h) -+gmx_install_headers("" ${LIBGROMACS_HEADERS}) -+gmx_install_headers("" ${CMAKE_CURRENT_BINARY_DIR}/version.h) -+ -+# Add target that generates baseversion-gen.c every time make is run -+# if git version info is requested, or create it statically. -+# This code is here instead of utility/CMakeLists.txt because CMake -+# ignores set_source_file_properties from subdirectories. -+set(GENERATED_VERSION_FILE -+ ${CMAKE_CURRENT_BINARY_DIR}/utility/baseversion-gen.c) -+set(GENERATED_VERSION_FILE_SOURCE -+ ${CMAKE_CURRENT_SOURCE_DIR}/utility/baseversion-gen.c.cmakein) -+if (GMX_GIT_VERSION_INFO) -+ add_custom_target(gmx-version ALL -+ COMMAND ${CMAKE_COMMAND} -+ -D GIT_EXECUTABLE="${GIT_EXECUTABLE}" -+ -D PROJECT_VERSION="${PROJECT_VERSION}" -+ -D PROJECT_SOURCE_DIR="${PROJECT_SOURCE_DIR}" -+ -D VERSION_CMAKEIN=${GENERATED_VERSION_FILE_SOURCE} -+ -D VERSION_OUT=${GENERATED_VERSION_FILE} -+ -P ${CMAKE_SOURCE_DIR}/cmake/gmxGenerateVersionInfo.cmake -+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} -+ DEPENDS ${GENERATED_VERSION_FILE_SOURCE} -+ COMMENT "Generating git version information") -+ set_source_files_properties(${GENERATED_VERSION_FILE} -+ PROPERTIES GENERATED true) -+else() -+ set(GMX_PROJECT_VERSION_STR ${PROJECT_VERSION}) -+ configure_file(${GENERATED_VERSION_FILE_SOURCE} ${GENERATED_VERSION_FILE}) -+endif() -+list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE}) -+ -+# apply gcc 4.4.x bug workaround -+if(GMX_USE_GCC44_BUG_WORKAROUND) -+ include(gmxGCC44O3BugWorkaround) -+ gmx_apply_gcc44_bug_workaround("gmxlib/bondfree.c") -+ gmx_apply_gcc44_bug_workaround("mdlib/force.c") -+ gmx_apply_gcc44_bug_workaround("mdlib/constr.c") -+endif() -+ -+add_library(libgromacs ${LIBGROMACS_SOURCES}) -+if (GMX_GIT_VERSION_INFO) -+ add_dependencies(libgromacs gmx-version) -+endif() -+ -+# Recent versions of gcc and clang give warnings on scanner.cpp, which -+# is a generated source file. These are awkward to suppress inline, so -+# we do it in the compilation command (after testing that the compiler -+# supports the suppressions). 
Setting the properties only works after -+# the related target has been created, e.g. after when the file is -+# used with add_library(). -+include(CheckCXXCompilerFlag) -+check_cxx_compiler_flag(-Wno-unused-parameter HAS_NO_UNUSED_PARAMETER) -+if (HAS_NO_UNUSED_PARAMETER) -+ set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-unused-parameter") -+endif() -+check_cxx_compiler_flag(-Wno-deprecated-register HAS_NO_DEPRECATED_REGISTER) -+if (HAS_NO_DEPRECATED_REGISTER) -+ set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated-register") -+else() -+ check_cxx_compiler_flag(-Wno-deprecated HAS_NO_DEPRECATED) -+ if (HAS_NO_DEPRECATED) -+ set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated") -+ endif() -+endif() -+set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}") -+ -+target_link_libraries(libgromacs -+ ${EXTRAE_LIBRARIES} -+ ${GMX_GPU_LIBRARIES} -+ ${GMX_EXTRA_LIBRARIES} -+ ${TNG_IO_LIBRARIES} -+ ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} -+ ${XML_LIBRARIES} -+ ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}) -+set_target_properties(libgromacs PROPERTIES -+ OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" -+ SOVERSION ${LIBRARY_SOVERSION} -+ VERSION ${LIBRARY_VERSION} -+ COMPILE_FLAGS "${OpenMP_C_FLAGS}") -+ -+# Only install the library in mdrun-only mode if it is actually necessary -+# for the binary -+if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS) -+ install(TARGETS libgromacs -+ LIBRARY DESTINATION ${LIB_INSTALL_DIR} -+ RUNTIME DESTINATION ${BIN_INSTALL_DIR} -+ ARCHIVE DESTINATION ${LIB_INSTALL_DIR} -+ COMPONENT libraries) -+endif() -+ -+if (NOT GMX_BUILD_MDRUN_ONLY) -+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgromacs.pc.cmakein -+ ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc @ONLY) -+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc -+ DESTINATION ${LIB_INSTALL_DIR}/pkgconfig -+ RENAME "libgromacs${GMX_LIBS_SUFFIX}.pc" -+ COMPONENT development) -+endif() -+ -+if (INSTALL_CUDART_LIB) #can be set manual by user -+ if (GMX_GPU) -+ foreach(CUDA_LIB ${CUDA_LIBRARIES}) -+ string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB}) -+ if(IS_CUDART) #libcuda should not be installed -+ #install also name-links (linker uses those) -+ file(GLOB CUDA_LIBS ${CUDA_LIB}*) -+ install(FILES ${CUDA_LIBS} DESTINATION -+ ${LIB_INSTALL_DIR} COMPONENT libraries) -+ endif() -+ endforeach() -+ else() -+ message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU") -+ endif() -+endif() -diff --git a/src/gromacs/mdlib/force.c b/src/gromacs/mdlib/force.c -index 5230983..8227d5b 100644 ---- a/src/gromacs/mdlib/force.c -+++ b/src/gromacs/mdlib/force.c -@@ -67,6 +67,14 @@ - #include "gromacs/timing/wallcycle.h" - #include "gmx_fatal.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+int plumedswitch=0; -+plumed plumedmain; -+void(*plumedcmd)(plumed,const char*,const void*)=NULL; -+/* END PLUMED */ -+ -+ - void ns(FILE *fp, - t_forcerec *fr, - matrix box, -@@ -737,6 +745,13 @@ void do_force_lowlevel(FILE *fplog, gmx_int64_t step, - pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); - } - -+ /* PLUMED */ -+ if(plumedswitch){ -+ int plumedNeedsEnergy; -+ (*plumedcmd)(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ if(!plumedNeedsEnergy) (*plumedcmd)(plumedmain,"performCalc",NULL); -+ } -+ /* END PLUMED */ - } - - void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) -diff --git a/src/gromacs/mdlib/force.c.preplumed b/src/gromacs/mdlib/force.c.preplumed -new file mode 
100644 -index 0000000..5230983 ---- /dev/null -+++ b/src/gromacs/mdlib/force.c.preplumed -@@ -0,0 +1,1018 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. -+ * Copyright (c) 2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. 
-+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include -+#include -+#include "sysstuff.h" -+#include "typedefs.h" -+#include "macros.h" -+#include "gromacs/utility/smalloc.h" -+#include "macros.h" -+#include "physics.h" -+#include "force.h" -+#include "nonbonded.h" -+#include "names.h" -+#include "network.h" -+#include "pbc.h" -+#include "ns.h" -+#include "nrnb.h" -+#include "bondf.h" -+#include "mshift.h" -+#include "txtdump.h" -+#include "coulomb.h" -+#include "pme.h" -+#include "mdrun.h" -+#include "domdec.h" -+#include "qmmm.h" -+#include "gmx_omp_nthreads.h" -+ -+#include "gromacs/timing/wallcycle.h" -+#include "gmx_fatal.h" -+ -+void ns(FILE *fp, -+ t_forcerec *fr, -+ matrix box, -+ gmx_groups_t *groups, -+ gmx_localtop_t *top, -+ t_mdatoms *md, -+ t_commrec *cr, -+ t_nrnb *nrnb, -+ gmx_bool bFillGrid, -+ gmx_bool bDoLongRangeNS) -+{ -+ char *ptr; -+ int nsearch; -+ -+ -+ if (!fr->ns.nblist_initialized) -+ { -+ init_neighbor_list(fp, fr, md->homenr); -+ } -+ -+ if (fr->bTwinRange) -+ { -+ fr->nlr = 0; -+ } -+ -+ nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md, -+ bFillGrid, bDoLongRangeNS); -+ if (debug) -+ { -+ fprintf(debug, "nsearch = %d\n", nsearch); -+ } -+ -+ /* Check whether we have to do dynamic load balancing */ -+ /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0)) -+ count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr, -+ &(top->idef),opts->ngener); -+ */ -+ if (fr->ns.dump_nl > 0) -+ { -+ dump_nblist(fp, cr, fr, fr->ns.dump_nl); -+ } -+} -+ -+static void reduce_thread_forces(int n, rvec *f, -+ tensor vir_q, tensor vir_lj, -+ real *Vcorr_q, real *Vcorr_lj, -+ real *dvdl_q, real *dvdl_lj, -+ int nthreads, f_thread_t *f_t) -+{ -+ int t, i; -+ int nthreads_loop gmx_unused; -+ -+ /* This reduction can run over any number of threads */ -+ nthreads_loop = gmx_omp_nthreads_get(emntBonded); -+#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static) -+ for (i = 0; i < n; i++) -+ { -+ for (t = 1; t < nthreads; t++) -+ { -+ rvec_inc(f[i], f_t[t].f[i]); -+ } -+ } -+ for (t = 1; t < nthreads; t++) -+ { -+ *Vcorr_q += f_t[t].Vcorr_q; -+ *Vcorr_lj += f_t[t].Vcorr_lj; -+ *dvdl_q += f_t[t].dvdl[efptCOUL]; -+ *dvdl_lj += f_t[t].dvdl[efptVDW]; -+ m_add(vir_q, f_t[t].vir_q, vir_q); -+ m_add(vir_lj, f_t[t].vir_lj, vir_lj); -+ } -+} -+ -+void gmx_print_sepdvdl(FILE *fplog, const char *s, real v, real dvdlambda) -+{ -+ fprintf(fplog, " %-30s V %12.5e dVdl %12.5e\n", s, v, dvdlambda); -+} -+ -+void do_force_lowlevel(FILE *fplog, gmx_int64_t step, -+ t_forcerec *fr, t_inputrec *ir, -+ t_idef *idef, t_commrec *cr, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ t_mdatoms *md, -+ rvec x[], history_t *hist, -+ rvec f[], -+ rvec f_longrange[], -+ gmx_enerdata_t *enerd, -+ t_fcdata *fcd, -+ gmx_localtop_t *top, -+ gmx_genborn_t *born, -+ t_atomtypes *atype, -+ gmx_bool bBornRadii, -+ matrix box, -+ t_lambda *fepvals, -+ real *lambda, -+ t_graph *graph, -+ t_blocka *excl, -+ rvec mu_tot[], -+ int flags, -+ float *cycles_pme) -+{ -+ int i, j; -+ int donb_flags; -+ gmx_bool bDoEpot, bSepDVDL, bSB; -+ int pme_flags; -+ matrix boxs; -+ rvec box_size; -+ t_pbc pbc; -+ char buf[22]; -+ double clam_i, vlam_i; -+ real dvdl_dum[efptNR], dvdl_nb[efptNR], lam_i[efptNR]; -+ real dvdl_q, dvdl_lj; -+ -+#ifdef GMX_MPI -+ double t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */ -+#endif -+ -+#define PRINT_SEPDVDL(s, v, dvdlambda) if (bSepDVDL) { gmx_print_sepdvdl(fplog, s, v, dvdlambda); } -+ -+ set_pbc(&pbc, fr->ePBC, box); -+ -+ /* reset 
free energy components */ -+ for (i = 0; i < efptNR; i++) -+ { -+ dvdl_nb[i] = 0; -+ dvdl_dum[i] = 0; -+ } -+ -+ /* Reset box */ -+ for (i = 0; (i < DIM); i++) -+ { -+ box_size[i] = box[i][i]; -+ } -+ -+ bSepDVDL = (fr->bSepDVDL && do_per_step(step, ir->nstlog)); -+ debug_gmx(); -+ -+ /* do QMMM first if requested */ -+ if (fr->bQMMM) -+ { -+ enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr); -+ } -+ -+ if (bSepDVDL) -+ { -+ fprintf(fplog, "Step %s: non-bonded V and dVdl for rank %d:\n", -+ gmx_step_str(step, buf), cr->nodeid); -+ } -+ -+ /* Call the short range functions all in one go. */ -+ -+#ifdef GMX_MPI -+ /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/ -+#define TAKETIME FALSE -+ if (TAKETIME) -+ { -+ MPI_Barrier(cr->mpi_comm_mygroup); -+ t0 = MPI_Wtime(); -+ } -+#endif -+ -+ if (ir->nwall) -+ { -+ /* foreign lambda component for walls */ -+ real dvdl_walls = do_walls(ir, fr, box, md, x, f, lambda[efptVDW], -+ enerd->grpp.ener[egLJSR], nrnb); -+ PRINT_SEPDVDL("Walls", 0.0, dvdl_walls); -+ enerd->dvdl_lin[efptVDW] += dvdl_walls; -+ } -+ -+ /* If doing GB, reset dvda and calculate the Born radii */ -+ if (ir->implicit_solvent) -+ { -+ wallcycle_sub_start(wcycle, ewcsNONBONDED); -+ -+ for (i = 0; i < born->nr; i++) -+ { -+ fr->dvda[i] = 0; -+ } -+ -+ if (bBornRadii) -+ { -+ calc_gb_rad(cr, fr, ir, top, x, &(fr->gblist), born, md, nrnb); -+ } -+ -+ wallcycle_sub_stop(wcycle, ewcsNONBONDED); -+ } -+ -+ where(); -+ /* We only do non-bonded calculation with group scheme here, the verlet -+ * calls are done from do_force_cutsVERLET(). */ -+ if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED)) -+ { -+ donb_flags = 0; -+ /* Add short-range interactions */ -+ donb_flags |= GMX_NONBONDED_DO_SR; -+ -+ /* Currently all group scheme kernels always calculate (shift-)forces */ -+ if (flags & GMX_FORCE_FORCES) -+ { -+ donb_flags |= GMX_NONBONDED_DO_FORCE; -+ } -+ if (flags & GMX_FORCE_VIRIAL) -+ { -+ donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; -+ } -+ if (flags & GMX_FORCE_ENERGY) -+ { -+ donb_flags |= GMX_NONBONDED_DO_POTENTIAL; -+ } -+ if (flags & GMX_FORCE_DO_LR) -+ { -+ donb_flags |= GMX_NONBONDED_DO_LR; -+ } -+ -+ wallcycle_sub_start(wcycle, ewcsNONBONDED); -+ do_nonbonded(fr, x, f, f_longrange, md, excl, -+ &enerd->grpp, nrnb, -+ lambda, dvdl_nb, -1, -1, donb_flags); -+ -+ /* If we do foreign lambda and we have soft-core interactions -+ * we have to recalculate the (non-linear) energies contributions. -+ */ -+ if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) -+ { -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ for (j = 0; j < efptNR; j++) -+ { -+ lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); -+ } -+ reset_foreign_enerdata(enerd); -+ do_nonbonded(fr, x, f, f_longrange, md, excl, -+ &(enerd->foreign_grpp), nrnb, -+ lam_i, dvdl_dum, -1, -1, -+ (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA); -+ sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); -+ enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; -+ } -+ } -+ wallcycle_sub_stop(wcycle, ewcsNONBONDED); -+ where(); -+ } -+ -+ /* If we are doing GB, calculate bonded forces and apply corrections -+ * to the solvation forces */ -+ /* MRS: Eventually, many need to include free energy contribution here! 
*/ -+ if (ir->implicit_solvent) -+ { -+ wallcycle_sub_start(wcycle, ewcsBONDED); -+ calc_gb_forces(cr, md, born, top, x, f, fr, idef, -+ ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd); -+ wallcycle_sub_stop(wcycle, ewcsBONDED); -+ } -+ -+#ifdef GMX_MPI -+ if (TAKETIME) -+ { -+ t1 = MPI_Wtime(); -+ fr->t_fnbf += t1-t0; -+ } -+#endif -+ -+ if (fepvals->sc_alpha != 0) -+ { -+ enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; -+ } -+ else -+ { -+ enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; -+ } -+ -+ if (fepvals->sc_alpha != 0) -+ -+ /* even though coulomb part is linear, we already added it, beacuse we -+ need to go through the vdw calculation anyway */ -+ { -+ enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; -+ } -+ else -+ { -+ enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; -+ } -+ -+ if (bSepDVDL) -+ { -+ real V_short_range = 0; -+ real dvdl_short_range = 0; -+ -+ for (i = 0; i < enerd->grpp.nener; i++) -+ { -+ V_short_range += -+ (fr->bBHAM ? -+ enerd->grpp.ener[egBHAMSR][i] : -+ enerd->grpp.ener[egLJSR][i]) -+ + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i]; -+ } -+ dvdl_short_range = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL]; -+ PRINT_SEPDVDL("VdW and Coulomb SR particle-p.", -+ V_short_range, -+ dvdl_short_range); -+ } -+ debug_gmx(); -+ -+ -+ if (debug) -+ { -+ pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS); -+ } -+ -+ /* Shift the coordinates. Must be done before bonded forces and PPPM, -+ * but is also necessary for SHAKE and update, therefore it can NOT -+ * go when no bonded forces have to be evaluated. -+ */ -+ -+ /* Here sometimes we would not need to shift with NBFonly, -+ * but we do so anyhow for consistency of the returned coordinates. -+ */ -+ if (graph) -+ { -+ shift_self(graph, box, x); -+ if (TRICLINIC(box)) -+ { -+ inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes); -+ } -+ else -+ { -+ inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes); -+ } -+ } -+ /* Check whether we need to do bondeds or correct for exclusions */ -+ if (fr->bMolPBC && -+ ((flags & GMX_FORCE_BONDED) -+ || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))) -+ { -+ /* Since all atoms are in the rectangular or triclinic unit-cell, -+ * only single box vector shifts (2 in x) are required. -+ */ -+ set_pbc_dd(&pbc, fr->ePBC, cr->dd, TRUE, box); -+ } -+ debug_gmx(); -+ -+ if (flags & GMX_FORCE_BONDED) -+ { -+ wallcycle_sub_start(wcycle, ewcsBONDED); -+ calc_bonds(fplog, cr->ms, -+ idef, x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd, -+ DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born, -+ flags, -+ fr->bSepDVDL && do_per_step(step, ir->nstlog), step); -+ -+ /* Check if we have to determine energy differences -+ * at foreign lambda's. -+ */ -+ if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && -+ idef->ilsort != ilsortNO_FE) -+ { -+ if (idef->ilsort != ilsortFE_SORTED) -+ { -+ gmx_incons("The bonded interactions are not sorted for free energy"); -+ } -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ reset_foreign_enerdata(enerd); -+ for (j = 0; j < efptNR; j++) -+ { -+ lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); -+ } -+ calc_bonds_lambda(fplog, idef, x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md, -+ fcd, DOMAINDECOMP(cr) ? 
cr->dd->gatindex : NULL); -+ sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); -+ enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; -+ } -+ } -+ debug_gmx(); -+ -+ wallcycle_sub_stop(wcycle, ewcsBONDED); -+ } -+ -+ where(); -+ -+ *cycles_pme = 0; -+ if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)) -+ { -+ real Vlr = 0, Vcorr = 0; -+ real dvdl_long_range = 0; -+ int status = 0; -+ -+ bSB = (ir->nwall == 2); -+ if (bSB) -+ { -+ copy_mat(box, boxs); -+ svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]); -+ box_size[ZZ] *= ir->wall_ewald_zfac; -+ } -+ } -+ -+ /* Do long-range electrostatics and/or LJ-PME, including related short-range -+ * corrections. -+ */ -+ -+ clear_mat(fr->vir_el_recip); -+ clear_mat(fr->vir_lj_recip); -+ -+ if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)) -+ { -+ real Vlr_q = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0; -+ real dvdl_long_range_q = 0, dvdl_long_range_lj = 0; -+ int status = 0; -+ -+ if (EEL_PME_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype)) -+ { -+ real dvdl_long_range_correction_q = 0; -+ real dvdl_long_range_correction_lj = 0; -+ /* With the Verlet scheme exclusion forces are calculated -+ * in the non-bonded kernel. -+ */ -+ /* The TPI molecule does not have exclusions with the rest -+ * of the system and no intra-molecular PME grid -+ * contributions will be calculated in -+ * gmx_pme_calc_energy. -+ */ -+ if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) || -+ ir->ewald_geometry != eewg3D || -+ ir->epsilon_surface != 0) -+ { -+ int nthreads, t; -+ -+ wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); -+ -+ if (fr->n_tpi > 0) -+ { -+ gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); -+ } -+ -+ nthreads = gmx_omp_nthreads_get(emntBonded); -+#pragma omp parallel for num_threads(nthreads) schedule(static) -+ for (t = 0; t < nthreads; t++) -+ { -+ int s, e, i; -+ rvec *fnv; -+ tensor *vir_q, *vir_lj; -+ real *Vcorrt_q, *Vcorrt_lj, *dvdlt_q, *dvdlt_lj; -+ if (t == 0) -+ { -+ fnv = fr->f_novirsum; -+ vir_q = &fr->vir_el_recip; -+ vir_lj = &fr->vir_lj_recip; -+ Vcorrt_q = &Vcorr_q; -+ Vcorrt_lj = &Vcorr_lj; -+ dvdlt_q = &dvdl_long_range_correction_q; -+ dvdlt_lj = &dvdl_long_range_correction_lj; -+ } -+ else -+ { -+ fnv = fr->f_t[t].f; -+ vir_q = &fr->f_t[t].vir_q; -+ vir_lj = &fr->f_t[t].vir_lj; -+ Vcorrt_q = &fr->f_t[t].Vcorr_q; -+ Vcorrt_lj = &fr->f_t[t].Vcorr_lj; -+ dvdlt_q = &fr->f_t[t].dvdl[efptCOUL]; -+ dvdlt_lj = &fr->f_t[t].dvdl[efptVDW]; -+ for (i = 0; i < fr->natoms_force; i++) -+ { -+ clear_rvec(fnv[i]); -+ } -+ clear_mat(*vir_q); -+ clear_mat(*vir_lj); -+ } -+ *dvdlt_q = 0; -+ *dvdlt_lj = 0; -+ -+ ewald_LRcorrection(fr->excl_load[t], fr->excl_load[t+1], -+ cr, t, fr, -+ md->chargeA, md->chargeB, -+ md->sqrt_c6A, md->sqrt_c6B, -+ md->sigmaA, md->sigmaB, -+ md->sigma3A, md->sigma3B, -+ md->nChargePerturbed || md->nTypePerturbed, -+ ir->cutoff_scheme != ecutsVERLET, -+ excl, x, bSB ? 
boxs : box, mu_tot, -+ ir->ewald_geometry, -+ ir->epsilon_surface, -+ fnv, *vir_q, *vir_lj, -+ Vcorrt_q, Vcorrt_lj, -+ lambda[efptCOUL], lambda[efptVDW], -+ dvdlt_q, dvdlt_lj); -+ } -+ if (nthreads > 1) -+ { -+ reduce_thread_forces(fr->natoms_force, fr->f_novirsum, -+ fr->vir_el_recip, fr->vir_lj_recip, -+ &Vcorr_q, &Vcorr_lj, -+ &dvdl_long_range_correction_q, -+ &dvdl_long_range_correction_lj, -+ nthreads, fr->f_t); -+ } -+ wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); -+ } -+ -+ if (EEL_PME_EWALD(fr->eeltype) && fr->n_tpi == 0) -+ { -+ Vcorr_q += ewald_charge_correction(cr, fr, lambda[efptCOUL], box, -+ &dvdl_long_range_correction_q, -+ fr->vir_el_recip); -+ } -+ -+ PRINT_SEPDVDL("Ewald excl./charge/dip. corr.", Vcorr_q, dvdl_long_range_correction_q); -+ PRINT_SEPDVDL("Ewald excl. corr. LJ", Vcorr_lj, dvdl_long_range_correction_lj); -+ enerd->dvdl_lin[efptCOUL] += dvdl_long_range_correction_q; -+ enerd->dvdl_lin[efptVDW] += dvdl_long_range_correction_lj; -+ } -+ -+ if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))) -+ { -+ if (cr->duty & DUTY_PME) -+ { -+ /* Do reciprocal PME for Coulomb and/or LJ. */ -+ assert(fr->n_tpi >= 0); -+ if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) -+ { -+ pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE; -+ if (EEL_PME(fr->eeltype)) -+ { -+ pme_flags |= GMX_PME_DO_COULOMB; -+ } -+ if (EVDW_PME(fr->vdwtype)) -+ { -+ pme_flags |= GMX_PME_DO_LJ; -+ } -+ if (flags & GMX_FORCE_FORCES) -+ { -+ pme_flags |= GMX_PME_CALC_F; -+ } -+ if (flags & GMX_FORCE_VIRIAL) -+ { -+ pme_flags |= GMX_PME_CALC_ENER_VIR; -+ } -+ if (fr->n_tpi > 0) -+ { -+ /* We don't calculate f, but we do want the potential */ -+ pme_flags |= GMX_PME_CALC_POT; -+ } -+ wallcycle_start(wcycle, ewcPMEMESH); -+ status = gmx_pme_do(fr->pmedata, -+ 0, md->homenr - fr->n_tpi, -+ x, fr->f_novirsum, -+ md->chargeA, md->chargeB, -+ md->sqrt_c6A, md->sqrt_c6B, -+ md->sigmaA, md->sigmaB, -+ bSB ? boxs : box, cr, -+ DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0, -+ DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0, -+ nrnb, wcycle, -+ fr->vir_el_recip, fr->ewaldcoeff_q, -+ fr->vir_lj_recip, fr->ewaldcoeff_lj, -+ &Vlr_q, &Vlr_lj, -+ lambda[efptCOUL], lambda[efptVDW], -+ &dvdl_long_range_q, &dvdl_long_range_lj, pme_flags); -+ *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH); -+ if (status != 0) -+ { -+ gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status); -+ } -+ /* We should try to do as little computation after -+ * this as possible, because parallel PME synchronizes -+ * the nodes, so we want all load imbalance of the -+ * rest of the force calculation to be before the PME -+ * call. DD load balancing is done on the whole time -+ * of the force call (without PME). -+ */ -+ } -+ if (fr->n_tpi > 0) -+ { -+ if (EVDW_PME(ir->vdwtype)) -+ { -+ -+ gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME"); -+ } -+ /* Determine the PME grid energy of the test molecule -+ * with the PME grid potential of the other charges. 
-+ */ -+ gmx_pme_calc_energy(fr->pmedata, fr->n_tpi, -+ x + md->homenr - fr->n_tpi, -+ md->chargeA + md->homenr - fr->n_tpi, -+ &Vlr_q); -+ } -+ PRINT_SEPDVDL("PME mesh", Vlr_q + Vlr_lj, dvdl_long_range_q+dvdl_long_range_lj); -+ } -+ } -+ -+ if (!EEL_PME(fr->eeltype) && EEL_PME_EWALD(fr->eeltype)) -+ { -+ Vlr_q = do_ewald(ir, x, fr->f_novirsum, -+ md->chargeA, md->chargeB, -+ box_size, cr, md->homenr, -+ fr->vir_el_recip, fr->ewaldcoeff_q, -+ lambda[efptCOUL], &dvdl_long_range_q, fr->ewald_table); -+ PRINT_SEPDVDL("Ewald long-range", Vlr_q, dvdl_long_range_q); -+ } -+ -+ /* Note that with separate PME nodes we get the real energies later */ -+ enerd->dvdl_lin[efptCOUL] += dvdl_long_range_q; -+ enerd->dvdl_lin[efptVDW] += dvdl_long_range_lj; -+ enerd->term[F_COUL_RECIP] = Vlr_q + Vcorr_q; -+ enerd->term[F_LJ_RECIP] = Vlr_lj + Vcorr_lj; -+ if (debug) -+ { -+ fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n", -+ Vlr_q, Vcorr_q, enerd->term[F_COUL_RECIP]); -+ pr_rvecs(debug, 0, "vir_el_recip after corr", fr->vir_el_recip, DIM); -+ pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS); -+ fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n", -+ Vlr_lj, Vcorr_lj, enerd->term[F_LJ_RECIP]); -+ pr_rvecs(debug, 0, "vir_lj_recip after corr", fr->vir_lj_recip, DIM); -+ } -+ } -+ else -+ { -+ /* Is there a reaction-field exclusion correction needed? */ -+ if (EEL_RF(fr->eeltype) && eelRF_NEC != fr->eeltype) -+ { -+ /* With the Verlet scheme, exclusion forces are calculated -+ * in the non-bonded kernel. -+ */ -+ if (ir->cutoff_scheme != ecutsVERLET) -+ { -+ real dvdl_rf_excl = 0; -+ enerd->term[F_RF_EXCL] = -+ RF_excl_correction(fr, graph, md, excl, x, f, -+ fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl); -+ -+ enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl; -+ PRINT_SEPDVDL("RF exclusion correction", -+ enerd->term[F_RF_EXCL], dvdl_rf_excl); -+ } -+ } -+ } -+ where(); -+ debug_gmx(); -+ -+ if (debug) -+ { -+ print_nrnb(debug, nrnb); -+ } -+ debug_gmx(); -+ -+#ifdef GMX_MPI -+ if (TAKETIME) -+ { -+ t2 = MPI_Wtime(); -+ MPI_Barrier(cr->mpi_comm_mygroup); -+ t3 = MPI_Wtime(); -+ fr->t_wait += t3-t2; -+ if (fr->timesteps == 11) -+ { -+ fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n", -+ cr->nodeid, gmx_step_str(fr->timesteps, buf), -+ 100*fr->t_wait/(fr->t_wait+fr->t_fnbf), -+ (fr->t_fnbf+fr->t_wait)/fr->t_fnbf); -+ } -+ fr->timesteps++; -+ } -+#endif -+ -+ if (debug) -+ { -+ pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); -+ } -+ -+} -+ -+void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) -+{ -+ int i, n2; -+ -+ for (i = 0; i < F_NRE; i++) -+ { -+ enerd->term[i] = 0; -+ enerd->foreign_term[i] = 0; -+ } -+ -+ -+ for (i = 0; i < efptNR; i++) -+ { -+ enerd->dvdl_lin[i] = 0; -+ enerd->dvdl_nonlin[i] = 0; -+ } -+ -+ n2 = ngener*ngener; -+ if (debug) -+ { -+ fprintf(debug, "Creating %d sized group matrix for energies\n", n2); -+ } -+ enerd->grpp.nener = n2; -+ enerd->foreign_grpp.nener = n2; -+ for (i = 0; (i < egNR); i++) -+ { -+ snew(enerd->grpp.ener[i], n2); -+ snew(enerd->foreign_grpp.ener[i], n2); -+ } -+ -+ if (n_lambda) -+ { -+ enerd->n_lambda = 1 + n_lambda; -+ snew(enerd->enerpart_lambda, enerd->n_lambda); -+ } -+ else -+ { -+ enerd->n_lambda = 0; -+ } -+} -+ -+void destroy_enerdata(gmx_enerdata_t *enerd) -+{ -+ int i; -+ -+ for (i = 0; (i < egNR); i++) -+ { -+ sfree(enerd->grpp.ener[i]); -+ } -+ -+ for (i = 0; (i < egNR); i++) -+ { -+ 
sfree(enerd->foreign_grpp.ener[i]); -+ } -+ -+ if (enerd->n_lambda) -+ { -+ sfree(enerd->enerpart_lambda); -+ } -+} -+ -+static real sum_v(int n, real v[]) -+{ -+ real t; -+ int i; -+ -+ t = 0.0; -+ for (i = 0; (i < n); i++) -+ { -+ t = t + v[i]; -+ } -+ -+ return t; -+} -+ -+void sum_epot(gmx_grppairener_t *grpp, real *epot) -+{ -+ int i; -+ -+ /* Accumulate energies */ -+ epot[F_COUL_SR] = sum_v(grpp->nener, grpp->ener[egCOULSR]); -+ epot[F_LJ] = sum_v(grpp->nener, grpp->ener[egLJSR]); -+ epot[F_LJ14] = sum_v(grpp->nener, grpp->ener[egLJ14]); -+ epot[F_COUL14] = sum_v(grpp->nener, grpp->ener[egCOUL14]); -+ epot[F_COUL_LR] = sum_v(grpp->nener, grpp->ener[egCOULLR]); -+ epot[F_LJ_LR] = sum_v(grpp->nener, grpp->ener[egLJLR]); -+ /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */ -+ epot[F_GBPOL] += sum_v(grpp->nener, grpp->ener[egGB]); -+ -+/* lattice part of LR doesnt belong to any group -+ * and has been added earlier -+ */ -+ epot[F_BHAM] = sum_v(grpp->nener, grpp->ener[egBHAMSR]); -+ epot[F_BHAM_LR] = sum_v(grpp->nener, grpp->ener[egBHAMLR]); -+ -+ epot[F_EPOT] = 0; -+ for (i = 0; (i < F_EPOT); i++) -+ { -+ if (i != F_DISRESVIOL && i != F_ORIRESDEV) -+ { -+ epot[F_EPOT] += epot[i]; -+ } -+ } -+} -+ -+void sum_dhdl(gmx_enerdata_t *enerd, real *lambda, t_lambda *fepvals) -+{ -+ int i, j, index; -+ double dlam; -+ -+ enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW]; /* include dispersion correction */ -+ enerd->term[F_DVDL] = 0.0; -+ for (i = 0; i < efptNR; i++) -+ { -+ if (fepvals->separate_dvdl[i]) -+ { -+ /* could this be done more readably/compactly? */ -+ switch (i) -+ { -+ case (efptMASS): -+ index = F_DKDL; -+ break; -+ case (efptCOUL): -+ index = F_DVDL_COUL; -+ break; -+ case (efptVDW): -+ index = F_DVDL_VDW; -+ break; -+ case (efptBONDED): -+ index = F_DVDL_BONDED; -+ break; -+ case (efptRESTRAINT): -+ index = F_DVDL_RESTRAINT; -+ break; -+ default: -+ index = F_DVDL; -+ break; -+ } -+ enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; -+ if (debug) -+ { -+ fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n", -+ efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); -+ } -+ } -+ else -+ { -+ enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; -+ if (debug) -+ { -+ fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n", -+ efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); -+ } -+ } -+ } -+ -+ /* Notes on the foreign lambda free energy difference evaluation: -+ * Adding the potential and ekin terms that depend linearly on lambda -+ * as delta lam * dvdl to the energy differences is exact. -+ * For the constraints this is not exact, but we have no other option -+ * without literally changing the lengths and reevaluating the energies at each step. -+ * (try to remedy this post 4.6 - MRS) -+ * For the non-bonded LR term we assume that the soft-core (if present) -+ * no longer affects the energy beyond the short-range cut-off, -+ * which is a very good approximation (except for exotic settings). -+ * (investigate how to overcome this post 4.6 - MRS) -+ */ -+ if (fepvals->separate_dvdl[efptBONDED]) -+ { -+ enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR]; -+ } -+ else -+ { -+ enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR]; -+ } -+ enerd->term[F_DVDL_CONSTR] = 0; -+ -+ for (i = 0; i < fepvals->n_lambda; i++) -+ { -+ /* note we are iterating over fepvals here! 
-+ For the current lam, dlam = 0 automatically, -+ so we don't need to add anything to the -+ enerd->enerpart_lambda[0] */ -+ -+ /* we don't need to worry about dvdl_lin contributions to dE at -+ current lambda, because the contributions to the current -+ lambda are automatically zeroed */ -+ -+ for (j = 0; j < efptNR; j++) -+ { -+ /* Note that this loop is over all dhdl components, not just the separated ones */ -+ dlam = (fepvals->all_lambda[j][i]-lambda[j]); -+ enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j]; -+ if (debug) -+ { -+ fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n", -+ fepvals->all_lambda[j][i], efpt_names[j], -+ (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]), -+ dlam, enerd->dvdl_lin[j]); -+ } -+ } -+ } -+} -+ -+ -+void reset_foreign_enerdata(gmx_enerdata_t *enerd) -+{ -+ int i, j; -+ -+ /* First reset all foreign energy components. Foreign energies always called on -+ neighbor search steps */ -+ for (i = 0; (i < egNR); i++) -+ { -+ for (j = 0; (j < enerd->grpp.nener); j++) -+ { -+ enerd->foreign_grpp.ener[i][j] = 0.0; -+ } -+ } -+ -+ /* potential energy components */ -+ for (i = 0; (i <= F_EPOT); i++) -+ { -+ enerd->foreign_term[i] = 0.0; -+ } -+} -+ -+void reset_enerdata(t_forcerec *fr, gmx_bool bNS, -+ gmx_enerdata_t *enerd, -+ gmx_bool bMaster) -+{ -+ gmx_bool bKeepLR; -+ int i, j; -+ -+ /* First reset all energy components, except for the long range terms -+ * on the master at non neighbor search steps, since the long range -+ * terms have already been summed at the last neighbor search step. -+ */ -+ bKeepLR = (fr->bTwinRange && !bNS); -+ for (i = 0; (i < egNR); i++) -+ { -+ if (!(bKeepLR && bMaster && (i == egCOULLR || i == egLJLR))) -+ { -+ for (j = 0; (j < enerd->grpp.nener); j++) -+ { -+ enerd->grpp.ener[i][j] = 0.0; -+ } -+ } -+ } -+ for (i = 0; i < efptNR; i++) -+ { -+ enerd->dvdl_lin[i] = 0.0; -+ enerd->dvdl_nonlin[i] = 0.0; -+ } -+ -+ /* Normal potential energy components */ -+ for (i = 0; (i <= F_EPOT); i++) -+ { -+ enerd->term[i] = 0.0; -+ } -+ /* Initialize the dVdlambda term with the long range contribution */ -+ /* Initialize the dvdl term with the long range contribution */ -+ enerd->term[F_DVDL] = 0.0; -+ enerd->term[F_DVDL_COUL] = 0.0; -+ enerd->term[F_DVDL_VDW] = 0.0; -+ enerd->term[F_DVDL_BONDED] = 0.0; -+ enerd->term[F_DVDL_RESTRAINT] = 0.0; -+ enerd->term[F_DKDL] = 0.0; -+ if (enerd->n_lambda > 0) -+ { -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ enerd->enerpart_lambda[i] = 0.0; -+ } -+ } -+ /* reset foreign energy data - separate function since we also call it elsewhere */ -+ reset_foreign_enerdata(enerd); -+} -diff --git a/src/gromacs/mdlib/minimize.c b/src/gromacs/mdlib/minimize.c -index 69008f5..5114fa0 100644 ---- a/src/gromacs/mdlib/minimize.c -+++ b/src/gromacs/mdlib/minimize.c -@@ -80,6 +80,13 @@ - #include "gromacs/timing/walltime_accounting.h" - #include "gromacs/imd/imd.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+extern void(*plumedcmd)(plumed,const char*,const void*); -+/* END PLUMED */ -+ - typedef struct { - t_state s; - rvec *f; -@@ -442,6 +449,43 @@ void init_em(FILE *fplog, const char *title, - - clear_rvec(mu_tot); - calc_shifts(ems->s.box, fr->shift_vec); -+ -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(cr->ms && cr->ms->nsim>1) { -+ if(MASTER(cr)) (*plumedcmd) (plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"GREX 
setMPIIntracomm",&cr->dd->mpi_comm_all); -+ }else{ -+ (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); -+ } -+ } -+ (*plumedcmd) (plumedmain,"GREX init",NULL); -+ } -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); -+ }else{ -+ (*plumedcmd) (plumedmain,"setMPIComm",&cr->mpi_comm_mysim); -+ } -+ } -+ (*plumedcmd) (plumedmain,"setNatoms",&top_global->natoms); -+ (*plumedcmd) (plumedmain,"setMDEngine","gromacs"); -+ (*plumedcmd) (plumedmain,"setLog",fplog); -+ real real_delta_t; -+ real_delta_t=ir->delta_t; -+ (*plumedcmd) (plumedmain,"setTimestep",&real_delta_t); -+ (*plumedcmd) (plumedmain,"init",NULL); -+ -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ } -+ } -+ /* END PLUMED */ - } - - static void finish_em(t_commrec *cr, gmx_mdoutf_t outf, -@@ -737,12 +781,34 @@ static void evaluate_energy(FILE *fplog, t_commrec *cr, - em_dd_partition_system(fplog, count, cr, top_global, inputrec, - ems, top, mdatoms, fr, vsite, constr, - nrnb, wcycle); -+ /* PLUMED */ -+ if(plumedswitch){ -+ (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ /* END PLUMED */ - } - - /* Calc force & energy on new trial position */ - /* do_force always puts the charge groups in the box and shifts again - * We do not unshift, so molecules are always whole in congrad.c - */ -+ /* PLUMED */ -+ int plumedNeedsEnergy=0; -+ matrix plumed_vir; -+ if(plumedswitch){ -+ long int lstep=count; (*plumedcmd)(plumedmain,"setStepLong",&count); -+ (*plumedcmd) (plumedmain,"setPositions",&ems->s.x[0][0]); -+ (*plumedcmd) (plumedmain,"setMasses",&mdatoms->massT[0]); -+ (*plumedcmd) (plumedmain,"setCharges",&mdatoms->chargeA[0]); -+ (*plumedcmd) (plumedmain,"setBox",&ems->s.box[0][0]); -+ (*plumedcmd) (plumedmain,"prepareCalc",NULL); -+ (*plumedcmd) (plumedmain,"setForces",&ems->f[0][0]); -+ (*plumedcmd) (plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ clear_mat(plumed_vir); -+ (*plumedcmd) (plumedmain,"setVirial",&plumed_vir[0][0]); -+ } -+ /* END PLUMED */ - do_force(fplog, cr, inputrec, - count, nrnb, wcycle, top, &top_global->groups, - ems->s.box, ems->s.x, &ems->s.hist, -@@ -751,6 +817,19 @@ static void evaluate_energy(FILE *fplog, t_commrec *cr, - GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | - GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | - (bNS ? GMX_FORCE_NS | GMX_FORCE_DO_LR : 0)); -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(plumedNeedsEnergy) { -+ msmul(force_vir,2.0,plumed_vir); -+ (*plumedcmd) (plumedmain,"setEnergy",&enerd->term[F_EPOT]); -+ (*plumedcmd) (plumedmain,"performCalc",NULL); -+ msmul(plumed_vir,0.5,force_vir); -+ } else { -+ msmul(plumed_vir,0.5,plumed_vir); -+ m_add(force_vir,plumed_vir,force_vir); -+ } -+ } -+ /* END PLUMED */ - - /* Clear the unused shake virial and pressure */ - clear_mat(shake_vir); -diff --git a/src/gromacs/mdlib/minimize.c.preplumed b/src/gromacs/mdlib/minimize.c.preplumed -new file mode 100644 -index 0000000..69008f5 ---- /dev/null -+++ b/src/gromacs/mdlib/minimize.c.preplumed -@@ -0,0 +1,2906 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. 
-+ * Copyright (c) 2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include -+#include -+#include "sysstuff.h" -+#include "gromacs/utility/cstringutil.h" -+#include "network.h" -+#include "gromacs/utility/smalloc.h" -+#include "nrnb.h" -+#include "main.h" -+#include "force.h" -+#include "macros.h" -+#include "names.h" -+#include "gmx_fatal.h" -+#include "txtdump.h" -+#include "typedefs.h" -+#include "update.h" -+#include "constr.h" -+#include "vec.h" -+#include "tgroup.h" -+#include "mdebin.h" -+#include "vsite.h" -+#include "force.h" -+#include "mdrun.h" -+#include "md_support.h" -+#include "sim_util.h" -+#include "domdec.h" -+#include "mdatoms.h" -+#include "ns.h" -+#include "mtop_util.h" -+#include "pme.h" -+#include "bondf.h" -+#include "gmx_omp_nthreads.h" -+#include "md_logging.h" -+ -+#include "gromacs/fileio/confio.h" -+#include "gromacs/fileio/trajectory_writing.h" -+#include "gromacs/linearalgebra/mtxio.h" -+#include "gromacs/linearalgebra/sparsematrix.h" -+#include "gromacs/timing/wallcycle.h" -+#include "gromacs/timing/walltime_accounting.h" -+#include "gromacs/imd/imd.h" -+ -+typedef struct { -+ t_state s; -+ rvec *f; -+ real epot; -+ real fnorm; -+ real fmax; -+ int a_fmax; -+} em_state_t; -+ -+static em_state_t *init_em_state() -+{ -+ em_state_t *ems; -+ -+ snew(ems, 1); -+ -+ /* does this need to be here? Should the array be declared differently (staticaly)in the state definition? 
*/ -+ snew(ems->s.lambda, efptNR); -+ -+ return ems; -+} -+ -+static void print_em_start(FILE *fplog, -+ t_commrec *cr, -+ gmx_walltime_accounting_t walltime_accounting, -+ gmx_wallcycle_t wcycle, -+ const char *name) -+{ -+ walltime_accounting_start(walltime_accounting); -+ wallcycle_start(wcycle, ewcRUN); -+ print_start(fplog, cr, walltime_accounting, name); -+} -+static void em_time_end(gmx_walltime_accounting_t walltime_accounting, -+ gmx_wallcycle_t wcycle) -+{ -+ wallcycle_stop(wcycle, ewcRUN); -+ -+ walltime_accounting_end(walltime_accounting); -+} -+ -+static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) -+{ -+ fprintf(out, "\n"); -+ fprintf(out, "%s:\n", minimizer); -+ fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); -+ fprintf(out, " Number of steps = %12d\n", nsteps); -+} -+ -+static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain) -+{ -+ char buffer[2048]; -+ if (bLastStep) -+ { -+ sprintf(buffer, -+ "\nEnergy minimization reached the maximum number " -+ "of steps before the forces reached the requested " -+ "precision Fmax < %g.\n", ftol); -+ } -+ else -+ { -+ sprintf(buffer, -+ "\nEnergy minimization has stopped, but the forces have " -+ "not converged to the requested precision Fmax < %g (which " -+ "may not be possible for your system). It stopped " -+ "because the algorithm tried to make a new step whose size " -+ "was too small, or there was no change in the energy since " -+ "last step. Either way, we regard the minimization as " -+ "converged to within the available machine precision, " -+ "given your starting configuration and EM parameters.\n%s%s", -+ ftol, -+ sizeof(real) < sizeof(double) ? -+ "\nDouble precision normally gives you higher accuracy, but " -+ "this is often not needed for preparing to run molecular " -+ "dynamics.\n" : -+ "", -+ bConstrain ? -+ "You might need to increase your constraint accuracy, or turn\n" -+ "off constraints altogether (set constraints = none in mdp file)\n" : -+ ""); -+ } -+ fputs(wrap_lines(buffer, 78, 0, FALSE), fp); -+} -+ -+ -+ -+static void print_converged(FILE *fp, const char *alg, real ftol, -+ gmx_int64_t count, gmx_bool bDone, gmx_int64_t nsteps, -+ real epot, real fmax, int nfmax, real fnorm) -+{ -+ char buf[STEPSTRSIZE]; -+ -+ if (bDone) -+ { -+ fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", -+ alg, ftol, gmx_step_str(count, buf)); -+ } -+ else if (count < nsteps) -+ { -+ fprintf(fp, "\n%s converged to machine precision in %s steps,\n" -+ "but did not reach the requested Fmax < %g.\n", -+ alg, gmx_step_str(count, buf), ftol); -+ } -+ else -+ { -+ fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", -+ alg, ftol, gmx_step_str(count, buf)); -+ } -+ -+#ifdef GMX_DOUBLE -+ fprintf(fp, "Potential Energy = %21.14e\n", epot); -+ fprintf(fp, "Maximum force = %21.14e on atom %d\n", fmax, nfmax+1); -+ fprintf(fp, "Norm of force = %21.14e\n", fnorm); -+#else -+ fprintf(fp, "Potential Energy = %14.7e\n", epot); -+ fprintf(fp, "Maximum force = %14.7e on atom %d\n", fmax, nfmax+1); -+ fprintf(fp, "Norm of force = %14.7e\n", fnorm); -+#endif -+} -+ -+static void get_f_norm_max(t_commrec *cr, -+ t_grpopts *opts, t_mdatoms *mdatoms, rvec *f, -+ real *fnorm, real *fmax, int *a_fmax) -+{ -+ double fnorm2, *sum; -+ real fmax2, fmax2_0, fam; -+ int la_max, a_max, start, end, i, m, gf; -+ -+ /* This routine finds the largest force and returns it. -+ * On parallel machines the global max is taken. 
-+ */ -+ fnorm2 = 0; -+ fmax2 = 0; -+ la_max = -1; -+ gf = 0; -+ start = 0; -+ end = mdatoms->homenr; -+ if (mdatoms->cFREEZE) -+ { -+ for (i = start; i < end; i++) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ fam = 0; -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ fam += sqr(f[i][m]); -+ } -+ } -+ fnorm2 += fam; -+ if (fam > fmax2) -+ { -+ fmax2 = fam; -+ la_max = i; -+ } -+ } -+ } -+ else -+ { -+ for (i = start; i < end; i++) -+ { -+ fam = norm2(f[i]); -+ fnorm2 += fam; -+ if (fam > fmax2) -+ { -+ fmax2 = fam; -+ la_max = i; -+ } -+ } -+ } -+ -+ if (la_max >= 0 && DOMAINDECOMP(cr)) -+ { -+ a_max = cr->dd->gatindex[la_max]; -+ } -+ else -+ { -+ a_max = la_max; -+ } -+ if (PAR(cr)) -+ { -+ snew(sum, 2*cr->nnodes+1); -+ sum[2*cr->nodeid] = fmax2; -+ sum[2*cr->nodeid+1] = a_max; -+ sum[2*cr->nnodes] = fnorm2; -+ gmx_sumd(2*cr->nnodes+1, sum, cr); -+ fnorm2 = sum[2*cr->nnodes]; -+ /* Determine the global maximum */ -+ for (i = 0; i < cr->nnodes; i++) -+ { -+ if (sum[2*i] > fmax2) -+ { -+ fmax2 = sum[2*i]; -+ a_max = (int)(sum[2*i+1] + 0.5); -+ } -+ } -+ sfree(sum); -+ } -+ -+ if (fnorm) -+ { -+ *fnorm = sqrt(fnorm2); -+ } -+ if (fmax) -+ { -+ *fmax = sqrt(fmax2); -+ } -+ if (a_fmax) -+ { -+ *a_fmax = a_max; -+ } -+} -+ -+static void get_state_f_norm_max(t_commrec *cr, -+ t_grpopts *opts, t_mdatoms *mdatoms, -+ em_state_t *ems) -+{ -+ get_f_norm_max(cr, opts, mdatoms, ems->f, &ems->fnorm, &ems->fmax, &ems->a_fmax); -+} -+ -+void init_em(FILE *fplog, const char *title, -+ t_commrec *cr, t_inputrec *ir, -+ t_state *state_global, gmx_mtop_t *top_global, -+ em_state_t *ems, gmx_localtop_t **top, -+ rvec **f, rvec **f_global, -+ t_nrnb *nrnb, rvec mu_tot, -+ t_forcerec *fr, gmx_enerdata_t **enerd, -+ t_graph **graph, t_mdatoms *mdatoms, gmx_global_stat_t *gstat, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int nfile, const t_filenm fnm[], -+ gmx_mdoutf_t *outf, t_mdebin **mdebin, -+ int imdport, unsigned long gmx_unused Flags, -+ gmx_wallcycle_t wcycle) -+{ -+ int i; -+ real dvdl_constr; -+ -+ if (fplog) -+ { -+ fprintf(fplog, "Initiating %s\n", title); -+ } -+ -+ state_global->ngtc = 0; -+ -+ /* Initialize lambda variables */ -+ initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, NULL); -+ -+ init_nrnb(nrnb); -+ -+ /* Interactive molecular dynamics */ -+ init_IMD(ir, cr, top_global, fplog, 1, state_global->x, -+ nfile, fnm, NULL, imdport, Flags); -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ *top = dd_init_local_top(top_global); -+ -+ dd_init_local_state(cr->dd, state_global, &ems->s); -+ -+ *f = NULL; -+ -+ /* Distribute the charge groups over the nodes from the master node */ -+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ &ems->s, &ems->f, mdatoms, *top, -+ fr, vsite, NULL, constr, -+ nrnb, NULL, FALSE); -+ dd_store_state(cr->dd, &ems->s); -+ -+ if (ir->nstfout) -+ { -+ snew(*f_global, top_global->natoms); -+ } -+ else -+ { -+ *f_global = NULL; -+ } -+ *graph = NULL; -+ } -+ else -+ { -+ snew(*f, top_global->natoms); -+ -+ /* Just copy the state */ -+ ems->s = *state_global; -+ snew(ems->s.x, ems->s.nalloc); -+ snew(ems->f, ems->s.nalloc); -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(state_global->x[i], ems->s.x[i]); -+ } -+ copy_mat(state_global->box, ems->s.box); -+ -+ *top = gmx_mtop_generate_local_top(top_global, ir); -+ *f_global = *f; -+ -+ forcerec_set_excl_load(fr, *top); -+ -+ setup_bonded_threading(fr, &(*top)->idef); -+ -+ if (ir->ePBC != epbcNONE && !fr->bMolPBC) -+ { -+ *graph = 
mk_graph(fplog, &((*top)->idef), 0, top_global->natoms, FALSE, FALSE); -+ } -+ else -+ { -+ *graph = NULL; -+ } -+ -+ atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms); -+ update_mdatoms(mdatoms, state_global->lambda[efptFEP]); -+ -+ if (vsite) -+ { -+ set_vsite_top(vsite, *top, mdatoms, cr); -+ } -+ } -+ -+ if (constr) -+ { -+ if (ir->eConstrAlg == econtSHAKE && -+ gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) -+ { -+ gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", -+ econstr_names[econtSHAKE], econstr_names[econtLINCS]); -+ } -+ -+ if (!DOMAINDECOMP(cr)) -+ { -+ set_constraints(constr, *top, ir, mdatoms, cr); -+ } -+ -+ if (!ir->bContinuation) -+ { -+ /* Constrain the starting coordinates */ -+ dvdl_constr = 0; -+ constrain(PAR(cr) ? NULL : fplog, TRUE, TRUE, constr, &(*top)->idef, -+ ir, NULL, cr, -1, 0, 1.0, mdatoms, -+ ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box, -+ ems->s.lambda[efptFEP], &dvdl_constr, -+ NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); -+ } -+ } -+ -+ if (PAR(cr)) -+ { -+ *gstat = global_stat_init(ir); -+ } -+ -+ *outf = init_mdoutf(fplog, nfile, fnm, 0, cr, ir, top_global, NULL, wcycle); -+ -+ snew(*enerd, 1); -+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, -+ *enerd); -+ -+ if (mdebin != NULL) -+ { -+ /* Init bin for energy stuff */ -+ *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, NULL); -+ } -+ -+ clear_rvec(mu_tot); -+ calc_shifts(ems->s.box, fr->shift_vec); -+} -+ -+static void finish_em(t_commrec *cr, gmx_mdoutf_t outf, -+ gmx_walltime_accounting_t walltime_accounting, -+ gmx_wallcycle_t wcycle) -+{ -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell the PME only node to finish */ -+ gmx_pme_send_finish(cr); -+ } -+ -+ done_mdoutf(outf); -+ -+ em_time_end(walltime_accounting, wcycle); -+} -+ -+static void swap_em_state(em_state_t *ems1, em_state_t *ems2) -+{ -+ em_state_t tmp; -+ -+ tmp = *ems1; -+ *ems1 = *ems2; -+ *ems2 = tmp; -+} -+ -+static void copy_em_coords(em_state_t *ems, t_state *state) -+{ -+ int i; -+ -+ for (i = 0; (i < state->natoms); i++) -+ { -+ copy_rvec(ems->s.x[i], state->x[i]); -+ } -+} -+ -+static void write_em_traj(FILE *fplog, t_commrec *cr, -+ gmx_mdoutf_t outf, -+ gmx_bool bX, gmx_bool bF, const char *confout, -+ gmx_mtop_t *top_global, -+ t_inputrec *ir, gmx_int64_t step, -+ em_state_t *state, -+ t_state *state_global, rvec *f_global) -+{ -+ int mdof_flags; -+ gmx_bool bIMDout = FALSE; -+ -+ -+ /* Shall we do IMD output? 
*/ -+ if (ir->bIMD) -+ { -+ bIMDout = do_per_step(step, IMD_get_step(ir->imd->setup)); -+ } -+ -+ if ((bX || bF || bIMDout || confout != NULL) && !DOMAINDECOMP(cr)) -+ { -+ copy_em_coords(state, state_global); -+ f_global = state->f; -+ } -+ -+ mdof_flags = 0; -+ if (bX) -+ { -+ mdof_flags |= MDOF_X; -+ } -+ if (bF) -+ { -+ mdof_flags |= MDOF_F; -+ } -+ -+ /* If we want IMD output, set appropriate MDOF flag */ -+ if (ir->bIMD) -+ { -+ mdof_flags |= MDOF_IMD; -+ } -+ -+ mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, -+ top_global, step, (double)step, -+ &state->s, state_global, state->f, f_global); -+ -+ if (confout != NULL && MASTER(cr)) -+ { -+ if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) -+ { -+ /* Make molecules whole only for confout writing */ -+ do_pbc_mtop(fplog, ir->ePBC, state_global->box, top_global, -+ state_global->x); -+ } -+ -+ write_sto_conf_mtop(confout, -+ *top_global->name, top_global, -+ state_global->x, NULL, ir->ePBC, state_global->box); -+ } -+} -+ -+static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, -+ gmx_bool bMolPBC, -+ em_state_t *ems1, real a, rvec *f, em_state_t *ems2, -+ gmx_constr_t constr, gmx_localtop_t *top, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_int64_t count) -+ -+{ -+ t_state *s1, *s2; -+ int i; -+ int start, end; -+ rvec *x1, *x2; -+ real dvdl_constr; -+ int nthreads gmx_unused; -+ -+ s1 = &ems1->s; -+ s2 = &ems2->s; -+ -+ if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) -+ { -+ gmx_incons("state mismatch in do_em_step"); -+ } -+ -+ s2->flags = s1->flags; -+ -+ if (s2->nalloc != s1->nalloc) -+ { -+ s2->nalloc = s1->nalloc; -+ srenew(s2->x, s1->nalloc); -+ srenew(ems2->f, s1->nalloc); -+ if (s2->flags & (1<<estCGP)) -+ { -+ srenew(s2->cg_p, s1->nalloc); -+ } -+ } -+ -+ s2->natoms = s1->natoms; -+ copy_mat(s1->box, s2->box); -+ /* Copy free energy state */ -+ for (i = 0; i < efptNR; i++) -+ { -+ s2->lambda[i] = s1->lambda[i]; -+ } -+ copy_mat(s1->box, s2->box); -+ -+ start = 0; -+ end = md->homenr; -+ -+ x1 = s1->x; -+ x2 = s2->x; -+ -+ nthreads = gmx_omp_nthreads_get(emntUpdate); -+#pragma omp parallel num_threads(nthreads) -+ { -+ int gf, i, m; -+ -+ gf = 0; -+#pragma omp for schedule(static) nowait -+ for (i = start; i < end; i++) -+ { -+ if (md->cFREEZE) -+ { -+ gf = md->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (ir->opts.nFreeze[gf][m]) -+ { -+ x2[i][m] = x1[i][m]; -+ } -+ else -+ { -+ x2[i][m] = x1[i][m] + a*f[i][m]; -+ } -+ } -+ } -+ -+ if (s2->flags & (1<<estCGP)) -+ { -+ x1 = s1->cg_p; -+ x2 = s2->cg_p; -+#pragma omp for schedule(static) nowait -+ for (i = start; i < end; i++) -+ { -+ copy_rvec(x1[i], x2[i]); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ s2->ddp_count = s1->ddp_count; -+ if (s2->cg_gl_nalloc < s1->cg_gl_nalloc) -+ { -+#pragma omp barrier -+ s2->cg_gl_nalloc = s1->cg_gl_nalloc; -+ srenew(s2->cg_gl, s2->cg_gl_nalloc); -+#pragma omp barrier -+ } -+ s2->ncg_gl = s1->ncg_gl; -+#pragma omp for schedule(static) nowait -+ for (i = 0; i < s2->ncg_gl; i++) -+ { -+ s2->cg_gl[i] = s1->cg_gl[i]; -+ } -+ s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; -+ } -+ } -+ -+ if (constr) -+ { -+ wallcycle_start(wcycle, ewcCONSTR); -+ dvdl_constr = 0; -+ constrain(NULL, TRUE, TRUE, constr, &top->idef, -+ ir, NULL, cr, count, 0, 1.0, md, -+ s1->x, s2->x, NULL, bMolPBC, s2->box, -+ s2->lambda[efptBONDED], &dvdl_constr, -+ NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); -+ wallcycle_stop(wcycle, ewcCONSTR); -+ } -+} -+ -+static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr, -+ gmx_mtop_t *top_global, t_inputrec 
*ir, -+ em_state_t *ems, gmx_localtop_t *top, -+ t_mdatoms *mdatoms, t_forcerec *fr, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle) -+{ -+ /* Repartition the domain decomposition */ -+ wallcycle_start(wcycle, ewcDOMDEC); -+ dd_partition_system(fplog, step, cr, FALSE, 1, -+ NULL, top_global, ir, -+ &ems->s, &ems->f, -+ mdatoms, top, fr, vsite, NULL, constr, -+ nrnb, wcycle, FALSE); -+ dd_store_state(cr->dd, &ems->s); -+ wallcycle_stop(wcycle, ewcDOMDEC); -+} -+ -+static void evaluate_energy(FILE *fplog, t_commrec *cr, -+ gmx_mtop_t *top_global, -+ em_state_t *ems, gmx_localtop_t *top, -+ t_inputrec *inputrec, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_global_stat_t gstat, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ t_fcdata *fcd, -+ t_graph *graph, t_mdatoms *mdatoms, -+ t_forcerec *fr, rvec mu_tot, -+ gmx_enerdata_t *enerd, tensor vir, tensor pres, -+ gmx_int64_t count, gmx_bool bFirst) -+{ -+ real t; -+ gmx_bool bNS; -+ int nabnsb; -+ tensor force_vir, shake_vir, ekin; -+ real dvdl_constr, prescorr, enercorr, dvdlcorr; -+ real terminate = 0; -+ -+ /* Set the time to the initial time, the time does not change during EM */ -+ t = inputrec->init_t; -+ -+ if (bFirst || -+ (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) -+ { -+ /* This is the first state or an old state used before the last ns */ -+ bNS = TRUE; -+ } -+ else -+ { -+ bNS = FALSE; -+ if (inputrec->nstlist > 0) -+ { -+ bNS = TRUE; -+ } -+ else if (inputrec->nstlist == -1) -+ { -+ nabnsb = natoms_beyond_ns_buffer(inputrec, fr, &top->cgs, NULL, ems->s.x); -+ if (PAR(cr)) -+ { -+ gmx_sumi(1, &nabnsb, cr); -+ } -+ bNS = (nabnsb > 0); -+ } -+ } -+ -+ if (vsite) -+ { -+ construct_vsites(vsite, ems->s.x, 1, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, ems->s.box); -+ } -+ -+ if (DOMAINDECOMP(cr) && bNS) -+ { -+ /* Repartition the domain decomposition */ -+ em_dd_partition_system(fplog, count, cr, top_global, inputrec, -+ ems, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Calc force & energy on new trial position */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole in congrad.c -+ */ -+ do_force(fplog, cr, inputrec, -+ count, nrnb, wcycle, top, &top_global->groups, -+ ems->s.box, ems->s.x, &ems->s.hist, -+ ems->f, force_vir, mdatoms, enerd, fcd, -+ ems->s.lambda, graph, fr, vsite, mu_tot, t, NULL, NULL, TRUE, -+ GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | -+ GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | -+ (bNS ? 
GMX_FORCE_NS | GMX_FORCE_DO_LR : 0)); -+ -+ /* Clear the unused shake virial and pressure */ -+ clear_mat(shake_vir); -+ clear_mat(pres); -+ -+ /* Communicate stuff when parallel */ -+ if (PAR(cr) && inputrec->eI != eiNM) -+ { -+ wallcycle_start(wcycle, ewcMoveE); -+ -+ global_stat(fplog, gstat, cr, enerd, force_vir, shake_vir, mu_tot, -+ inputrec, NULL, NULL, NULL, 1, &terminate, -+ top_global, &ems->s, FALSE, -+ CGLO_ENERGY | -+ CGLO_PRESSURE | -+ CGLO_CONSTRAINT | -+ CGLO_FIRSTITERATE); -+ -+ wallcycle_stop(wcycle, ewcMoveE); -+ } -+ -+ /* Calculate long range corrections to pressure and energy */ -+ calc_dispcorr(fplog, inputrec, fr, count, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW], -+ pres, force_vir, &prescorr, &enercorr, &dvdlcorr); -+ enerd->term[F_DISPCORR] = enercorr; -+ enerd->term[F_EPOT] += enercorr; -+ enerd->term[F_PRES] += prescorr; -+ enerd->term[F_DVDL] += dvdlcorr; -+ -+ ems->epot = enerd->term[F_EPOT]; -+ -+ if (constr) -+ { -+ /* Project out the constraint components of the force */ -+ wallcycle_start(wcycle, ewcCONSTR); -+ dvdl_constr = 0; -+ constrain(NULL, FALSE, FALSE, constr, &top->idef, -+ inputrec, NULL, cr, count, 0, 1.0, mdatoms, -+ ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box, -+ ems->s.lambda[efptBONDED], &dvdl_constr, -+ NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0); -+ if (fr->bSepDVDL && fplog) -+ { -+ gmx_print_sepdvdl(fplog, "Constraints", t, dvdl_constr); -+ } -+ enerd->term[F_DVDL_CONSTR] += dvdl_constr; -+ m_add(force_vir, shake_vir, vir); -+ wallcycle_stop(wcycle, ewcCONSTR); -+ } -+ else -+ { -+ copy_mat(force_vir, vir); -+ } -+ -+ clear_mat(ekin); -+ enerd->term[F_PRES] = -+ calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); -+ -+ sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); -+ -+ if (EI_ENERGY_MINIMIZATION(inputrec->eI)) -+ { -+ get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, ems); -+ } -+} -+ -+static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, -+ gmx_mtop_t *mtop, -+ em_state_t *s_min, em_state_t *s_b) -+{ -+ rvec *fm, *fb, *fmg; -+ t_block *cgs_gl; -+ int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; -+ double partsum; -+ unsigned char *grpnrFREEZE; -+ -+ if (debug) -+ { -+ fprintf(debug, "Doing reorder_partsum\n"); -+ } -+ -+ fm = s_min->f; -+ fb = s_b->f; -+ -+ cgs_gl = dd_charge_groups_global(cr->dd); -+ index = cgs_gl->index; -+ -+ /* Collect fm in a global vector fmg. -+ * This conflicts with the spirit of domain decomposition, -+ * but to fully optimize this a much more complicated algorithm is required. 
-+ */ -+ snew(fmg, mtop->natoms); -+ -+ ncg = s_min->s.ncg_gl; -+ cg_gl = s_min->s.cg_gl; -+ i = 0; -+ for (c = 0; c < ncg; c++) -+ { -+ cg = cg_gl[c]; -+ a0 = index[cg]; -+ a1 = index[cg+1]; -+ for (a = a0; a < a1; a++) -+ { -+ copy_rvec(fm[i], fmg[a]); -+ i++; -+ } -+ } -+ gmx_sum(mtop->natoms*3, fmg[0], cr); -+ -+ /* Now we will determine the part of the sum for the cgs in state s_b */ -+ ncg = s_b->s.ncg_gl; -+ cg_gl = s_b->s.cg_gl; -+ partsum = 0; -+ i = 0; -+ gf = 0; -+ grpnrFREEZE = mtop->groups.grpnr[egcFREEZE]; -+ for (c = 0; c < ncg; c++) -+ { -+ cg = cg_gl[c]; -+ a0 = index[cg]; -+ a1 = index[cg+1]; -+ for (a = a0; a < a1; a++) -+ { -+ if (mdatoms->cFREEZE && grpnrFREEZE) -+ { -+ gf = grpnrFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; -+ } -+ } -+ i++; -+ } -+ } -+ -+ sfree(fmg); -+ -+ return partsum; -+} -+ -+static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, -+ gmx_mtop_t *mtop, -+ em_state_t *s_min, em_state_t *s_b) -+{ -+ rvec *fm, *fb; -+ double sum; -+ int gf, i, m; -+ -+ /* This is just the classical Polak-Ribiere calculation of beta; -+ * it looks a bit complicated since we take freeze groups into account, -+ * and might have to sum it in parallel runs. -+ */ -+ -+ if (!DOMAINDECOMP(cr) || -+ (s_min->s.ddp_count == cr->dd->ddp_count && -+ s_b->s.ddp_count == cr->dd->ddp_count)) -+ { -+ fm = s_min->f; -+ fb = s_b->f; -+ sum = 0; -+ gf = 0; -+ /* This part of code can be incorrect with DD, -+ * since the atom ordering in s_b and s_min might differ. -+ */ -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ sum += (fb[i][m] - fm[i][m])*fb[i][m]; -+ } -+ } -+ } -+ } -+ else -+ { -+ /* We need to reorder cgs while summing */ -+ sum = reorder_partsum(cr, opts, mdatoms, mtop, s_min, s_b); -+ } -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &sum, cr); -+ } -+ -+ return sum/sqr(s_min->fnorm); -+} -+ -+double do_cg(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ const char *CG = "Polak-Ribiere Conjugate Gradients"; -+ -+ em_state_t *s_min, *s_a, *s_b, *s_c; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ rvec *f_global, *p, *sf, *sfm; -+ double gpa, gpb, gpc, tmp, sum[2], minstep; -+ real fnormn; -+ real stepsize; -+ real a, b, c, beta = 0.0; -+ real epot_repl = 0; -+ real pnorm; -+ t_mdebin *mdebin; -+ gmx_bool converged, foundlower; -+ rvec mu_tot; -+ gmx_bool do_log = FALSE, do_ene = FALSE, do_x, do_f; -+ tensor vir, pres; -+ int number_steps, neval = 0, nstcg = inputrec->nstcgsteep; -+ gmx_mdoutf_t outf; -+ int i, m, gf, step, nminstep; -+ real terminate = 0; -+ 
-+ step = 0; -+ -+ s_min = init_em_state(); -+ s_a = init_em_state(); -+ s_b = init_em_state(); -+ s_c = init_em_state(); -+ -+ /* Init em and store the local state in s_min */ -+ init_em(fplog, CG, cr, inputrec, -+ state_global, top_global, s_min, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle); -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, CG); -+ -+ /* Max number of steps */ -+ number_steps = inputrec->nsteps; -+ -+ if (MASTER(cr)) -+ { -+ sp_header(stderr, CG, inputrec->em_tol, number_steps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, CG, inputrec->em_tol, number_steps); -+ } -+ -+ /* Call the force routine and some auxiliary (neighboursearching etc.) */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole in congrad.c -+ */ -+ evaluate_energy(fplog, cr, -+ top_global, s_min, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ where(); -+ -+ if (MASTER(cr)) -+ { -+ /* Copy stuff to the energy bin for easy printing etc. */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ where(); -+ -+ /* Estimate/guess the initial stepsize */ -+ stepsize = inputrec->em_stepsize/s_min->fnorm; -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, " F-max = %12.5e on atom %d\n", -+ s_min->fmax, s_min->a_fmax+1); -+ fprintf(stderr, " F-Norm = %12.5e\n", -+ s_min->fnorm/sqrt(state_global->natoms)); -+ fprintf(stderr, "\n"); -+ /* and copy to the log file too... */ -+ fprintf(fplog, " F-max = %12.5e on atom %d\n", -+ s_min->fmax, s_min->a_fmax+1); -+ fprintf(fplog, " F-Norm = %12.5e\n", -+ s_min->fnorm/sqrt(state_global->natoms)); -+ fprintf(fplog, "\n"); -+ } -+ /* Start the loop over CG steps. -+ * Each successful step is counted, and we continue until -+ * we either converge or reach the max number of steps. -+ */ -+ converged = FALSE; -+ for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) -+ { -+ -+ /* start taking steps in a new direction -+ * First time we enter the routine, beta=0, and the direction is -+ * simply the negative gradient. -+ */ -+ -+ /* Calculate the new direction in p, and the gradient in this direction, gpa */ -+ p = s_min->s.cg_p; -+ sf = s_min->f; -+ gpa = 0; -+ gf = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!inputrec->opts.nFreeze[gf][m]) -+ { -+ p[i][m] = sf[i][m] + beta*p[i][m]; -+ gpa -= p[i][m]*sf[i][m]; -+ /* f is negative gradient, thus the sign */ -+ } -+ else -+ { -+ p[i][m] = 0; -+ } -+ } -+ } -+ -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpa, cr); -+ } -+ -+ /* Calculate the norm of the search vector */ -+ get_f_norm_max(cr, &(inputrec->opts), mdatoms, p, &pnorm, NULL, NULL); -+ -+ /* Just in case stepsize reaches zero due to numerical precision... 
*/ -+ if (stepsize <= 0) -+ { -+ stepsize = inputrec->em_stepsize/pnorm; -+ } -+ -+ /* -+ * Double check the value of the derivative in the search direction. -+ * If it is positive it must be due to the old information in the -+ * CG formula, so just remove that and start over with beta=0. -+ * This corresponds to a steepest descent step. -+ */ -+ if (gpa > 0) -+ { -+ beta = 0; -+ step--; /* Don't count this step since we are restarting */ -+ continue; /* Go back to the beginning of the big for-loop */ -+ } -+ -+ /* Calculate minimum allowed stepsize, before the average (norm) -+ * relative change in coordinate is smaller than precision -+ */ -+ minstep = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ tmp = fabs(s_min->s.x[i][m]); -+ if (tmp < 1.0) -+ { -+ tmp = 1.0; -+ } -+ tmp = p[i][m]/tmp; -+ minstep += tmp*tmp; -+ } -+ } -+ /* Add up from all CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &minstep, cr); -+ } -+ -+ minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms)); -+ -+ if (stepsize < minstep) -+ { -+ converged = TRUE; -+ break; -+ } -+ -+ /* Write coordinates if necessary */ -+ do_x = do_per_step(step, inputrec->nstxout); -+ do_f = do_per_step(step, inputrec->nstfout); -+ -+ write_em_traj(fplog, cr, outf, do_x, do_f, NULL, -+ top_global, inputrec, step, -+ s_min, state_global, f_global); -+ -+ /* Take a step downhill. -+ * In theory, we should minimize the function along this direction. -+ * That is quite possible, but it turns out to take 5-10 function evaluations -+ * for each line. However, we dont really need to find the exact minimum - -+ * it is much better to start a new CG step in a modified direction as soon -+ * as we are close to it. This will save a lot of energy evaluations. -+ * -+ * In practice, we just try to take a single step. -+ * If it worked (i.e. lowered the energy), we increase the stepsize but -+ * the continue straight to the next CG step without trying to find any minimum. -+ * If it didn't work (higher energy), there must be a minimum somewhere between -+ * the old position and the new one. -+ * -+ * Due to the finite numerical accuracy, it turns out that it is a good idea -+ * to even accept a SMALL increase in energy, if the derivative is still downhill. -+ * This leads to lower final energies in the tests I've done. 
/ Erik -+ */ -+ s_a->epot = s_min->epot; -+ a = 0.0; -+ c = a + stepsize; /* reference position along line is zero */ -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) -+ { -+ em_dd_partition_system(fplog, step, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Take a trial step (new coords in s_c) */ -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, s_min->s.cg_p, s_c, -+ constr, top, nrnb, wcycle, -1); -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ evaluate_energy(fplog, cr, -+ top_global, s_c, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, FALSE); -+ -+ /* Calc derivative along line */ -+ p = s_c->s.cg_p; -+ sf = s_c->f; -+ gpc = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ gpc -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ -+ } -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpc, cr); -+ } -+ -+ /* This is the max amount of increase in energy we tolerate */ -+ tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot); -+ -+ /* Accept the step if the energy is lower, or if it is not significantly higher -+ * and the line derivative is still negative. -+ */ -+ if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) -+ { -+ foundlower = TRUE; -+ /* Great, we found a better energy. Increase step for next iteration -+ * if we are still going down, decrease it otherwise -+ */ -+ if (gpc < 0) -+ { -+ stepsize *= 1.618034; /* The golden section */ -+ } -+ else -+ { -+ stepsize *= 0.618034; /* 1/golden section */ -+ } -+ } -+ else -+ { -+ /* New energy is the same or higher. We will have to do some work -+ * to find a smaller value in the interval. Take smaller step next time! -+ */ -+ foundlower = FALSE; -+ stepsize *= 0.618034; -+ } -+ -+ -+ -+ -+ /* OK, if we didn't find a lower value we will have to locate one now - there must -+ * be one in the interval [a=0,c]. -+ * The same thing is valid here, though: Don't spend dozens of iterations to find -+ * the line minimum. We try to interpolate based on the derivative at the endpoints, -+ * and only continue until we find a lower value. In most cases this means 1-2 iterations. -+ * -+ * I also have a safeguard for potentially really patological functions so we never -+ * take more than 20 steps before we give up ... -+ * -+ * If we already found a lower value we just skip this step and continue to the update. -+ */ -+ if (!foundlower) -+ { -+ nminstep = 0; -+ -+ do -+ { -+ /* Select a new trial point. -+ * If the derivatives at points a & c have different sign we interpolate to zero, -+ * otherwise just do a bisection. 
-+ */ -+ if (gpa < 0 && gpc > 0) -+ { -+ b = a + gpa*(a-c)/(gpc-gpa); -+ } -+ else -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* safeguard if interpolation close to machine accuracy causes errors: -+ * never go outside the interval -+ */ -+ if (b <= a || b >= c) -+ { -+ b = 0.5*(a+c); -+ } -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) -+ { -+ /* Reload the old state */ -+ em_dd_partition_system(fplog, -1, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Take a trial step to this new point - new coords in s_b */ -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, s_min->s.cg_p, s_b, -+ constr, top, nrnb, wcycle, -1); -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ evaluate_energy(fplog, cr, -+ top_global, s_b, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, FALSE); -+ -+ /* p does not change within a step, but since the domain decomposition -+ * might change, we have to use cg_p of s_b here. -+ */ -+ p = s_b->s.cg_p; -+ sf = s_b->f; -+ gpb = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ gpb -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ -+ } -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpb, cr); -+ } -+ -+ if (debug) -+ { -+ fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", -+ s_a->epot, s_b->epot, s_c->epot, gpb); -+ } -+ -+ epot_repl = s_b->epot; -+ -+ /* Keep one of the intervals based on the value of the derivative at the new point */ -+ if (gpb > 0) -+ { -+ /* Replace c endpoint with b */ -+ swap_em_state(s_b, s_c); -+ c = b; -+ gpc = gpb; -+ } -+ else -+ { -+ /* Replace a endpoint with b */ -+ swap_em_state(s_b, s_a); -+ a = b; -+ gpa = gpb; -+ } -+ -+ /* -+ * Stop search as soon as we find a value smaller than the endpoints. -+ * Never run more than 20 steps, no matter what. -+ */ -+ nminstep++; -+ } -+ while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && -+ (nminstep < 20)); -+ -+ if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || -+ nminstep >= 20) -+ { -+ /* OK. We couldn't find a significantly lower energy. -+ * If beta==0 this was steepest descent, and then we give up. -+ * If not, set beta=0 and restart with steepest descent before quitting. -+ */ -+ if (beta == 0.0) -+ { -+ /* Converged */ -+ converged = TRUE; -+ break; -+ } -+ else -+ { -+ /* Reset memory before giving up */ -+ beta = 0.0; -+ continue; -+ } -+ } -+ -+ /* Select min energy state of A & C, put the best in B. -+ */ -+ if (s_c->epot < s_a->epot) -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", -+ s_c->epot, s_a->epot); -+ } -+ swap_em_state(s_b, s_c); -+ gpb = gpc; -+ b = c; -+ } -+ else -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", -+ s_a->epot, s_c->epot); -+ } -+ swap_em_state(s_b, s_a); -+ gpb = gpa; -+ b = a; -+ } -+ -+ } -+ else -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", -+ s_c->epot); -+ } -+ swap_em_state(s_b, s_c); -+ gpb = gpc; -+ b = c; -+ } -+ -+ /* new search direction */ -+ /* beta = 0 means forget all memory and restart with steepest descents. */ -+ if (nstcg && ((step % nstcg) == 0)) -+ { -+ beta = 0.0; -+ } -+ else -+ { -+ /* s_min->fnorm cannot be zero, because then we would have converged -+ * and broken out. -+ */ -+ -+ /* Polak-Ribiere update. 
-+ * Change to fnorm2/fnorm2_old for Fletcher-Reeves -+ */ -+ beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); -+ } -+ /* Limit beta to prevent oscillations */ -+ if (fabs(beta) > 5.0) -+ { -+ beta = 0.0; -+ } -+ -+ -+ /* update positions */ -+ swap_em_state(s_min, s_b); -+ gpa = gpb; -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", -+ step, s_min->epot, s_min->fnorm/sqrt(state_global->natoms), -+ s_min->fmax, s_min->a_fmax+1); -+ } -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ do_log = do_per_step(step, inputrec->nstlog); -+ do_ene = do_per_step(step, inputrec->nstenergy); -+ -+ /* Prepare IMD energy record, if bIMD is TRUE. */ -+ IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE); -+ -+ if (do_log) -+ { -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ } -+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, -+ do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Send energies and positions to the IMD client if bIMD is TRUE. */ -+ if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr)) -+ { -+ IMD_send_positions(inputrec->imd); -+ } -+ -+ /* Stop when the maximum force lies below tolerance. -+ * If we have reached machine precision, converged is already set to true. -+ */ -+ converged = converged || (s_min->fmax < inputrec->em_tol); -+ -+ } /* End of the loop */ -+ -+ /* IMD cleanup, if bIMD is TRUE. */ -+ IMD_finalize(inputrec->bIMD, inputrec->imd); -+ -+ if (converged) -+ { -+ step--; /* we never took that last step in this case */ -+ -+ } -+ if (s_min->fmax > inputrec->em_tol) -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); -+ warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); -+ } -+ converged = FALSE; -+ } -+ -+ if (MASTER(cr)) -+ { -+ /* If we printed energy and/or logfile last step (which was the last step) -+ * we don't have to do it again, but otherwise print the final values. -+ */ -+ if (!do_log) -+ { -+ /* Write final value to log since we didn't do anything the last step */ -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ } -+ if (!do_ene || !do_log) -+ { -+ /* Write final energy file entries */ -+ print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, -+ !do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ } -+ -+ /* Print some stuff... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ -+ /* IMPORTANT! -+ * For accurate normal mode calculation it is imperative that we -+ * store the last conformation into the full precision binary trajectory. -+ * -+ * However, we should only do it if we did NOT already write this step -+ * above (which we did if do_x or do_f was true). 
-+ */ -+ do_x = !do_per_step(step, inputrec->nstxout); -+ do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); -+ -+ write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, step, -+ s_min, state_global, f_global); -+ -+ fnormn = s_min->fnorm/sqrt(state_global->natoms); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ -+ fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ walltime_accounting_set_nsteps_done(walltime_accounting, step); -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_lbfgs(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ static const char *LBFGS = "Low-Memory BFGS Minimizer"; -+ em_state_t ems; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ rvec *f_global; -+ int ncorr, nmaxcorr, point, cp, neval, nminstep; -+ double stepsize, gpa, gpb, gpc, tmp, minstep; -+ real *rho, *alpha, *ff, *xx, *p, *s, *lastx, *lastf, **dx, **dg; -+ real *xa, *xb, *xc, *fa, *fb, *fc, *xtmp, *ftmp; -+ real a, b, c, maxdelta, delta; -+ real diag, Epot0, Epot, EpotA, EpotB, EpotC; -+ real dgdx, dgdg, sq, yr, beta; -+ t_mdebin *mdebin; -+ gmx_bool converged, first; -+ rvec mu_tot; -+ real fnorm, fmax; -+ gmx_bool do_log, do_ene, do_x, do_f, foundlower, *frozen; -+ tensor vir, pres; -+ int start, end, number_steps; -+ gmx_mdoutf_t outf; -+ int i, k, m, n, nfmax, gf, step; -+ int mdof_flags; -+ /* not used */ -+ real terminate; -+ -+ if (PAR(cr)) -+ { -+ gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n"); -+ } -+ -+ if (NULL != constr) -+ { -+ gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent)."); -+ } -+ -+ n = 3*state->natoms; -+ nmaxcorr = inputrec->nbfgscorr; -+ -+ /* Allocate memory */ -+ /* Use pointers to real so we dont have to loop over both atoms and -+ * dimensions all the time... -+ * x/f are allocated as rvec *, so make new x0/f0 pointers-to-real -+ * that point to the same memory. 
-+ */ -+ snew(xa, n); -+ snew(xb, n); -+ snew(xc, n); -+ snew(fa, n); -+ snew(fb, n); -+ snew(fc, n); -+ snew(frozen, n); -+ -+ snew(p, n); -+ snew(lastx, n); -+ snew(lastf, n); -+ snew(rho, nmaxcorr); -+ snew(alpha, nmaxcorr); -+ -+ snew(dx, nmaxcorr); -+ for (i = 0; i < nmaxcorr; i++) -+ { -+ snew(dx[i], n); -+ } -+ -+ snew(dg, nmaxcorr); -+ for (i = 0; i < nmaxcorr; i++) -+ { -+ snew(dg[i], n); -+ } -+ -+ step = 0; -+ neval = 0; -+ -+ /* Init em */ -+ init_em(fplog, LBFGS, cr, inputrec, -+ state, top_global, &ems, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle); -+ /* Do_lbfgs is not completely updated like do_steep and do_cg, -+ * so we free some memory again. -+ */ -+ sfree(ems.s.x); -+ sfree(ems.f); -+ -+ xx = (real *)state->x; -+ ff = (real *)f; -+ -+ start = 0; -+ end = mdatoms->homenr; -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS); -+ -+ do_log = do_ene = do_x = do_f = TRUE; -+ -+ /* Max number of steps */ -+ number_steps = inputrec->nsteps; -+ -+ /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ -+ gf = 0; -+ for (i = start; i < end; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ frozen[3*i+m] = inputrec->opts.nFreeze[gf][m]; -+ } -+ } -+ if (MASTER(cr)) -+ { -+ sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); -+ } -+ -+ if (vsite) -+ { -+ construct_vsites(vsite, state->x, 1, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ } -+ -+ /* Call the force routine and some auxiliary (neighboursearching etc.) */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole -+ */ -+ neval++; -+ ems.s.x = state->x; -+ ems.f = f; -+ evaluate_energy(fplog, cr, -+ top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ where(); -+ -+ if (MASTER(cr)) -+ { -+ /* Copy stuff to the energy bin for easy printing etc. */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ where(); -+ -+ /* This is the starting energy */ -+ Epot = enerd->term[F_EPOT]; -+ -+ fnorm = ems.fnorm; -+ fmax = ems.fmax; -+ nfmax = ems.a_fmax; -+ -+ /* Set the initial step. -+ * since it will be multiplied by the non-normalized search direction -+ * vector (force vector the first time), we scale it by the -+ * norm of the force. -+ */ -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); -+ fprintf(stderr, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); -+ fprintf(stderr, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); -+ fprintf(stderr, "\n"); -+ /* and copy to the log file too... 
*/ -+ fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr); -+ fprintf(fplog, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); -+ fprintf(fplog, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); -+ fprintf(fplog, "\n"); -+ } -+ -+ point = 0; -+ for (i = 0; i < n; i++) -+ { -+ if (!frozen[i]) -+ { -+ dx[point][i] = ff[i]; /* Initial search direction */ -+ } -+ else -+ { -+ dx[point][i] = 0; -+ } -+ } -+ -+ stepsize = 1.0/fnorm; -+ converged = FALSE; -+ -+ /* Start the loop over BFGS steps. -+ * Each successful step is counted, and we continue until -+ * we either converge or reach the max number of steps. -+ */ -+ -+ ncorr = 0; -+ -+ /* Set the gradient from the force */ -+ converged = FALSE; -+ for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) -+ { -+ -+ /* Write coordinates if necessary */ -+ do_x = do_per_step(step, inputrec->nstxout); -+ do_f = do_per_step(step, inputrec->nstfout); -+ -+ mdof_flags = 0; -+ if (do_x) -+ { -+ mdof_flags |= MDOF_X; -+ } -+ -+ if (do_f) -+ { -+ mdof_flags |= MDOF_F; -+ } -+ -+ if (inputrec->bIMD) -+ { -+ mdof_flags |= MDOF_IMD; -+ } -+ -+ mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, -+ top_global, step, (real)step, state, state, f, f); -+ -+ /* Do the linesearching in the direction dx[point][0..(n-1)] */ -+ -+ /* pointer to current direction - point=0 first time here */ -+ s = dx[point]; -+ -+ /* calculate line gradient */ -+ for (gpa = 0, i = 0; i < n; i++) -+ { -+ gpa -= s[i]*ff[i]; -+ } -+ -+ /* Calculate minimum allowed stepsize, before the average (norm) -+ * relative change in coordinate is smaller than precision -+ */ -+ for (minstep = 0, i = 0; i < n; i++) -+ { -+ tmp = fabs(xx[i]); -+ if (tmp < 1.0) -+ { -+ tmp = 1.0; -+ } -+ tmp = s[i]/tmp; -+ minstep += tmp*tmp; -+ } -+ minstep = GMX_REAL_EPS/sqrt(minstep/n); -+ -+ if (stepsize < minstep) -+ { -+ converged = TRUE; -+ break; -+ } -+ -+ /* Store old forces and coordinates */ -+ for (i = 0; i < n; i++) -+ { -+ lastx[i] = xx[i]; -+ lastf[i] = ff[i]; -+ } -+ Epot0 = Epot; -+ -+ first = TRUE; -+ -+ for (i = 0; i < n; i++) -+ { -+ xa[i] = xx[i]; -+ } -+ -+ /* Take a step downhill. -+ * In theory, we should minimize the function along this direction. -+ * That is quite possible, but it turns out to take 5-10 function evaluations -+ * for each line. However, we dont really need to find the exact minimum - -+ * it is much better to start a new BFGS step in a modified direction as soon -+ * as we are close to it. This will save a lot of energy evaluations. -+ * -+ * In practice, we just try to take a single step. -+ * If it worked (i.e. lowered the energy), we increase the stepsize but -+ * the continue straight to the next BFGS step without trying to find any minimum. -+ * If it didn't work (higher energy), there must be a minimum somewhere between -+ * the old position and the new one. -+ * -+ * Due to the finite numerical accuracy, it turns out that it is a good idea -+ * to even accept a SMALL increase in energy, if the derivative is still downhill. -+ * This leads to lower final energies in the tests I've done. / Erik -+ */ -+ foundlower = FALSE; -+ EpotA = Epot0; -+ a = 0.0; -+ c = a + stepsize; /* reference position along line is zero */ -+ -+ /* Check stepsize first. We do not allow displacements -+ * larger than emstep. 
-+ */ -+ do -+ { -+ c = a + stepsize; -+ maxdelta = 0; -+ for (i = 0; i < n; i++) -+ { -+ delta = c*s[i]; -+ if (delta > maxdelta) -+ { -+ maxdelta = delta; -+ } -+ } -+ if (maxdelta > inputrec->em_stepsize) -+ { -+ stepsize *= 0.1; -+ } -+ } -+ while (maxdelta > inputrec->em_stepsize); -+ -+ /* Take a trial step */ -+ for (i = 0; i < n; i++) -+ { -+ xc[i] = lastx[i] + c*s[i]; -+ } -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ ems.s.x = (rvec *)xc; -+ ems.f = (rvec *)fc; -+ evaluate_energy(fplog, cr, -+ top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, step, FALSE); -+ EpotC = ems.epot; -+ -+ /* Calc derivative along line */ -+ for (gpc = 0, i = 0; i < n; i++) -+ { -+ gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */ -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpc, cr); -+ } -+ -+ /* This is the max amount of increase in energy we tolerate */ -+ tmp = sqrt(GMX_REAL_EPS)*fabs(EpotA); -+ -+ /* Accept the step if the energy is lower, or if it is not significantly higher -+ * and the line derivative is still negative. -+ */ -+ if (EpotC < EpotA || (gpc < 0 && EpotC < (EpotA+tmp))) -+ { -+ foundlower = TRUE; -+ /* Great, we found a better energy. Increase step for next iteration -+ * if we are still going down, decrease it otherwise -+ */ -+ if (gpc < 0) -+ { -+ stepsize *= 1.618034; /* The golden section */ -+ } -+ else -+ { -+ stepsize *= 0.618034; /* 1/golden section */ -+ } -+ } -+ else -+ { -+ /* New energy is the same or higher. We will have to do some work -+ * to find a smaller value in the interval. Take smaller step next time! -+ */ -+ foundlower = FALSE; -+ stepsize *= 0.618034; -+ } -+ -+ /* OK, if we didn't find a lower value we will have to locate one now - there must -+ * be one in the interval [a=0,c]. -+ * The same thing is valid here, though: Don't spend dozens of iterations to find -+ * the line minimum. We try to interpolate based on the derivative at the endpoints, -+ * and only continue until we find a lower value. In most cases this means 1-2 iterations. -+ * -+ * I also have a safeguard for potentially really patological functions so we never -+ * take more than 20 steps before we give up ... -+ * -+ * If we already found a lower value we just skip this step and continue to the update. -+ */ -+ -+ if (!foundlower) -+ { -+ -+ nminstep = 0; -+ do -+ { -+ /* Select a new trial point. -+ * If the derivatives at points a & c have different sign we interpolate to zero, -+ * otherwise just do a bisection. 
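The bracketing rule described in the comment above (the same one used by the conjugate-gradient line search earlier in this file) is a secant step on the directional derivative with a bisection fallback. A minimal standalone sketch with illustrative names: a and c are the bracket endpoints along the search direction, and gpa, gpc are the directional derivatives at those endpoints.

    static double pick_trial_point(double a, double c, double gpa, double gpc)
    {
        double b;

        if (gpa < 0 && gpc > 0)
        {
            /* Zero of the secant through (a, gpa) and (c, gpc) */
            b = a + gpa*(a - c)/(gpc - gpa);
        }
        else
        {
            b = 0.5*(a + c);
        }
        /* Round-off can push b onto or past an endpoint; stay inside the bracket */
        if (b <= a || b >= c)
        {
            b = 0.5*(a + c);
        }
        return b;
    }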
-+ */ -+ -+ if (gpa < 0 && gpc > 0) -+ { -+ b = a + gpa*(a-c)/(gpc-gpa); -+ } -+ else -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* safeguard if interpolation close to machine accuracy causes errors: -+ * never go outside the interval -+ */ -+ if (b <= a || b >= c) -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* Take a trial step */ -+ for (i = 0; i < n; i++) -+ { -+ xb[i] = lastx[i] + b*s[i]; -+ } -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ ems.s.x = (rvec *)xb; -+ ems.f = (rvec *)fb; -+ evaluate_energy(fplog, cr, -+ top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, step, FALSE); -+ EpotB = ems.epot; -+ -+ fnorm = ems.fnorm; -+ -+ for (gpb = 0, i = 0; i < n; i++) -+ { -+ gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ -+ -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpb, cr); -+ } -+ -+ /* Keep one of the intervals based on the value of the derivative at the new point */ -+ if (gpb > 0) -+ { -+ /* Replace c endpoint with b */ -+ EpotC = EpotB; -+ c = b; -+ gpc = gpb; -+ /* swap coord pointers b/c */ -+ xtmp = xb; -+ ftmp = fb; -+ xb = xc; -+ fb = fc; -+ xc = xtmp; -+ fc = ftmp; -+ } -+ else -+ { -+ /* Replace a endpoint with b */ -+ EpotA = EpotB; -+ a = b; -+ gpa = gpb; -+ /* swap coord pointers a/b */ -+ xtmp = xb; -+ ftmp = fb; -+ xb = xa; -+ fb = fa; -+ xa = xtmp; -+ fa = ftmp; -+ } -+ -+ /* -+ * Stop search as soon as we find a value smaller than the endpoints, -+ * or if the tolerance is below machine precision. -+ * Never run more than 20 steps, no matter what. -+ */ -+ nminstep++; -+ } -+ while ((EpotB > EpotA || EpotB > EpotC) && (nminstep < 20)); -+ -+ if (fabs(EpotB-Epot0) < GMX_REAL_EPS || nminstep >= 20) -+ { -+ /* OK. We couldn't find a significantly lower energy. -+ * If ncorr==0 this was steepest descent, and then we give up. -+ * If not, reset memory to restart as steepest descent before quitting. -+ */ -+ if (ncorr == 0) -+ { -+ /* Converged */ -+ converged = TRUE; -+ break; -+ } -+ else -+ { -+ /* Reset memory */ -+ ncorr = 0; -+ /* Search in gradient direction */ -+ for (i = 0; i < n; i++) -+ { -+ dx[point][i] = ff[i]; -+ } -+ /* Reset stepsize */ -+ stepsize = 1.0/fnorm; -+ continue; -+ } -+ } -+ -+ /* Select min energy state of A & C, put the best in xx/ff/Epot -+ */ -+ if (EpotC < EpotA) -+ { -+ Epot = EpotC; -+ /* Use state C */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xc[i]; -+ ff[i] = fc[i]; -+ } -+ stepsize = c; -+ } -+ else -+ { -+ Epot = EpotA; -+ /* Use state A */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xa[i]; -+ ff[i] = fa[i]; -+ } -+ stepsize = a; -+ } -+ -+ } -+ else -+ { -+ /* found lower */ -+ Epot = EpotC; -+ /* Use state C */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xc[i]; -+ ff[i] = fc[i]; -+ } -+ stepsize = c; -+ } -+ -+ /* Update the memory information, and calculate a new -+ * approximation of the inverse hessian -+ */ -+ -+ /* Have new data in Epot, xx, ff */ -+ if (ncorr < nmaxcorr) -+ { -+ ncorr++; -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ dg[point][i] = lastf[i]-ff[i]; -+ dx[point][i] *= stepsize; -+ } -+ -+ dgdg = 0; -+ dgdx = 0; -+ for (i = 0; i < n; i++) -+ { -+ dgdg += dg[point][i]*dg[point][i]; -+ dgdx += dg[point][i]*dx[point][i]; -+ } -+ -+ diag = dgdx/dgdg; -+ -+ rho[point] = 1.0/dgdx; -+ point++; -+ -+ if (point >= nmaxcorr) -+ { -+ point = 0; -+ } -+ -+ /* Update */ -+ for (i = 0; i < n; i++) -+ { -+ p[i] = ff[i]; -+ } -+ -+ cp = point; -+ -+ /* Recursive update. 
First go back over the memory points */ -+ for (k = 0; k < ncorr; k++) -+ { -+ cp--; -+ if (cp < 0) -+ { -+ cp = ncorr-1; -+ } -+ -+ sq = 0; -+ for (i = 0; i < n; i++) -+ { -+ sq += dx[cp][i]*p[i]; -+ } -+ -+ alpha[cp] = rho[cp]*sq; -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] -= alpha[cp]*dg[cp][i]; -+ } -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] *= diag; -+ } -+ -+ /* And then go forward again */ -+ for (k = 0; k < ncorr; k++) -+ { -+ yr = 0; -+ for (i = 0; i < n; i++) -+ { -+ yr += p[i]*dg[cp][i]; -+ } -+ -+ beta = rho[cp]*yr; -+ beta = alpha[cp]-beta; -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] += beta*dx[cp][i]; -+ } -+ -+ cp++; -+ if (cp >= ncorr) -+ { -+ cp = 0; -+ } -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ if (!frozen[i]) -+ { -+ dx[point][i] = p[i]; -+ } -+ else -+ { -+ dx[point][i] = 0; -+ } -+ } -+ -+ stepsize = 1.0; -+ -+ /* Test whether the convergence criterion is met */ -+ get_f_norm_max(cr, &(inputrec->opts), mdatoms, f, &fnorm, &fmax, &nfmax); -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", -+ step, Epot, fnorm/sqrt(state->natoms), fmax, nfmax+1); -+ } -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ do_log = do_per_step(step, inputrec->nstlog); -+ do_ene = do_per_step(step, inputrec->nstenergy); -+ if (do_log) -+ { -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ } -+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, -+ do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Send x and E to IMD client, if bIMD is TRUE. */ -+ if (do_IMD(inputrec->bIMD, step, cr, TRUE, state->box, state->x, inputrec, 0, wcycle) && MASTER(cr)) -+ { -+ IMD_send_positions(inputrec->imd); -+ } -+ -+ /* Stop when the maximum force lies below tolerance. -+ * If we have reached machine precision, converged is already set to true. -+ */ -+ -+ converged = converged || (fmax < inputrec->em_tol); -+ -+ } /* End of the loop */ -+ -+ /* IMD cleanup, if bIMD is TRUE. */ -+ IMD_finalize(inputrec->bIMD, inputrec->imd); -+ -+ if (converged) -+ { -+ step--; /* we never took that last step in this case */ -+ -+ } -+ if (fmax > inputrec->em_tol) -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); -+ warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); -+ } -+ converged = FALSE; -+ } -+ -+ /* If we printed energy and/or logfile last step (which was the last step) -+ * we don't have to do it again, but otherwise print the final values. -+ */ -+ if (!do_log) /* Write final value to log since we didn't do anythin last step */ -+ { -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ } -+ if (!do_ene || !do_log) /* Write final energy file entries */ -+ { -+ print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, -+ !do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Print some stuff... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ -+ /* IMPORTANT! -+ * For accurate normal mode calculation it is imperative that we -+ * store the last conformation into the full precision binary trajectory. 
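For reference, the backward and forward passes over the stored correction pairs above are the standard L-BFGS two-loop recursion. A compact standalone sketch in textbook form (illustrative names, plain C arrays, no frozen-degree handling): given q initialized to the current gradient and the last m pairs s_k = x_{k+1} - x_k, y_k = g_{k+1} - g_k, it overwrites q with an approximation of H*g, so that -q is the quasi-Newton search direction.

    static void lbfgs_two_loop(int n, int m, double *q,
                               double **s, double **y,
                               const double *rho, /* rho[k] = 1/(y_k . s_k)           */
                               double *alpha,     /* scratch array of length m        */
                               double gamma)      /* initial Hessian guess H0=gamma*I */
    {
        int i, k;

        for (k = m - 1; k >= 0; k--)   /* newest pair to oldest */
        {
            double sq = 0.0;
            for (i = 0; i < n; i++)
            {
                sq += s[k][i]*q[i];
            }
            alpha[k] = rho[k]*sq;
            for (i = 0; i < n; i++)
            {
                q[i] -= alpha[k]*y[k][i];
            }
        }
        for (i = 0; i < n; i++)
        {
            q[i] *= gamma;             /* e.g. gamma = (s.y)/(y.y), the 'diag' above */
        }
        for (k = 0; k < m; k++)        /* oldest pair to newest */
        {
            double yr = 0.0;
            for (i = 0; i < n; i++)
            {
                yr += y[k][i]*q[i];
            }
            for (i = 0; i < n; i++)
            {
                q[i] += (alpha[k] - rho[k]*yr)*s[k][i];
            }
        }
    }

The code above applies the same recursion directly to the force vector (the negative gradient), so its result is already the new search direction.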
-+ * -+ * However, we should only do it if we did NOT already write this step -+ * above (which we did if do_x or do_f was true). -+ */ -+ do_x = !do_per_step(step, inputrec->nstxout); -+ do_f = !do_per_step(step, inputrec->nstfout); -+ write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, step, -+ &ems, state, f); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, LBFGS, inputrec->em_tol, step, converged, -+ number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); -+ print_converged(fplog, LBFGS, inputrec->em_tol, step, converged, -+ number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); -+ -+ fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ walltime_accounting_set_nsteps_done(walltime_accounting, step); -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_steep(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ const char *SD = "Steepest Descents"; -+ em_state_t *s_min, *s_try; -+ rvec *f_global; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ real stepsize, constepsize; -+ real ustep, fnormn; -+ gmx_mdoutf_t outf; -+ t_mdebin *mdebin; -+ gmx_bool bDone, bAbort, do_x, do_f; -+ tensor vir, pres; -+ rvec mu_tot; -+ int nsteps; -+ int count = 0; -+ int steps_accepted = 0; -+ /* not used */ -+ real terminate = 0; -+ -+ s_min = init_em_state(); -+ s_try = init_em_state(); -+ -+ /* Init em and store the local state in s_try */ -+ init_em(fplog, SD, cr, inputrec, -+ state_global, top_global, s_try, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle); -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, SD); -+ -+ /* Set variables for stepsize (in nm). This is the largest -+ * step that we are going to make in any direction. 
-+ */ -+ ustep = inputrec->em_stepsize; -+ stepsize = 0; -+ -+ /* Max number of steps */ -+ nsteps = inputrec->nsteps; -+ -+ if (MASTER(cr)) -+ { -+ /* Print to the screen */ -+ sp_header(stderr, SD, inputrec->em_tol, nsteps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, SD, inputrec->em_tol, nsteps); -+ } -+ -+ /**** HERE STARTS THE LOOP **** -+ * count is the counter for the number of steps -+ * bDone will be TRUE when the minimization has converged -+ * bAbort will be TRUE when nsteps steps have been performed or when -+ * the stepsize becomes smaller than is reasonable for machine precision -+ */ -+ count = 0; -+ bDone = FALSE; -+ bAbort = FALSE; -+ while (!bDone && !bAbort) -+ { -+ bAbort = (nsteps >= 0) && (count == nsteps); -+ -+ /* set new coordinates, except for first step */ -+ if (count > 0) -+ { -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, -+ s_min, stepsize, s_min->f, s_try, -+ constr, top, nrnb, wcycle, count); -+ } -+ -+ evaluate_energy(fplog, cr, -+ top_global, s_try, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, count, count == 0); -+ -+ if (MASTER(cr)) -+ { -+ print_ebin_header(fplog, count, count, s_try->s.lambda[efptFEP]); -+ } -+ -+ if (count == 0) -+ { -+ s_min->epot = s_try->epot + 1; -+ } -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", -+ count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, -+ (s_try->epot < s_min->epot) ? '\n' : '\r'); -+ } -+ -+ if (s_try->epot < s_min->epot) -+ { -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)count, -+ mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals, -+ s_try->s.box, NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ /* Prepare IMD energy record, if bIMD is TRUE. */ -+ IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE); -+ -+ print_ebin(mdoutf_get_fp_ene(outf), TRUE, -+ do_per_step(steps_accepted, inputrec->nstdisreout), -+ do_per_step(steps_accepted, inputrec->nstorireout), -+ fplog, count, count, eprNORMAL, TRUE, -+ mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ fflush(fplog); -+ } -+ } -+ -+ /* Now if the new energy is smaller than the previous... -+ * or if this is the first step! -+ * or if we did random steps! -+ */ -+ -+ if ( (count == 0) || (s_try->epot < s_min->epot) ) -+ { -+ steps_accepted++; -+ -+ /* Test whether the convergence criterion is met... */ -+ bDone = (s_try->fmax < inputrec->em_tol); -+ -+ /* Copy the arrays for force, positions and energy */ -+ /* The 'Min' array always holds the coords and forces of the minimal -+ sampled energy */ -+ swap_em_state(s_min, s_try); -+ if (count > 0) -+ { -+ ustep *= 1.2; -+ } -+ -+ /* Write to trn, if necessary */ -+ do_x = do_per_step(steps_accepted, inputrec->nstxout); -+ do_f = do_per_step(steps_accepted, inputrec->nstfout); -+ write_em_traj(fplog, cr, outf, do_x, do_f, NULL, -+ top_global, inputrec, count, -+ s_min, state_global, f_global); -+ } -+ else -+ { -+ /* If energy is not smaller make the step smaller... 
*/ -+ ustep *= 0.5; -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) -+ { -+ /* Reload the old state */ -+ em_dd_partition_system(fplog, count, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ } -+ -+ /* Determine new step */ -+ stepsize = ustep/s_min->fmax; -+ -+ /* Check if stepsize is too small, with 1 nm as a characteristic length */ -+#ifdef GMX_DOUBLE -+ if (count == nsteps || ustep < 1e-12) -+#else -+ if (count == nsteps || ustep < 1e-6) -+#endif -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, count == nsteps, constr != NULL); -+ warn_step(fplog, inputrec->em_tol, count == nsteps, constr != NULL); -+ } -+ bAbort = TRUE; -+ } -+ -+ /* Send IMD energies and positions, if bIMD is TRUE. */ -+ if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr)) -+ { -+ IMD_send_positions(inputrec->imd); -+ } -+ -+ count++; -+ } /* End of the loop */ -+ -+ /* IMD cleanup, if bIMD is TRUE. */ -+ IMD_finalize(inputrec->bIMD, inputrec->imd); -+ -+ /* Print some data... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, count, -+ s_min, state_global, f_global); -+ -+ fnormn = s_min->fnorm/sqrt(state_global->natoms); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ inputrec->nsteps = count; -+ -+ walltime_accounting_set_nsteps_done(walltime_accounting, count); -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_nm(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ const char *NM = "Normal Mode Analysis"; -+ gmx_mdoutf_t outf; -+ int natoms, atom, d; -+ int nnodes, node; -+ rvec *f_global; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ real t, t0, lambda, lam0; -+ gmx_bool bNS; -+ tensor vir, pres; -+ rvec mu_tot; -+ rvec *fneg, *dfdx; -+ gmx_bool bSparse; /* use sparse matrix storage format */ -+ size_t sz = 0; -+ gmx_sparsematrix_t * sparse_matrix = NULL; -+ real * full_matrix = NULL; -+ em_state_t * state_work; -+ -+ /* added with respect to mdrun */ -+ int i, j, k, row, col; -+ real der_range = 10.0*sqrt(GMX_REAL_EPS); -+ real x_min; -+ real fnorm, fmax; -+ -+ if (constr != NULL) -+ { -+ gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this 
combination is not supported"); -+ } -+ -+ state_work = init_em_state(); -+ -+ /* Init em and store the local state in state_minimum */ -+ init_em(fplog, NM, cr, inputrec, -+ state_global, top_global, state_work, &top, -+ &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, NULL, imdport, Flags, wcycle); -+ -+ natoms = top_global->natoms; -+ snew(fneg, natoms); -+ snew(dfdx, natoms); -+ -+#ifndef GMX_DOUBLE -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, -+ "NOTE: This version of Gromacs has been compiled in single precision,\n" -+ " which MIGHT not be accurate enough for normal mode analysis.\n" -+ " Gromacs now uses sparse matrix storage, so the memory requirements\n" -+ " are fairly modest even if you recompile in double precision.\n\n"); -+ } -+#endif -+ -+ /* Check if we can/should use sparse storage format. -+ * -+ * Sparse format is only useful when the Hessian itself is sparse, which it -+ * will be when we use a cutoff. -+ * For small systems (n<1000) it is easier to always use full matrix format, though. -+ */ -+ if (EEL_FULL(fr->eeltype) || fr->rlist == 0.0) -+ { -+ md_print_info(cr, fplog, "Non-cutoff electrostatics used, forcing full Hessian format.\n"); -+ bSparse = FALSE; -+ } -+ else if (top_global->natoms < 1000) -+ { -+ md_print_info(cr, fplog, "Small system size (N=%d), using full Hessian format.\n", top_global->natoms); -+ bSparse = FALSE; -+ } -+ else -+ { -+ md_print_info(cr, fplog, "Using compressed symmetric sparse Hessian format.\n"); -+ bSparse = TRUE; -+ } -+ -+ if (MASTER(cr)) -+ { -+ sz = DIM*top_global->natoms; -+ -+ fprintf(stderr, "Allocating Hessian memory...\n\n"); -+ -+ if (bSparse) -+ { -+ sparse_matrix = gmx_sparsematrix_init(sz); -+ sparse_matrix->compressed_symmetric = TRUE; -+ } -+ else -+ { -+ snew(full_matrix, sz*sz); -+ } -+ } -+ -+ /* Initial values */ -+ t0 = inputrec->init_t; -+ lam0 = inputrec->fepvals->init_lambda; -+ t = t0; -+ lambda = lam0; -+ -+ init_nrnb(nrnb); -+ -+ where(); -+ -+ /* Write start time and temperature */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, NM); -+ -+ /* fudge nr of steps to nr of atoms */ -+ inputrec->nsteps = natoms*2; -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n", -+ *(top_global->name), (int)inputrec->nsteps); -+ } -+ -+ nnodes = cr->nnodes; -+ -+ /* Make evaluate_energy do a single node force calculation */ -+ cr->nnodes = 1; -+ evaluate_energy(fplog, cr, -+ top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ cr->nnodes = nnodes; -+ -+ /* if forces are not small, warn user */ -+ get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, state_work); -+ -+ md_print_info(cr, fplog, "Maximum force:%12.5e\n", state_work->fmax); -+ if (state_work->fmax > 1.0e-3) -+ { -+ md_print_info(cr, fplog, -+ "The force is probably not small enough to " -+ "ensure that you are at a minimum.\n" -+ "Be aware that negative eigenvalues may occur\n" -+ "when the resulting matrix is diagonalized.\n\n"); -+ } -+ -+ /*********************************************************** -+ * -+ * Loop over all pairs in matrix -+ * -+ * do_force called twice. 
Once with positive and -+ * once with negative displacement -+ * -+ ************************************************************/ -+ -+ /* Steps are divided one by one over the nodes */ -+ for (atom = cr->nodeid; atom < natoms; atom += nnodes) -+ { -+ -+ for (d = 0; d < DIM; d++) -+ { -+ x_min = state_work->s.x[atom][d]; -+ -+ state_work->s.x[atom][d] = x_min - der_range; -+ -+ /* Make evaluate_energy do a single node force calculation */ -+ cr->nnodes = 1; -+ evaluate_energy(fplog, cr, -+ top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, atom*2, FALSE); -+ -+ for (i = 0; i < natoms; i++) -+ { -+ copy_rvec(state_work->f[i], fneg[i]); -+ } -+ -+ state_work->s.x[atom][d] = x_min + der_range; -+ -+ evaluate_energy(fplog, cr, -+ top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, atom*2+1, FALSE); -+ cr->nnodes = nnodes; -+ -+ /* x is restored to original */ -+ state_work->s.x[atom][d] = x_min; -+ -+ for (j = 0; j < natoms; j++) -+ { -+ for (k = 0; (k < DIM); k++) -+ { -+ dfdx[j][k] = -+ -(state_work->f[j][k] - fneg[j][k])/(2*der_range); -+ } -+ } -+ -+ if (!MASTER(cr)) -+ { -+#ifdef GMX_MPI -+#ifdef GMX_DOUBLE -+#define mpi_type MPI_DOUBLE -+#else -+#define mpi_type MPI_FLOAT -+#endif -+ MPI_Send(dfdx[0], natoms*DIM, mpi_type, MASTERNODE(cr), cr->nodeid, -+ cr->mpi_comm_mygroup); -+#endif -+ } -+ else -+ { -+ for (node = 0; (node < nnodes && atom+node < natoms); node++) -+ { -+ if (node > 0) -+ { -+#ifdef GMX_MPI -+ MPI_Status stat; -+ MPI_Recv(dfdx[0], natoms*DIM, mpi_type, node, node, -+ cr->mpi_comm_mygroup, &stat); -+#undef mpi_type -+#endif -+ } -+ -+ row = (atom + node)*DIM + d; -+ -+ for (j = 0; j < natoms; j++) -+ { -+ for (k = 0; k < DIM; k++) -+ { -+ col = j*DIM + k; -+ -+ if (bSparse) -+ { -+ if (col >= row && dfdx[j][k] != 0.0) -+ { -+ gmx_sparsematrix_increment_value(sparse_matrix, -+ row, col, dfdx[j][k]); -+ } -+ } -+ else -+ { -+ full_matrix[row*sz+col] = dfdx[j][k]; -+ } -+ } -+ } -+ } -+ } -+ -+ if (bVerbose && fplog) -+ { -+ fflush(fplog); -+ } -+ } -+ /* write progress */ -+ if (MASTER(cr) && bVerbose) -+ { -+ fprintf(stderr, "\rFinished step %d out of %d", -+ min(atom+nnodes, natoms), natoms); -+ fflush(stderr); -+ } -+ } -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\n\nWriting Hessian...\n"); -+ gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ walltime_accounting_set_nsteps_done(walltime_accounting, natoms*2); -+ -+ return 0; -+} -diff --git a/src/programs/mdrun/md.c b/src/programs/mdrun/md.c -index 3d98d59..b34d23c 100644 ---- a/src/programs/mdrun/md.c -+++ b/src/programs/mdrun/md.c -@@ -96,6 +96,12 @@ - #include "gromacs/swap/swapcoords.h" - #include "gromacs/imd/imd.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+/* END PLUMED */ -+ - #ifdef GMX_FAHCORE - #include "corewrap.h" - #endif -@@ -224,6 +230,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - /* Interactive MD */ - gmx_bool bIMDstep = FALSE; - -+ /* PLUMED */ -+ int plumedNeedsEnergy=0; -+ int plumedWantsToStop=0; -+ matrix plumed_vir; -+ /* END PLUMED */ -+ - #ifdef GMX_FAHCORE - /* Temporary addition for FAHCORE checkpointing */ - int chkpt_ret; -@@ -651,6 +663,48 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - 
fprintf(fplog, "\n"); - } - -+ /* PLUMED */ -+ if(plumedswitch){ -+ /* detect plumed API version */ -+ int pversion=0; -+ plumed_cmd(plumedmain,"getApiVersion",&pversion); -+ /* setting kbT is only implemented with api>1) */ -+ real kbT=ir->opts.ref_t[0]*BOLTZ; -+ if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT); -+ -+ if(cr->ms && cr->ms->nsim>1) { -+ if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all); -+ }else{ -+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); -+ } -+ } -+ plumed_cmd(plumedmain,"GREX init",NULL); -+ } -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); -+ } -+ } -+ plumed_cmd(plumedmain,"setNatoms",&top_global->natoms); -+ plumed_cmd(plumedmain,"setMDEngine","gromacs"); -+ plumed_cmd(plumedmain,"setLog",fplog); -+ real real_delta_t; -+ real_delta_t=ir->delta_t; -+ plumed_cmd(plumedmain,"setTimestep",&real_delta_t); -+ plumed_cmd(plumedmain,"init",NULL); -+ -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ } -+ } -+ /* END PLUMED */ -+ - walltime_accounting_start(walltime_accounting); - wallcycle_start(wcycle, ewcRUN); - print_start(fplog, cr, walltime_accounting, "mdrun"); -@@ -955,6 +1009,13 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - do_verbose && !bPMETuneRunning); - wallcycle_stop(wcycle, ewcDOMDEC); - /* If using an iterative integrator, reallocate space to match the decomposition */ -+ -+ /* PLUMED */ -+ if(plumedswitch){ -+ plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ /* END PLUMED */ - } - } - -@@ -1078,12 +1139,45 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - * This is parallellized as well, and does communication too. - * Check comments in sim_util.c - */ -+ -+ /* PLUMED */ -+ plumedNeedsEnergy=0; -+ if(plumedswitch){ -+ long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep); -+ plumed_cmd(plumedmain,"setPositions",&state->x[0][0]); -+ plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]); -+ plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]); -+ plumed_cmd(plumedmain,"setBox",&state->box[0][0]); -+ plumed_cmd(plumedmain,"prepareCalc",NULL); -+ plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop); -+ plumed_cmd(plumedmain,"setForces",&f[0][0]); -+ plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ clear_mat(plumed_vir); -+ plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]); -+ } -+ /* END PLUMED */ - do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, - state->box, state->x, &state->hist, - f, force_vir, mdatoms, enerd, fcd, - state->lambda, graph, - fr, vsite, mu_tot, t, mdoutf_get_fp_field(outf), ed, bBornRadii, - (bNS ? 
GMX_FORCE_NS : 0) | force_flags); -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(plumedNeedsEnergy){ -+ msmul(force_vir,2.0,plumed_vir); -+ plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]); -+ plumed_cmd(plumedmain,"performCalc",NULL); -+ msmul(plumed_vir,0.5,force_vir); -+ } else { -+ msmul(plumed_vir,0.5,plumed_vir); -+ m_add(force_vir,plumed_vir,force_vir); -+ } -+ if ((repl_ex_nst > 0) && (step > 0) && !bLastStep && -+ do_per_step(step,repl_ex_nst)) plumed_cmd(plumedmain,"GREX savePositions",NULL); -+ if(plumedWantsToStop) ir->nsteps=step_rel+1; -+ } -+ /* END PLUMED */ - } - - if (bVV && !bStartingFromCpt && !bRerunMD) -diff --git a/src/programs/mdrun/md.c.preplumed b/src/programs/mdrun/md.c.preplumed -new file mode 100644 -index 0000000..3d98d59 ---- /dev/null -+++ b/src/programs/mdrun/md.c.preplumed -@@ -0,0 +1,2058 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. -+ * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. 
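As a reading aid for the do_md() hunks above (the file beginning here is the pre-patch copy of md.c that the patch keeps for reference): the added code drives PLUMED entirely through plumed_cmd(). Below is a minimal serial sketch of that call sequence, restricted to command keys that appear in the hunks; plumed_create() and plumed_finalize() from the PLUMED C API stand in for the global plumedmain object used by the patch, and all variable names are illustrative.

    #include "Plumed.h"

    /* Sketch of the per-run and per-step PLUMED calls made by the patched
     * do_md(); serial, default (double) precision, illustrative names. */
    static void plumed_md_sketch(int natoms, int nsteps, double dt,
                                 double (*x)[3], double (*f)[3],
                                 double box[3][3], double vir[3][3],
                                 double *mass, double *charge)
    {
        plumed p            = plumed_create();
        int    needs_energy = 0;
        long   step;

        plumed_cmd(p, "setNatoms", &natoms);
        plumed_cmd(p, "setMDEngine", "gromacs");
        plumed_cmd(p, "setTimestep", &dt);
        plumed_cmd(p, "init", NULL);

        for (step = 0; step < nsteps; step++)
        {
            plumed_cmd(p, "setStepLong", &step);
            plumed_cmd(p, "setPositions", &x[0][0]);
            plumed_cmd(p, "setMasses", &mass[0]);
            plumed_cmd(p, "setCharges", &charge[0]);
            plumed_cmd(p, "setBox", &box[0][0]);
            plumed_cmd(p, "prepareCalc", NULL);
            plumed_cmd(p, "setForces", &f[0][0]);
            plumed_cmd(p, "isEnergyNeeded", &needs_energy);
            plumed_cmd(p, "setVirial", &vir[0][0]);
            /* ... the MD engine evaluates forces here; if needs_energy is
             * set, pass the potential with "setEnergy" before this call ... */
            plumed_cmd(p, "performCalc", NULL);
        }
        plumed_finalize(p);
    }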
-+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include "typedefs.h" -+#include "gromacs/utility/smalloc.h" -+#include "sysstuff.h" -+#include "vec.h" -+#include "vcm.h" -+#include "mdebin.h" -+#include "nrnb.h" -+#include "calcmu.h" -+#include "index.h" -+#include "vsite.h" -+#include "update.h" -+#include "ns.h" -+#include "mdrun.h" -+#include "md_support.h" -+#include "md_logging.h" -+#include "network.h" -+#include "xvgr.h" -+#include "physics.h" -+#include "names.h" -+#include "force.h" -+#include "disre.h" -+#include "orires.h" -+#include "pme.h" -+#include "mdatoms.h" -+#include "repl_ex.h" -+#include "deform.h" -+#include "qmmm.h" -+#include "domdec.h" -+#include "domdec_network.h" -+#include "gromacs/gmxlib/topsort.h" -+#include "coulomb.h" -+#include "constr.h" -+#include "shellfc.h" -+#include "gromacs/gmxpreprocess/compute_io.h" -+#include "checkpoint.h" -+#include "mtop_util.h" -+#include "sighandler.h" -+#include "txtdump.h" -+#include "gromacs/utility/cstringutil.h" -+#include "pme_loadbal.h" -+#include "bondf.h" -+#include "membed.h" -+#include "types/nlistheuristics.h" -+#include "types/iteratedconstraints.h" -+#include "nbnxn_cuda_data_mgmt.h" -+ -+#include "gromacs/utility/gmxmpi.h" -+#include "gromacs/fileio/confio.h" -+#include "gromacs/fileio/trajectory_writing.h" -+#include "gromacs/fileio/trnio.h" -+#include "gromacs/fileio/trxio.h" -+#include "gromacs/fileio/xtcio.h" -+#include "gromacs/timing/wallcycle.h" -+#include "gromacs/timing/walltime_accounting.h" -+#include "gromacs/pulling/pull.h" -+#include "gromacs/swap/swapcoords.h" -+#include "gromacs/imd/imd.h" -+ -+#ifdef GMX_FAHCORE -+#include "corewrap.h" -+#endif -+ -+static void reset_all_counters(FILE *fplog, t_commrec *cr, -+ gmx_int64_t step, -+ gmx_int64_t *step_rel, t_inputrec *ir, -+ gmx_wallcycle_t wcycle, t_nrnb *nrnb, -+ gmx_walltime_accounting_t walltime_accounting, -+ nbnxn_cuda_ptr_t cu_nbv) -+{ -+ char sbuf[STEPSTRSIZE]; -+ -+ /* Reset all the counters related to performance over the run */ -+ md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n", -+ gmx_step_str(step, sbuf)); -+ -+ if (cu_nbv) -+ { -+ nbnxn_cuda_reset_timings(cu_nbv); -+ } -+ -+ wallcycle_stop(wcycle, ewcRUN); -+ wallcycle_reset_all(wcycle); -+ if (DOMAINDECOMP(cr)) -+ { -+ reset_dd_statistics_counters(cr->dd); -+ } -+ init_nrnb(nrnb); -+ ir->init_step += *step_rel; -+ ir->nsteps -= *step_rel; -+ *step_rel = 0; -+ wallcycle_start(wcycle, ewcRUN); -+ walltime_accounting_start(walltime_accounting); -+ print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime()); -+} -+ -+double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], -+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, -+ int nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int stepout, t_inputrec *ir, -+ gmx_mtop_t *top_global, -+ t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t ed, t_forcerec *fr, -+ int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed, -+ real cpt_period, real max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ gmx_mdoutf_t outf = NULL; -+ gmx_int64_t step, step_rel; -+ double elapsed_time; -+ double t, t0, lam0[efptNR]; -+ gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEner; -+ gmx_bool bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE, -+ bFirstStep, bStateFromCP, 
bStateFromTPX, bInitStep, bLastStep, -+ bBornRadii, bStartingFromCpt; -+ gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; -+ gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE, -+ bForceUpdate = FALSE, bCPT; -+ gmx_bool bMasterState; -+ int force_flags, cglo_flags; -+ tensor force_vir, shake_vir, total_vir, tmp_vir, pres; -+ int i, m; -+ t_trxstatus *status; -+ rvec mu_tot; -+ t_vcm *vcm; -+ t_state *bufstate = NULL; -+ matrix *scale_tot, pcoupl_mu, M, ebox; -+ gmx_nlheur_t nlh; -+ t_trxframe rerun_fr; -+ gmx_repl_ex_t repl_ex = NULL; -+ int nchkpt = 1; -+ gmx_localtop_t *top; -+ t_mdebin *mdebin = NULL; -+ t_state *state = NULL; -+ rvec *f_global = NULL; -+ gmx_enerdata_t *enerd; -+ rvec *f = NULL; -+ gmx_global_stat_t gstat; -+ gmx_update_t upd = NULL; -+ t_graph *graph = NULL; -+ globsig_t gs; -+ gmx_groups_t *groups; -+ gmx_ekindata_t *ekind, *ekind_save; -+ gmx_shellfc_t shellfc; -+ int count, nconverged = 0; -+ real timestep = 0; -+ double tcount = 0; -+ gmx_bool bConverged = TRUE, bOK, bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition; -+ gmx_bool bAppend; -+ gmx_bool bResetCountersHalfMaxH = FALSE; -+ gmx_bool bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter; -+ gmx_bool bUpdateDoLR; -+ real dvdl_constr; -+ rvec *cbuf = NULL; -+ matrix lastbox; -+ real veta_save, scalevir, tracevir; -+ real vetanew = 0; -+ int lamnew = 0; -+ /* for FEP */ -+ int nstfep; -+ double cycles; -+ real saved_conserved_quantity = 0; -+ real last_ekin = 0; -+ int iter_i; -+ t_extmass MassQ; -+ int **trotter_seq; -+ char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; -+ int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/ -+ gmx_iterate_t iterate; -+ gmx_int64_t multisim_nsteps = -1; /* number of steps to do before first multisim -+ simulation stops. If equal to zero, don't -+ communicate any more between multisims.*/ -+ /* PME load balancing data for GPU kernels */ -+ pme_load_balancing_t pme_loadbal = NULL; -+ double cycles_pmes; -+ gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE; -+ -+ /* Interactive MD */ -+ gmx_bool bIMDstep = FALSE; -+ -+#ifdef GMX_FAHCORE -+ /* Temporary addition for FAHCORE checkpointing */ -+ int chkpt_ret; -+#endif -+ -+ /* Check for special mdrun options */ -+ bRerunMD = (Flags & MD_RERUN); -+ bAppend = (Flags & MD_APPENDFILES); -+ if (Flags & MD_RESETCOUNTERSHALFWAY) -+ { -+ if (ir->nsteps > 0) -+ { -+ /* Signal to reset the counters half the simulation steps. */ -+ wcycle_set_reset_counters(wcycle, ir->nsteps/2); -+ } -+ /* Signal to reset the counters halfway the simulation time. */ -+ bResetCountersHalfMaxH = (max_hours > 0); -+ } -+ -+ /* md-vv uses averaged full step velocities for T-control -+ md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control) -+ md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */ -+ bVV = EI_VV(ir->eI); -+ if (bVV) /* to store the initial velocities while computing virial */ -+ { -+ snew(cbuf, top_global->natoms); -+ } -+ /* all the iteratative cases - only if there are constraints */ -+ bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD)); -+ gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to -+ false in this step. 
The correct value, true or false, -+ is set at each step, as it depends on the frequency of temperature -+ and pressure control.*/ -+ bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir))); -+ -+ if (bRerunMD) -+ { -+ /* Since we don't know if the frames read are related in any way, -+ * rebuild the neighborlist at every step. -+ */ -+ ir->nstlist = 1; -+ ir->nstcalcenergy = 1; -+ nstglobalcomm = 1; -+ } -+ -+ check_ir_old_tpx_versions(cr, fplog, ir, top_global); -+ -+ nstglobalcomm = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir); -+ bGStatEveryStep = (nstglobalcomm == 1); -+ -+ if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL) -+ { -+ fprintf(fplog, -+ "To reduce the energy communication with nstlist = -1\n" -+ "the neighbor list validity should not be checked at every step,\n" -+ "this means that exact integration is not guaranteed.\n" -+ "The neighbor list validity is checked after:\n" -+ " - 2*std.dev.(n.list life time) steps.\n" -+ "In most cases this will result in exact integration.\n" -+ "This reduces the energy communication by a factor of 2 to 3.\n" -+ "If you want less energy communication, set nstlist > 3.\n\n"); -+ } -+ -+ if (bRerunMD) -+ { -+ ir->nstxout_compressed = 0; -+ } -+ groups = &top_global->groups; -+ -+ /* Initial values */ -+ init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda, -+ &(state_global->fep_state), lam0, -+ nrnb, top_global, &upd, -+ nfile, fnm, &outf, &mdebin, -+ force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags, wcycle); -+ -+ clear_mat(total_vir); -+ clear_mat(pres); -+ /* Energy terms and groups */ -+ snew(enerd, 1); -+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, -+ enerd); -+ if (DOMAINDECOMP(cr)) -+ { -+ f = NULL; -+ } -+ else -+ { -+ snew(f, top_global->natoms); -+ } -+ -+ /* Kinetic energy data */ -+ snew(ekind, 1); -+ init_ekindata(fplog, top_global, &(ir->opts), ekind); -+ /* needed for iteration of constraints */ -+ snew(ekind_save, 1); -+ init_ekindata(fplog, top_global, &(ir->opts), ekind_save); -+ /* Copy the cos acceleration to the groups struct */ -+ ekind->cosacc.cos_accel = ir->cos_accel; -+ -+ gstat = global_stat_init(ir); -+ debug_gmx(); -+ -+ /* Check for polarizable models and flexible constraints */ -+ shellfc = init_shell_flexcon(fplog, -+ top_global, n_flexible_constraints(constr), -+ (ir->bContinuation || -+ (DOMAINDECOMP(cr) && !MASTER(cr))) ? 
-+ NULL : state_global->x); -+ if (shellfc && ir->nstcalcenergy != 1) -+ { -+ gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy); -+ } -+ if (shellfc && DOMAINDECOMP(cr)) -+ { -+ gmx_fatal(FARGS, "Shell particles are not implemented with domain decomposition, use a single rank"); -+ } -+ if (shellfc && ir->eI == eiNM) -+ { -+ /* Currently shells don't work with Normal Modes */ -+ gmx_fatal(FARGS, "Normal Mode analysis is not supported with shells.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n"); -+ } -+ -+ if (vsite && ir->eI == eiNM) -+ { -+ /* Currently virtual sites don't work with Normal Modes */ -+ gmx_fatal(FARGS, "Normal Mode analysis is not supported with virtual sites.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n"); -+ } -+ -+ if (DEFORM(*ir)) -+ { -+ tMPI_Thread_mutex_lock(&deform_init_box_mutex); -+ set_deform_reference_box(upd, -+ deform_init_init_step_tpx, -+ deform_init_box_tpx); -+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex); -+ } -+ -+ { -+ double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1); -+ if ((io > 2000) && MASTER(cr)) -+ { -+ fprintf(stderr, -+ "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", -+ io); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ top = dd_init_local_top(top_global); -+ -+ snew(state, 1); -+ dd_init_local_state(cr->dd, state_global, state); -+ -+ if (DDMASTER(cr->dd) && ir->nstfout) -+ { -+ snew(f_global, state_global->natoms); -+ } -+ } -+ else -+ { -+ top = gmx_mtop_generate_local_top(top_global, ir); -+ -+ forcerec_set_excl_load(fr, top); -+ -+ state = serial_init_local_state(state_global); -+ f_global = f; -+ -+ atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms); -+ -+ if (vsite) -+ { -+ set_vsite_top(vsite, top, mdatoms, cr); -+ } -+ -+ if (ir->ePBC != epbcNONE && !fr->bMolPBC) -+ { -+ graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE); -+ } -+ -+ if (shellfc) -+ { -+ make_local_shells(cr, mdatoms, shellfc); -+ } -+ -+ setup_bonded_threading(fr, &top->idef); -+ } -+ -+ /* Set up interactive MD (IMD) */ -+ init_IMD(ir, cr, top_global, fplog, ir->nstcalcenergy, state_global->x, -+ nfile, fnm, oenv, imdport, Flags); -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ /* Distribute the charge groups over the nodes from the master node */ -+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, FALSE); -+ -+ } -+ -+ update_mdatoms(mdatoms, state->lambda[efptMASS]); -+ -+ if (opt2bSet("-cpi", nfile, fnm)) -+ { -+ bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr); -+ } -+ else -+ { -+ bStateFromCP = FALSE; -+ } -+ -+ if (ir->bExpanded) -+ { -+ init_expanded_ensemble(bStateFromCP, ir, &state->dfhist); -+ } -+ -+ if (MASTER(cr)) -+ { -+ if (bStateFromCP) -+ { -+ /* Update mdebin with energy history if appending to output files */ -+ if (Flags & MD_APPENDFILES) -+ { -+ restore_energyhistory_from_state(mdebin, &state_global->enerhist); -+ } -+ else -+ { -+ /* We might have read an energy history from checkpoint, -+ * free the allocated memory and reset the counts. 
-+ */ -+ done_energyhistory(&state_global->enerhist); -+ init_energyhistory(&state_global->enerhist); -+ } -+ } -+ /* Set the initial energy history in state by updating once */ -+ update_energyhistory(&state_global->enerhist, mdebin); -+ } -+ -+ /* Initialize constraints */ -+ if (constr && !DOMAINDECOMP(cr)) -+ { -+ set_constraints(constr, top, ir, mdatoms, cr); -+ } -+ -+ if (repl_ex_nst > 0 && MASTER(cr)) -+ { -+ repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir, -+ repl_ex_nst, repl_ex_nex, repl_ex_seed); -+ } -+ -+ /* PME tuning is only supported with GPUs or PME nodes and not with rerun. -+ * PME tuning is not supported with PME only for LJ and not for Coulomb. -+ */ -+ if ((Flags & MD_TUNEPME) && -+ EEL_PME(fr->eeltype) && -+ ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) && -+ !bRerunMD) -+ { -+ pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata); -+ cycles_pmes = 0; -+ if (cr->duty & DUTY_PME) -+ { -+ /* Start tuning right away, as we can't measure the load */ -+ bPMETuneRunning = TRUE; -+ } -+ else -+ { -+ /* Separate PME nodes, we can measure the PP/PME load balance */ -+ bPMETuneTry = TRUE; -+ } -+ } -+ -+ if (!ir->bContinuation && !bRerunMD) -+ { -+ if (mdatoms->cFREEZE && (state->flags & (1<homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m]) -+ { -+ state->v[i][m] = 0; -+ } -+ } -+ } -+ } -+ -+ if (constr) -+ { -+ /* Constrain the initial coordinates and velocities */ -+ do_constrain_first(fplog, constr, ir, mdatoms, state, -+ cr, nrnb, fr, top); -+ } -+ if (vsite) -+ { -+ /* Construct the virtual sites for the initial configuration */ -+ construct_vsites(vsite, state->x, ir->delta_t, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ } -+ } -+ -+ debug_gmx(); -+ -+ /* set free energy calculation frequency as the minimum -+ greatest common denominator of nstdhdl, nstexpanded, and repl_ex_nst*/ -+ nstfep = ir->fepvals->nstdhdl; -+ if (ir->bExpanded) -+ { -+ nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl, nstfep); -+ } -+ if (repl_ex_nst > 0) -+ { -+ nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep); -+ } -+ -+ /* I'm assuming we need global communication the first time! MRS */ -+ cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT -+ | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0) -+ | (bVV ? CGLO_PRESSURE : 0) -+ | (bVV ? CGLO_CONSTRAINT : 0) -+ | (bRerunMD ? CGLO_RERUNMD : 0) -+ | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0)); -+ -+ bSumEkinhOld = FALSE; -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, cglo_flags); -+ if (ir->eI == eiVVAK) -+ { -+ /* a second call to get the half step temperature initialized as well */ -+ /* we do the same call as above, but turn the pressure off -- internally to -+ compute_globals, this is recognized as a velocity verlet half-step -+ kinetic energy calculation. 
This minimized excess variables, but -+ perhaps loses some logic?*/ -+ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE)); -+ } -+ -+ /* Calculate the initial half step temperature, and save the ekinh_old */ -+ if (!(Flags & MD_STARTFROMCPT)) -+ { -+ for (i = 0; (i < ir->opts.ngtc); i++) -+ { -+ copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old); -+ } -+ } -+ if (ir->eI != eiVV) -+ { -+ enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step, -+ and there is no previous step */ -+ } -+ -+ /* if using an iterative algorithm, we need to create a working directory for the state. */ -+ if (bIterativeCase) -+ { -+ bufstate = init_bufstate(state); -+ } -+ -+ /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter -+ temperature control */ -+ trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); -+ -+ if (MASTER(cr)) -+ { -+ if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS) -+ { -+ fprintf(fplog, -+ "RMS relative constraint deviation after constraining: %.2e\n", -+ constr_rmsd(constr, FALSE)); -+ } -+ if (EI_STATE_VELOCITY(ir->eI)) -+ { -+ fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]); -+ } -+ if (bRerunMD) -+ { -+ fprintf(stderr, "starting md rerun '%s', reading coordinates from" -+ " input trajectory '%s'\n\n", -+ *(top_global->name), opt2fn("-rerun", nfile, fnm)); -+ if (bVerbose) -+ { -+ fprintf(stderr, "Calculated time to finish depends on nsteps from " -+ "run input file,\nwhich may not correspond to the time " -+ "needed to process input trajectory.\n\n"); -+ } -+ } -+ else -+ { -+ char tbuf[20]; -+ fprintf(stderr, "starting mdrun '%s'\n", -+ *(top_global->name)); -+ if (ir->nsteps >= 0) -+ { -+ sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); -+ } -+ else -+ { -+ sprintf(tbuf, "%s", "infinite"); -+ } -+ if (ir->init_step > 0) -+ { -+ fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", -+ gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, -+ gmx_step_str(ir->init_step, sbuf2), -+ ir->init_step*ir->delta_t); -+ } -+ else -+ { -+ fprintf(stderr, "%s steps, %s ps.\n", -+ gmx_step_str(ir->nsteps, sbuf), tbuf); -+ } -+ } -+ fprintf(fplog, "\n"); -+ } -+ -+ walltime_accounting_start(walltime_accounting); -+ wallcycle_start(wcycle, ewcRUN); -+ print_start(fplog, cr, walltime_accounting, "mdrun"); -+ -+ /* safest point to do file checkpointing is here. 
More general point would be immediately before integrator call */ -+#ifdef GMX_FAHCORE -+ chkpt_ret = fcCheckPointParallel( cr->nodeid, -+ NULL, 0); -+ if (chkpt_ret == 0) -+ { -+ gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); -+ } -+#endif -+ -+ debug_gmx(); -+ /*********************************************************** -+ * -+ * Loop over MD steps -+ * -+ ************************************************************/ -+ -+ /* if rerunMD then read coordinates and velocities from input trajectory */ -+ if (bRerunMD) -+ { -+ if (getenv("GMX_FORCE_UPDATE")) -+ { -+ bForceUpdate = TRUE; -+ } -+ -+ rerun_fr.natoms = 0; -+ if (MASTER(cr)) -+ { -+ bNotLastFrame = read_first_frame(oenv, &status, -+ opt2fn("-rerun", nfile, fnm), -+ &rerun_fr, TRX_NEED_X | TRX_READ_V); -+ if (rerun_fr.natoms != top_global->natoms) -+ { -+ gmx_fatal(FARGS, -+ "Number of atoms in trajectory (%d) does not match the " -+ "run input file (%d)\n", -+ rerun_fr.natoms, top_global->natoms); -+ } -+ if (ir->ePBC != epbcNONE) -+ { -+ if (!rerun_fr.bBox) -+ { -+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time); -+ } -+ if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong)) -+ { -+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time); -+ } -+ } -+ } -+ -+ if (PAR(cr)) -+ { -+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); -+ } -+ -+ if (ir->ePBC != epbcNONE) -+ { -+ /* Set the shift vectors. -+ * Necessary here when have a static box different from the tpr box. -+ */ -+ calc_shifts(rerun_fr.box, fr->shift_vec); -+ } -+ } -+ -+ /* loop over MD steps or if rerunMD to end of input trajectory */ -+ bFirstStep = TRUE; -+ /* Skip the first Nose-Hoover integration when we get the state from tpx */ -+ bStateFromTPX = !bStateFromCP; -+ bInitStep = bFirstStep && (bStateFromTPX || bVV); -+ bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep; -+ bLastStep = FALSE; -+ bSumEkinhOld = FALSE; -+ bDoReplEx = FALSE; -+ bExchanged = FALSE; -+ bNeedRepartition = FALSE; -+ -+ init_global_signals(&gs, cr, ir, repl_ex_nst); -+ -+ step = ir->init_step; -+ step_rel = 0; -+ -+ if (ir->nstlist == -1) -+ { -+ init_nlistheuristics(&nlh, bGStatEveryStep, step); -+ } -+ -+ if (MULTISIM(cr) && (repl_ex_nst <= 0 )) -+ { -+ /* check how many steps are left in other sims */ -+ multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps); -+ } -+ -+ -+ /* and stop now if we should */ -+ bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) || -+ ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps ))); -+ while (!bLastStep || (bRerunMD && bNotLastFrame)) -+ { -+ -+ wallcycle_start(wcycle, ewcSTEP); -+ -+ if (bRerunMD) -+ { -+ if (rerun_fr.bStep) -+ { -+ step = rerun_fr.step; -+ step_rel = step - ir->init_step; -+ } -+ if (rerun_fr.bTime) -+ { -+ t = rerun_fr.time; -+ } -+ else -+ { -+ t = step; -+ } -+ } -+ else -+ { -+ bLastStep = (step_rel == ir->nsteps); -+ t = t0 + step*ir->delta_t; -+ } -+ -+ if (ir->efep != efepNO || ir->bSimTemp) -+ { -+ /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value, -+ requiring different logic. 
*/ -+ -+ set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0); -+ bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl); -+ bDoFEP = (do_per_step(step, nstfep) && (ir->efep != efepNO)); -+ bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) -+ && (ir->bExpanded) && (step > 0) && (!bStartingFromCpt)); -+ } -+ -+ bDoReplEx = ((repl_ex_nst > 0) && (step > 0) && !bLastStep && -+ do_per_step(step, repl_ex_nst)); -+ -+ if (bSimAnn) -+ { -+ update_annealing_target_temp(&(ir->opts), t); -+ } -+ -+ if (bRerunMD) -+ { -+ if (!DOMAINDECOMP(cr) || MASTER(cr)) -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(rerun_fr.x[i], state_global->x[i]); -+ } -+ if (rerun_fr.bV) -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(rerun_fr.v[i], state_global->v[i]); -+ } -+ } -+ else -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ clear_rvec(state_global->v[i]); -+ } -+ if (bRerunWarnNoV) -+ { -+ fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n" -+ " Ekin, temperature and pressure are incorrect,\n" -+ " the virial will be incorrect when constraints are present.\n" -+ "\n"); -+ bRerunWarnNoV = FALSE; -+ } -+ } -+ } -+ copy_mat(rerun_fr.box, state_global->box); -+ copy_mat(state_global->box, state->box); -+ -+ if (vsite && (Flags & MD_RERUN_VSITE)) -+ { -+ if (DOMAINDECOMP(cr)) -+ { -+ gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented with domain decomposition, use a single rank"); -+ } -+ if (graph) -+ { -+ /* Following is necessary because the graph may get out of sync -+ * with the coordinates if we only have every N'th coordinate set -+ */ -+ mk_mshift(fplog, graph, fr->ePBC, state->box, state->x); -+ shift_self(graph, state->box, state->x); -+ } -+ construct_vsites(vsite, state->x, ir->delta_t, state->v, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ if (graph) -+ { -+ unshift_self(graph, state->box, state->x); -+ } -+ } -+ } -+ -+ /* Stop Center of Mass motion */ -+ bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm)); -+ -+ if (bRerunMD) -+ { -+ /* for rerun MD always do Neighbour Searching */ -+ bNS = (bFirstStep || ir->nstlist != 0); -+ bNStList = bNS; -+ } -+ else -+ { -+ /* Determine whether or not to do Neighbour Searching and LR */ -+ bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); -+ -+ bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP || -+ (ir->nstlist == -1 && nlh.nabnsb > 0)); -+ -+ if (bNS && ir->nstlist == -1) -+ { -+ set_nlistheuristics(&nlh, bFirstStep || bExchanged || bNeedRepartition || bDoFEP, step); -+ } -+ } -+ -+ /* check whether we should stop because another simulation has -+ stopped. 
*/ -+ if (MULTISIM(cr)) -+ { -+ if ( (multisim_nsteps >= 0) && (step_rel >= multisim_nsteps) && -+ (multisim_nsteps != ir->nsteps) ) -+ { -+ if (bNS) -+ { -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, -+ "Stopping simulation %d because another one has finished\n", -+ cr->ms->sim); -+ } -+ bLastStep = TRUE; -+ gs.sig[eglsCHKPT] = 1; -+ } -+ } -+ } -+ -+ /* < 0 means stop at next step, > 0 means stop at next NS step */ -+ if ( (gs.set[eglsSTOPCOND] < 0) || -+ ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) ) -+ { -+ bLastStep = TRUE; -+ } -+ -+ /* Determine whether or not to update the Born radii if doing GB */ -+ bBornRadii = bFirstStep; -+ if (ir->implicit_solvent && (step % ir->nstgbradii == 0)) -+ { -+ bBornRadii = TRUE; -+ } -+ -+ do_log = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep; -+ do_verbose = bVerbose && -+ (step % stepout == 0 || bFirstStep || bLastStep); -+ -+ if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD)) -+ { -+ if (bRerunMD) -+ { -+ bMasterState = TRUE; -+ } -+ else -+ { -+ bMasterState = FALSE; -+ /* Correct the new box if it is too skewed */ -+ if (DYNAMIC_BOX(*ir)) -+ { -+ if (correct_box(fplog, step, state->box, graph)) -+ { -+ bMasterState = TRUE; -+ } -+ } -+ if (DOMAINDECOMP(cr) && bMasterState) -+ { -+ dd_collect_state(cr->dd, state, state_global); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ /* Repartition the domain decomposition */ -+ wallcycle_start(wcycle, ewcDOMDEC); -+ dd_partition_system(fplog, step, cr, -+ bMasterState, nstglobalcomm, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, -+ do_verbose && !bPMETuneRunning); -+ wallcycle_stop(wcycle, ewcDOMDEC); -+ /* If using an iterative integrator, reallocate space to match the decomposition */ -+ } -+ } -+ -+ if (MASTER(cr) && do_log) -+ { -+ print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */ -+ } -+ -+ if (ir->efep != efepNO) -+ { -+ update_mdatoms(mdatoms, state->lambda[efptMASS]); -+ } -+ -+ if ((bRerunMD && rerun_fr.bV) || bExchanged) -+ { -+ -+ /* We need the kinetic energy at minus the half step for determining -+ * the full step kinetic energy and possibly for T-coupling.*/ -+ /* This may not be quite working correctly yet . . . . */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); -+ } -+ clear_mat(force_vir); -+ -+ /* We write a checkpoint at this MD step when: -+ * either at an NS step when we signalled through gs, -+ * or at the last step (but not when we do not want confout), -+ * but never at the first step or with rerun. -+ */ -+ bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) || -+ (bLastStep && (Flags & MD_CONFOUT))) && -+ step > ir->init_step && !bRerunMD); -+ if (bCPT) -+ { -+ gs.set[eglsCHKPT] = 0; -+ } -+ -+ /* Determine the energy and pressure: -+ * at nstcalcenergy steps and at energy output steps (set below). -+ */ -+ if (EI_VV(ir->eI) && (!bInitStep)) -+ { -+ /* for vv, the first half of the integration actually corresponds -+ to the previous step. bCalcEner is only required to be evaluated on the 'next' step, -+ but the virial needs to be calculated on both the current step and the 'next' step. Future -+ reorganization may be able to get rid of one of the bCalcVir=TRUE steps. 
*/ -+ -+ bCalcEner = do_per_step(step-1, ir->nstcalcenergy); -+ bCalcVir = bCalcEner || -+ (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple))); -+ } -+ else -+ { -+ bCalcEner = do_per_step(step, ir->nstcalcenergy); -+ bCalcVir = bCalcEner || -+ (ir->epc != epcNO && do_per_step(step, ir->nstpcouple)); -+ } -+ -+ /* Do we need global communication ? */ -+ bGStat = (bCalcVir || bCalcEner || bStopCM || -+ do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) || -+ (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck)); -+ -+ do_ene = (do_per_step(step, ir->nstenergy) || bLastStep); -+ -+ if (do_ene || do_log || bDoReplEx) -+ { -+ bCalcVir = TRUE; -+ bCalcEner = TRUE; -+ bGStat = TRUE; -+ } -+ -+ /* these CGLO_ options remain the same throughout the iteration */ -+ cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) | -+ (bGStat ? CGLO_GSTAT : 0) -+ ); -+ -+ force_flags = (GMX_FORCE_STATECHANGED | -+ ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) | -+ GMX_FORCE_ALLFORCES | -+ GMX_FORCE_SEPLRF | -+ (bCalcVir ? GMX_FORCE_VIRIAL : 0) | -+ (bCalcEner ? GMX_FORCE_ENERGY : 0) | -+ (bDoFEP ? GMX_FORCE_DHDL : 0) -+ ); -+ -+ if (fr->bTwinRange) -+ { -+ if (do_per_step(step, ir->nstcalclr)) -+ { -+ force_flags |= GMX_FORCE_DO_LR; -+ } -+ } -+ -+ if (shellfc) -+ { -+ /* Now is the time to relax the shells */ -+ count = relax_shell_flexcon(fplog, cr, bVerbose, step, -+ ir, bNS, force_flags, -+ top, -+ constr, enerd, fcd, -+ state, f, force_vir, mdatoms, -+ nrnb, wcycle, graph, groups, -+ shellfc, fr, bBornRadii, t, mu_tot, -+ &bConverged, vsite, -+ mdoutf_get_fp_field(outf)); -+ tcount += count; -+ -+ if (bConverged) -+ { -+ nconverged++; -+ } -+ } -+ else -+ { -+ /* The coordinates (x) are shifted (to get whole molecules) -+ * in do_force. -+ * This is parallellized as well, and does communication too. -+ * Check comments in sim_util.c -+ */ -+ do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, -+ state->box, state->x, &state->hist, -+ f, force_vir, mdatoms, enerd, fcd, -+ state->lambda, graph, -+ fr, vsite, mu_tot, t, mdoutf_get_fp_field(outf), ed, bBornRadii, -+ (bNS ? GMX_FORCE_NS : 0) | force_flags); -+ } -+ -+ if (bVV && !bStartingFromCpt && !bRerunMD) -+ /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ -+ { -+ wallcycle_start(wcycle, ewcUPDATE); -+ if (ir->eI == eiVV && bInitStep) -+ { -+ /* if using velocity verlet with full time step Ekin, -+ * take the first half step only to compute the -+ * virial for the first step. From there, -+ * revert back to the initial coordinates -+ * so that the input is actually the initial step. -+ */ -+ copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */ -+ } -+ else -+ { -+ /* this is for NHC in the Ekin(t+dt/2) version of vv */ -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); -+ } -+ -+ /* If we are using twin-range interactions where the long-range component -+ * is only evaluated every nstcalclr>1 steps, we should do a special update -+ * step to combine the long-range forces on these steps. -+ * For nstcalclr=1 this is not done, since the forces would have been added -+ * directly to the short-range forces already. -+ * -+ * TODO Remove various aspects of VV+twin-range in master -+ * branch, because VV integrators did not ever support -+ * twin-range multiple time stepping with constraints. 
-+ */ -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, -+ f, bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, bInitStep, etrtVELOCITY1, -+ cr, nrnb, constr, &top->idef); -+ -+ if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep) -+ { -+ gmx_iterate_init(&iterate, TRUE); -+ } -+ /* for iterations, we save these vectors, as we will be self-consistently iterating -+ the calculations */ -+ -+ /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */ -+ -+ /* save the state */ -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); -+ } -+ -+ bFirstIterate = TRUE; -+ while (bFirstIterate || iterate.bIterationActive) -+ { -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); -+ if (bFirstIterate && bTrotter) -+ { -+ /* The first time through, we need a decent first estimate -+ of veta(t+dt) to compute the constraints. Do -+ this by computing the box volume part of the -+ trotter integration at this time. Nothing else -+ should be changed by this routine here. If -+ !(first time), we start with the previous value -+ of veta. */ -+ -+ veta_save = state->veta; -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0); -+ vetanew = state->veta; -+ state->veta = veta_save; -+ } -+ } -+ -+ bOK = TRUE; -+ if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. */ -+ { -+ wallcycle_stop(wcycle, ewcUPDATE); -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, shake_vir, -+ cr, nrnb, wcycle, upd, constr, -+ TRUE, bCalcVir, vetanew); -+ wallcycle_start(wcycle, ewcUPDATE); -+ -+ if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1) -+ { -+ /* Correct the virial for multiple time stepping */ -+ m_sub(shake_vir, fr->vir_twin_constr, shake_vir); -+ } -+ -+ if (!bOK) -+ { -+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); -+ } -+ -+ } -+ else if (graph) -+ { -+ /* Need to unshift here if a do_force has been -+ called in the previous step */ -+ unshift_self(graph, state->box, state->x); -+ } -+ -+ /* if VV, compute the pressure and constraints */ -+ /* For VV2, we strictly only need this if using pressure -+ * control, but we really would like to have accurate pressures -+ * printed out. -+ * Think about ways around this in the future? -+ * For now, keep this choice in comments. -+ */ -+ /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */ -+ /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/ -+ bPres = TRUE; -+ bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK)); -+ if (bCalcEner && ir->eI == eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/ -+ { -+ bSumEkinhOld = TRUE; -+ } -+ /* for vv, the first half of the integration actually corresponds to the previous step. -+ So we need information from the last step in the first half of the integration */ -+ if (bGStat || do_per_step(step-1, nstglobalcomm)) -+ { -+ wallcycle_stop(wcycle, ewcUPDATE); -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ cglo_flags -+ | CGLO_ENERGY -+ | (bTemp ? CGLO_TEMPERATURE : 0) -+ | (bPres ? 
CGLO_PRESSURE : 0) -+ | (bPres ? CGLO_CONSTRAINT : 0) -+ | ((iterate.bIterationActive) ? CGLO_ITERATE : 0) -+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0) -+ | CGLO_SCALEEKIN -+ ); -+ /* explanation of above: -+ a) We compute Ekin at the full time step -+ if 1) we are using the AveVel Ekin, and it's not the -+ initial step, or 2) if we are using AveEkin, but need the full -+ time step kinetic energy for the pressure (always true now, since we want accurate statistics). -+ b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in -+ EkinAveVel because it's needed for the pressure */ -+ wallcycle_start(wcycle, ewcUPDATE); -+ } -+ /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ -+ if (!bInitStep) -+ { -+ if (bTrotter) -+ { -+ m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */ -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2); -+ } -+ else -+ { -+ if (bExchanged) -+ { -+ wallcycle_stop(wcycle, ewcUPDATE); -+ /* We need the kinetic energy at minus the half step for determining -+ * the full step kinetic energy and possibly for T-coupling.*/ -+ /* This may not be quite working correctly yet . . . . */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); -+ wallcycle_start(wcycle, ewcUPDATE); -+ } -+ } -+ } -+ -+ if (iterate.bIterationActive && -+ done_iterating(cr, fplog, step, &iterate, bFirstIterate, -+ state->veta, &vetanew)) -+ { -+ break; -+ } -+ bFirstIterate = FALSE; -+ } -+ -+ if (bTrotter && !bInitStep) -+ { -+ copy_mat(shake_vir, state->svir_prev); -+ copy_mat(force_vir, state->fvir_prev); -+ if (IR_NVT_TROTTER(ir) && ir->eI == eiVV) -+ { -+ /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */ -+ enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE); -+ enerd->term[F_EKIN] = trace(ekind->ekin); -+ } -+ } -+ /* if it's the initial step, we performed this first step just to get the constraint virial */ -+ if (bInitStep && ir->eI == eiVV) -+ { -+ copy_rvecn(cbuf, state->v, 0, state->natoms); -+ } -+ wallcycle_stop(wcycle, ewcUPDATE); -+ } -+ -+ /* MRS -- now done iterating -- compute the conserved quantity */ -+ if (bVV) -+ { -+ saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ); -+ if (ir->eI == eiVV) -+ { -+ last_ekin = enerd->term[F_EKIN]; -+ } -+ if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) -+ { -+ saved_conserved_quantity -= enerd->term[F_DISPCORR]; -+ } -+ /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */ -+ if (!bRerunMD) -+ { -+ sum_dhdl(enerd, state->lambda, ir->fepvals); -+ } -+ } -+ -+ /* ######## END FIRST UPDATE STEP ############## */ -+ /* ######## If doing VV, we now have v(dt) ###### */ -+ if (bDoExpanded) -+ { -+ /* perform extended ensemble sampling in lambda - we don't -+ actually move to the new state before outputting -+ statistics, but if performing simulated tempering, we -+ do update the velocities and the tau_t. 
*/ -+ -+ lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, state->v, mdatoms); -+ /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */ -+ copy_df_history(&state_global->dfhist, &state->dfhist); -+ } -+ -+ /* Now we have the energies and forces corresponding to the -+ * coordinates at time t. We must output all of this before -+ * the update. -+ */ -+ do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, -+ ir, state, state_global, top_global, fr, -+ outf, mdebin, ekind, f, f_global, -+ &nchkpt, -+ bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT), -+ bSumEkinhOld); -+ /* Check if IMD step and do IMD communication, if bIMD is TRUE. */ -+ bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, state->x, ir, t, wcycle); -+ -+ /* kludge -- virial is lost with restart for NPT control. Must restart */ -+ if (bStartingFromCpt && bVV) -+ { -+ copy_mat(state->svir_prev, shake_vir); -+ copy_mat(state->fvir_prev, force_vir); -+ } -+ -+ elapsed_time = walltime_accounting_get_current_elapsed_time(walltime_accounting); -+ -+ /* Check whether everything is still allright */ -+ if (((int)gmx_get_stop_condition() > handled_stop_condition) -+#ifdef GMX_THREAD_MPI -+ && MASTER(cr) -+#endif -+ ) -+ { -+ /* this is just make gs.sig compatible with the hack -+ of sending signals around by MPI_Reduce with together with -+ other floats */ -+ if (gmx_get_stop_condition() == gmx_stop_cond_next_ns) -+ { -+ gs.sig[eglsSTOPCOND] = 1; -+ } -+ if (gmx_get_stop_condition() == gmx_stop_cond_next) -+ { -+ gs.sig[eglsSTOPCOND] = -1; -+ } -+ /* < 0 means stop at next step, > 0 means stop at next NS step */ -+ if (fplog) -+ { -+ fprintf(fplog, -+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n", -+ gmx_get_signal_name(), -+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : ""); -+ fflush(fplog); -+ } -+ fprintf(stderr, -+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n", -+ gmx_get_signal_name(), -+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : ""); -+ fflush(stderr); -+ handled_stop_condition = (int)gmx_get_stop_condition(); -+ } -+ else if (MASTER(cr) && (bNS || ir->nstlist <= 0) && -+ (max_hours > 0 && elapsed_time > max_hours*60.0*60.0*0.99) && -+ gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0) -+ { -+ /* Signal to terminate the run */ -+ gs.sig[eglsSTOPCOND] = 1; -+ if (fplog) -+ { -+ fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); -+ } -+ fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); -+ } -+ -+ if (bResetCountersHalfMaxH && MASTER(cr) && -+ elapsed_time > max_hours*60.0*60.0*0.495) -+ { -+ gs.sig[eglsRESETCOUNTERS] = 1; -+ } -+ -+ if (ir->nstlist == -1 && !bRerunMD) -+ { -+ /* When bGStatEveryStep=FALSE, global_stat is only called -+ * when we check the atom displacements, not at NS steps. -+ * This means that also the bonded interaction count check is not -+ * performed immediately after NS. Therefore a few MD steps could -+ * be performed with missing interactions. -+ * But wrong energies are never written to file, -+ * since energies are only written after global_stat -+ * has been called. -+ */ -+ if (step >= nlh.step_nscheck) -+ { -+ nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs, -+ nlh.scale_tot, state->x); -+ } -+ else -+ { -+ /* This is not necessarily true, -+ * but step_nscheck is determined quite conservatively. 
-+ */ -+ nlh.nabnsb = 0; -+ } -+ } -+ -+ /* In parallel we only have to check for checkpointing in steps -+ * where we do global communication, -+ * otherwise the other nodes don't know. -+ */ -+ if (MASTER(cr) && ((bGStat || !PAR(cr)) && -+ cpt_period >= 0 && -+ (cpt_period == 0 || -+ elapsed_time >= nchkpt*cpt_period*60.0)) && -+ gs.set[eglsCHKPT] == 0) -+ { -+ gs.sig[eglsCHKPT] = 1; -+ } -+ -+ /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */ -+ if (EI_VV(ir->eI)) -+ { -+ if (!bInitStep) -+ { -+ update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); -+ } -+ if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */ -+ { -+ gmx_bool bIfRandomize; -+ bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr); -+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ -+ if (constr && bIfRandomize) -+ { -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, tmp_vir, -+ cr, nrnb, wcycle, upd, constr, -+ TRUE, bCalcVir, vetanew); -+ } -+ } -+ } -+ -+ if (bIterativeCase && do_per_step(step, ir->nstpcouple)) -+ { -+ gmx_iterate_init(&iterate, TRUE); -+ /* for iterations, we save these vectors, as we will be redoing the calculations */ -+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); -+ } -+ -+ bFirstIterate = TRUE; -+ while (bFirstIterate || iterate.bIterationActive) -+ { -+ /* We now restore these vectors to redo the calculation with improved extended variables */ -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); -+ } -+ -+ /* We make the decision to break or not -after- the calculation of Ekin and Pressure, -+ so scroll down for that logic */ -+ -+ /* ######### START SECOND UPDATE STEP ################# */ -+ /* Box is changed in update() when we do pressure coupling, -+ * but we should still use the old box for energy corrections and when -+ * writing it to the energy file, so it matches the trajectory files for -+ * the same timestep above. Make a copy in a separate array. -+ */ -+ copy_mat(state->box, lastbox); -+ -+ bOK = TRUE; -+ dvdl_constr = 0; -+ -+ if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate)) -+ { -+ wallcycle_start(wcycle, ewcUPDATE); -+ /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ -+ if (bTrotter) -+ { -+ if (iterate.bIterationActive) -+ { -+ if (bFirstIterate) -+ { -+ scalevir = 1; -+ } -+ else -+ { -+ /* we use a new value of scalevir to converge the iterations faster */ -+ scalevir = tracevir/trace(shake_vir); -+ } -+ msmul(shake_vir, scalevir, shake_vir); -+ m_add(force_vir, shake_vir, total_vir); -+ clear_mat(shake_vir); -+ } -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3); -+ /* We can only do Berendsen coupling after we have summed -+ * the kinetic energy or virial. Since the happens -+ * in global_state after update, we should only do it at -+ * step % nstlist = 1 with bGStatEveryStep=FALSE. -+ */ -+ } -+ else -+ { -+ update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); -+ update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep); -+ } -+ -+ if (bVV) -+ { -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ /* velocity half-step update */ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? 
&fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, FALSE, etrtVELOCITY2, -+ cr, nrnb, constr, &top->idef); -+ } -+ -+ /* Above, initialize just copies ekinh into ekin, -+ * it doesn't copy position (for VV), -+ * and entire integrator for MD. -+ */ -+ -+ if (ir->eI == eiVVAK) -+ { -+ copy_rvecn(state->x, cbuf, 0, state->natoms); -+ } -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); -+ wallcycle_stop(wcycle, ewcUPDATE); -+ -+ update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state, -+ fr->bMolPBC, graph, f, -+ &top->idef, shake_vir, -+ cr, nrnb, wcycle, upd, constr, -+ FALSE, bCalcVir, state->veta); -+ -+ if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1) -+ { -+ /* Correct the virial for multiple time stepping */ -+ m_sub(shake_vir, fr->vir_twin_constr, shake_vir); -+ } -+ -+ if (ir->eI == eiVVAK) -+ { -+ /* erase F_EKIN and F_TEMP here? */ -+ /* just compute the kinetic energy at the half step to perform a trotter step */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, lastbox, -+ top_global, &bSumEkinhOld, -+ cglo_flags | CGLO_TEMPERATURE -+ ); -+ wallcycle_start(wcycle, ewcUPDATE); -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4); -+ /* now we know the scaling, we can compute the positions again again */ -+ copy_rvecn(cbuf, state->x, 0, state->natoms); -+ -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); -+ wallcycle_stop(wcycle, ewcUPDATE); -+ -+ /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */ -+ /* are the small terms in the shake_vir here due -+ * to numerical errors, or are they important -+ * physically? I'm thinking they are just errors, but not completely sure. -+ * For now, will call without actually constraining, constr=NULL*/ -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, tmp_vir, -+ cr, nrnb, wcycle, upd, NULL, -+ FALSE, bCalcVir, -+ state->veta); -+ } -+ if (!bOK) -+ { -+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); -+ } -+ -+ if (fr->bSepDVDL && fplog && do_log) -+ { -+ gmx_print_sepdvdl(fplog, "Constraint dV/dl", 0.0, dvdl_constr); -+ } -+ if (bVV) -+ { -+ /* this factor or 2 correction is necessary -+ because half of the constraint force is removed -+ in the vv step, so we have to double it. See -+ the Redmine issue #1255. It is not yet clear -+ if the factor of 2 is exact, or just a very -+ good approximation, and this will be -+ investigated. The next step is to see if this -+ can be done adding a dhdl contribution from the -+ rattle step, but this is somewhat more -+ complicated with the current code. Will be -+ investigated, hopefully for 4.6.3. However, -+ this current solution is much better than -+ having it completely wrong. 
-+ */ -+ enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr; -+ } -+ else -+ { -+ enerd->term[F_DVDL_CONSTR] += dvdl_constr; -+ } -+ } -+ else if (graph) -+ { -+ /* Need to unshift here */ -+ unshift_self(graph, state->box, state->x); -+ } -+ -+ if (vsite != NULL) -+ { -+ wallcycle_start(wcycle, ewcVSITECONSTR); -+ if (graph != NULL) -+ { -+ shift_self(graph, state->box, state->x); -+ } -+ construct_vsites(vsite, state->x, ir->delta_t, state->v, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ -+ if (graph != NULL) -+ { -+ unshift_self(graph, state->box, state->x); -+ } -+ wallcycle_stop(wcycle, ewcVSITECONSTR); -+ } -+ -+ /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */ -+ /* With Leap-Frog we can skip compute_globals at -+ * non-communication steps, but we need to calculate -+ * the kinetic energy one step before communication. -+ */ -+ if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm))) -+ { -+ if (ir->nstlist == -1 && bFirstIterate) -+ { -+ gs.sig[eglsNABNSB] = nlh.nabnsb; -+ } -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, -+ bFirstIterate ? &gs : NULL, -+ (step_rel % gs.nstms == 0) && -+ (multisim_nsteps < 0 || (step_rel < multisim_nsteps)), -+ lastbox, -+ top_global, &bSumEkinhOld, -+ cglo_flags -+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0) -+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) -+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) -+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) -+ | (iterate.bIterationActive ? CGLO_ITERATE : 0) -+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0) -+ | CGLO_CONSTRAINT -+ ); -+ if (ir->nstlist == -1 && bFirstIterate) -+ { -+ nlh.nabnsb = gs.set[eglsNABNSB]; -+ gs.set[eglsNABNSB] = 0; -+ } -+ } -+ /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */ -+ /* ############# END CALC EKIN AND PRESSURE ################# */ -+ -+ /* Note: this is OK, but there are some numerical precision issues with using the convergence of -+ the virial that should probably be addressed eventually. state->veta has better properies, -+ but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could -+ generate the new shake_vir, but test the veta value for convergence. This will take some thought. */ -+ -+ if (iterate.bIterationActive && -+ done_iterating(cr, fplog, step, &iterate, bFirstIterate, -+ trace(shake_vir), &tracevir)) -+ { -+ break; -+ } -+ bFirstIterate = FALSE; -+ } -+ -+ if (!bVV || bRerunMD) -+ { -+ /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */ -+ sum_dhdl(enerd, state->lambda, ir->fepvals); -+ } -+ update_box(fplog, step, ir, mdatoms, state, f, -+ ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, upd); -+ -+ /* ################# END UPDATE STEP 2 ################# */ -+ /* #### We now have r(t+dt) and v(t+dt/2) ############# */ -+ -+ /* The coordinates (x) were unshifted in update */ -+ if (!bGStat) -+ { -+ /* We will not sum ekinh_old, -+ * so signal that we still have to do it. 
-+ */ -+ bSumEkinhOld = TRUE; -+ } -+ -+ /* ######### BEGIN PREPARING EDR OUTPUT ########### */ -+ -+ /* use the directly determined last velocity, not actually the averaged half steps */ -+ if (bTrotter && ir->eI == eiVV) -+ { -+ enerd->term[F_EKIN] = last_ekin; -+ } -+ enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN]; -+ -+ if (bVV) -+ { -+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity; -+ } -+ else -+ { -+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ); -+ } -+ /* ######### END PREPARING EDR OUTPUT ########### */ -+ -+ /* Output stuff */ -+ if (MASTER(cr)) -+ { -+ gmx_bool do_dr, do_or; -+ -+ if (fplog && do_log && bDoExpanded) -+ { -+ /* only needed if doing expanded ensemble */ -+ PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL, -+ &state_global->dfhist, state->fep_state, ir->nstlog, step); -+ } -+ if (!(bStartingFromCpt && (EI_VV(ir->eI)))) -+ { -+ if (bCalcEner) -+ { -+ upd_mdebin(mdebin, bDoDHDL, TRUE, -+ t, mdatoms->tmass, enerd, state, -+ ir->fepvals, ir->expandedvals, lastbox, -+ shake_vir, force_vir, total_vir, pres, -+ ekind, mu_tot, constr); -+ } -+ else -+ { -+ upd_mdebin_step(mdebin); -+ } -+ -+ do_dr = do_per_step(step, ir->nstdisreout); -+ do_or = do_per_step(step, ir->nstorireout); -+ -+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : NULL, -+ step, t, -+ eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts)); -+ } -+ if (ir->ePull != epullNO) -+ { -+ pull_print_output(ir->pull, step, t); -+ } -+ -+ if (do_per_step(step, ir->nstlog)) -+ { -+ if (fflush(fplog) != 0) -+ { -+ gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?"); -+ } -+ } -+ } -+ if (bDoExpanded) -+ { -+ /* Have to do this part _after_ outputting the logfile and the edr file */ -+ /* Gets written into the state at the beginning of next loop*/ -+ state->fep_state = lamnew; -+ } -+ /* Print the remaining wall clock time for the run */ -+ if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning) -+ { -+ if (shellfc) -+ { -+ fprintf(stderr, "\n"); -+ } -+ print_time(stderr, walltime_accounting, step, ir, cr); -+ } -+ -+ /* Ion/water position swapping. -+ * Not done in last step since trajectory writing happens before this call -+ * in the MD loop and exchanges would be lost anyway. */ -+ bNeedRepartition = FALSE; -+ if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && -+ do_per_step(step, ir->swap->nstswap)) -+ { -+ bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle, -+ bRerunMD ? rerun_fr.x : state->x, -+ bRerunMD ? 
rerun_fr.box : state->box, -+ top_global, MASTER(cr) && bVerbose, bRerunMD); -+ -+ if (bNeedRepartition && DOMAINDECOMP(cr)) -+ { -+ dd_collect_state(cr->dd, state, state_global); -+ } -+ } -+ -+ /* Replica exchange */ -+ bExchanged = FALSE; -+ if (bDoReplEx) -+ { -+ bExchanged = replica_exchange(fplog, cr, repl_ex, -+ state_global, enerd, -+ state, step, t); -+ } -+ -+ if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) ) -+ { -+ dd_partition_system(fplog, step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, FALSE); -+ } -+ -+ bFirstStep = FALSE; -+ bInitStep = FALSE; -+ bStartingFromCpt = FALSE; -+ -+ /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */ -+ /* With all integrators, except VV, we need to retain the pressure -+ * at the current step for coupling at the next step. -+ */ -+ if ((state->flags & (1<nstpcouple > 0 && step % ir->nstpcouple == 0))) -+ { -+ /* Store the pressure in t_state for pressure coupling -+ * at the next MD step. -+ */ -+ copy_mat(pres, state->pres_prev); -+ } -+ -+ /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */ -+ -+ if ( (membed != NULL) && (!bLastStep) ) -+ { -+ rescale_membed(step_rel, membed, state_global->x); -+ } -+ -+ if (bRerunMD) -+ { -+ if (MASTER(cr)) -+ { -+ /* read next frame from input trajectory */ -+ bNotLastFrame = read_next_frame(oenv, status, &rerun_fr); -+ } -+ -+ if (PAR(cr)) -+ { -+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); -+ } -+ } -+ -+ if (!bRerunMD || !rerun_fr.bStep) -+ { -+ /* increase the MD step number */ -+ step++; -+ step_rel++; -+ } -+ -+ cycles = wallcycle_stop(wcycle, ewcSTEP); -+ if (DOMAINDECOMP(cr) && wcycle) -+ { -+ dd_cycles_add(cr->dd, cycles, ddCyclStep); -+ } -+ -+ if (bPMETuneRunning || bPMETuneTry) -+ { -+ /* PME grid + cut-off optimization with GPUs or PME nodes */ -+ -+ /* Count the total cycles over the last steps */ -+ cycles_pmes += cycles; -+ -+ /* We can only switch cut-off at NS steps */ -+ if (step % ir->nstlist == 0) -+ { -+ /* PME grid + cut-off optimization with GPUs or PME nodes */ -+ if (bPMETuneTry) -+ { -+ if (DDMASTER(cr->dd)) -+ { -+ /* PME node load is too high, start tuning */ -+ bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05); -+ } -+ dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning); -+ -+ if (bPMETuneRunning && -+ fr->nbv->bUseGPU && DOMAINDECOMP(cr) && -+ !(cr->duty & DUTY_PME)) -+ { -+ /* Lock DLB=auto to off (does nothing when DLB=yes/no). -+ * With GPUs + separate PME ranks, we don't want DLB. -+ * This could happen when we scan coarse grids and -+ * it would then never be turned off again. -+ * This would hurt performance at the final, optimal -+ * grid spacing, where DLB almost never helps. -+ * Also, DLB can limit the cut-off for PME tuning. -+ */ -+ dd_dlb_set_lock(cr->dd, TRUE); -+ } -+ -+ if (bPMETuneRunning || step_rel > ir->nstlist*50) -+ { -+ bPMETuneTry = FALSE; -+ } -+ } -+ if (bPMETuneRunning) -+ { -+ /* init_step might not be a multiple of nstlist, -+ * but the first cycle is always skipped anyhow. -+ */ -+ bPMETuneRunning = -+ pme_load_balance(pme_loadbal, cr, -+ (bVerbose && MASTER(cr)) ? 
stderr : NULL, -+ fplog, -+ ir, state, cycles_pmes, -+ fr->ic, fr->nbv, &fr->pmedata, -+ step); -+ -+ /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ -+ fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q; -+ fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; -+ fr->rlist = fr->ic->rlist; -+ fr->rlistlong = fr->ic->rlistlong; -+ fr->rcoulomb = fr->ic->rcoulomb; -+ fr->rvdw = fr->ic->rvdw; -+ -+ if (ir->eDispCorr != edispcNO) -+ { -+ calc_enervirdiff(NULL, ir->eDispCorr, fr); -+ } -+ -+ if (!bPMETuneRunning && -+ DOMAINDECOMP(cr) && -+ dd_dlb_is_locked(cr->dd)) -+ { -+ /* Unlock the DLB=auto, DLB is allowed to activate -+ * (but we don't expect it to activate in most cases). -+ */ -+ dd_dlb_set_lock(cr->dd, FALSE); -+ } -+ } -+ cycles_pmes = 0; -+ } -+ } -+ -+ if (step_rel == wcycle_get_reset_counters(wcycle) || -+ gs.set[eglsRESETCOUNTERS] != 0) -+ { -+ /* Reset all the counters related to performance over the run */ -+ reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting, -+ fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL); -+ wcycle_set_reset_counters(wcycle, -1); -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell our PME node to reset its counters */ -+ gmx_pme_send_resetcounters(cr, step); -+ } -+ /* Correct max_hours for the elapsed time */ -+ max_hours -= elapsed_time/(60.0*60.0); -+ bResetCountersHalfMaxH = FALSE; -+ gs.set[eglsRESETCOUNTERS] = 0; -+ } -+ -+ /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */ -+ IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle); -+ -+ } -+ /* End of main MD loop */ -+ debug_gmx(); -+ -+ /* Closing TNG files can include compressing data. Therefore it is good to do that -+ * before stopping the time measurements. */ -+ mdoutf_tng_close(outf); -+ -+ /* Stop measuring walltime */ -+ walltime_accounting_end(walltime_accounting); -+ -+ if (bRerunMD && MASTER(cr)) -+ { -+ close_trj(status); -+ } -+ -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell the PME only node to finish */ -+ gmx_pme_send_finish(cr); -+ } -+ -+ if (MASTER(cr)) -+ { -+ if (ir->nstcalcenergy > 0 && !bRerunMD) -+ { -+ print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t, -+ eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts)); -+ } -+ } -+ -+ done_mdoutf(outf); -+ debug_gmx(); -+ -+ if (ir->nstlist == -1 && nlh.nns > 0 && fplog) -+ { -+ fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns))); -+ fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns); -+ } -+ -+ if (pme_loadbal != NULL) -+ { -+ pme_loadbal_done(pme_loadbal, cr, fplog, -+ fr->nbv != NULL && fr->nbv->bUseGPU); -+ } -+ -+ if (shellfc && fplog) -+ { -+ fprintf(fplog, "Fraction of iterations that converged: %.2f %%\n", -+ (nconverged*100.0)/step_rel); -+ fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n", -+ tcount/step_rel); -+ } -+ -+ if (repl_ex_nst > 0 && MASTER(cr)) -+ { -+ print_replica_exchange_statistics(fplog, repl_ex); -+ } -+ -+ /* IMD cleanup, if bIMD is TRUE. 
*/ -+ IMD_finalize(ir->bIMD, ir->imd); -+ -+ walltime_accounting_set_nsteps_done(walltime_accounting, step_rel); -+ -+ return 0; -+} -diff --git a/src/programs/mdrun/mdrun.cpp b/src/programs/mdrun/mdrun.cpp -index 6bac3f0..e9fbf48 100644 ---- a/src/programs/mdrun/mdrun.cpp -+++ b/src/programs/mdrun/mdrun.cpp -@@ -55,6 +55,12 @@ - - #include "gromacs/commandline/pargs.h" - #include "gromacs/fileio/filenm.h" -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+extern void(*plumedcmd)(plumed,const char*,const void*); -+/* END PLUMED */ - - int gmx_mdrun(int argc, char *argv[]) - { -@@ -428,6 +434,7 @@ int gmx_mdrun(int argc, char *argv[]) - { efMTX, "-mtx", "nm", ffOPTWR }, - { efNDX, "-dn", "dipole", ffOPTWR }, - { efRND, "-multidir", NULL, ffOPTRDMULT}, -+ { efDAT, "-plumed", "plumed", ffOPTRD }, /* PLUMED */ - { efDAT, "-membed", "membed", ffOPTRD }, - { efTOP, "-mp", "membed", ffOPTRD }, - { efNDX, "-mn", "membed", ffOPTRD }, -@@ -780,6 +787,32 @@ int gmx_mdrun(int argc, char *argv[]) - ddxyz[YY] = (int)(realddxyz[YY] + 0.5); - ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); - -+ /* PLUMED */ -+ plumedswitch=0; -+ if (opt2bSet("-plumed",NFILE,fnm)) plumedswitch=1; -+ if(plumedswitch){ -+ plumedcmd=plumed_cmd; -+ int plumed_is_there=0; -+ int real_precision=sizeof(real); -+ real energyUnits=1.0; -+ real lengthUnits=1.0; -+ real timeUnits=1.0; -+ -+ if(!plumed_installed()){ -+ gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable."); -+ } -+ plumedmain=plumed_create(); -+ plumed_cmd(plumedmain,"setRealPrecision",&real_precision); -+ // this is not necessary for gromacs units: -+ plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits); -+ plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits); -+ plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits); -+ // -+ plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,NFILE,fnm)); -+ plumedswitch=1; -+ } -+ /* END PLUMED */ -+ - rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact, - nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr, - dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz, -@@ -788,6 +821,12 @@ int gmx_mdrun(int argc, char *argv[]) - nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed, - pforce, cpt_period, max_hours, deviceOptions, imdport, Flags); - -+ /* PLUMED */ -+ if(plumedswitch){ -+ plumed_finalize(plumedmain); -+ } -+ /* END PLUMED */ -+ - /* Log file has to be closed in mdrunner if we are appending to it - (fplog not set here) */ - if (MASTER(cr) && !bAppendFiles) -diff --git a/src/programs/mdrun/mdrun.cpp.preplumed b/src/programs/mdrun/mdrun.cpp.preplumed -new file mode 100644 -index 0000000..6bac3f0 ---- /dev/null -+++ b/src/programs/mdrun/mdrun.cpp.preplumed -@@ -0,0 +1,799 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. -+ * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. 
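The mdrun.cpp hunk above drives PLUMED purely through the C interface declared in Plumed.h: plumed_installed() probes for a kernel, plumed_create()/plumed_finalize() manage the handle, and plumed_cmd() passes settings by pointer. A minimal standalone sketch of that initialization sequence, assuming a PLUMED installation whose Plumed.h is on the include path and an illustrative plumed.dat input file (the unit and precision values mirror what the patch passes for GROMACS):

#include <stdio.h>
#include "Plumed.h" /* PLUMED C interface; this include path is an assumption */

int main(void)
{
    int    real_precision = (int) sizeof(double); /* the patch passes sizeof(real) */
    double energyUnits    = 1.0; /* 1.0: GROMACS' kJ/mol, nm, ps already match PLUMED's defaults */
    double lengthUnits    = 1.0;
    double timeUnits      = 1.0;

    if (!plumed_installed())
    {
        fprintf(stderr, "No PLUMED kernel found; check the PLUMED_KERNEL variable.\n");
        return 1;
    }

    plumed p = plumed_create();
    plumed_cmd(p, "setRealPrecision", &real_precision);
    plumed_cmd(p, "setMDEnergyUnits", &energyUnits);
    plumed_cmd(p, "setMDLengthUnits", &lengthUnits);
    plumed_cmd(p, "setMDTimeUnits",   &timeUnits);
    plumed_cmd(p, "setPlumedDat",     "plumed.dat"); /* illustrative file name */
    /* A full MD engine would also pass the number of atoms, the timestep and
     * per-step coordinates/forces before asking PLUMED for the bias; only the
     * command-line/initialization half of the hook is visible in this hunk. */
    plumed_finalize(p);
    return 0;
}

In the patch itself this block is gated by plumedswitch, which is set only when the new -plumed command-line option is given, so runs that never pass -plumed behave exactly like unpatched mdrun.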
-+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#include "mdrun_main.h" -+ -+#ifdef HAVE_CONFIG_H -+#include "config.h" -+#endif -+ -+#include -+ -+#include "gromacs/legacyheaders/checkpoint.h" -+#include "gromacs/legacyheaders/copyrite.h" -+#include "gromacs/legacyheaders/gmx_fatal.h" -+#include "gromacs/legacyheaders/macros.h" -+#include "gromacs/legacyheaders/main.h" -+#include "gromacs/legacyheaders/mdrun.h" -+#include "gromacs/legacyheaders/network.h" -+#include "gromacs/legacyheaders/readinp.h" -+#include "gromacs/legacyheaders/typedefs.h" -+#include "gromacs/legacyheaders/types/commrec.h" -+ -+#include "gromacs/commandline/pargs.h" -+#include "gromacs/fileio/filenm.h" -+ -+int gmx_mdrun(int argc, char *argv[]) -+{ -+ const char *desc[] = { -+ "[THISMODULE] is the main computational chemistry engine", -+ "within GROMACS. Obviously, it performs Molecular Dynamics simulations,", -+ "but it can also perform Stochastic Dynamics, Energy Minimization,", -+ "test particle insertion or (re)calculation of energies.", -+ "Normal mode analysis is another option. 
In this case [TT]mdrun[tt]", -+ "builds a Hessian matrix from single conformation.", -+ "For usual Normal Modes-like calculations, make sure that", -+ "the structure provided is properly energy-minimized.", -+ "The generated matrix can be diagonalized by [gmx-nmeig].[PAR]", -+ "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])", -+ "and distributes the topology over ranks if needed.", -+ "[TT]mdrun[tt] produces at least four output files.", -+ "A single log file ([TT]-g[tt]) is written, unless the option", -+ "[TT]-seppot[tt] is used, in which case each rank writes a log file.", -+ "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and", -+ "optionally forces.", -+ "The structure file ([TT]-c[tt]) contains the coordinates and", -+ "velocities of the last step.", -+ "The energy file ([TT]-e[tt]) contains energies, the temperature,", -+ "pressure, etc, a lot of these things are also printed in the log file.", -+ "Optionally coordinates can be written to a compressed trajectory file", -+ "([TT]-x[tt]).[PAR]", -+ "The option [TT]-dhdl[tt] is only used when free energy calculation is", -+ "turned on.[PAR]", -+ "A simulation can be run in parallel using two different parallelization", -+ "schemes: MPI parallelization and/or OpenMP thread parallelization.", -+ "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is", -+ "compiled with a normal MPI library or threads when [TT]mdrun[tt] is", -+ "compiled with the GROMACS built-in thread-MPI library. OpenMP threads", -+ "are supported when [TT]mdrun[tt] is compiled with OpenMP. Full OpenMP support", -+ "is only available with the Verlet cut-off scheme, with the (older)", -+ "group scheme only PME-only ranks can use OpenMP parallelization.", -+ "In all cases [TT]mdrun[tt] will by default try to use all the available", -+ "hardware resources. With a normal MPI library only the options", -+ "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],", -+ "for PME-only ranks, can be used to control the number of threads.", -+ "With thread-MPI there are additional options [TT]-nt[tt], which sets", -+ "the total number of threads, and [TT]-ntmpi[tt], which sets the number", -+ "of thread-MPI threads.", -+ "The number of OpenMP threads used by [TT]mdrun[tt] can also be set with", -+ "the standard environment variable, [TT]OMP_NUM_THREADS[tt].", -+ "The [TT]GMX_PME_NUM_THREADS[tt] environment variable can be used to specify", -+ "the number of threads used by the PME-only ranks.[PAR]", -+ "Note that combined MPI+OpenMP parallelization is in many cases", -+ "slower than either on its own. However, at high parallelization, using the", -+ "combination is often beneficial as it reduces the number of domains and/or", -+ "the number of MPI ranks. (Less and larger domains can improve scaling,", -+ "with separate PME ranks, using fewer MPI ranks reduces communication costs.)", -+ "OpenMP-only parallelization is typically faster than MPI-only parallelization", -+ "on a single CPU(-die). Since we currently don't have proper hardware", -+ "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only", -+ "automatically use OpenMP-only parallelization when you use up to 4", -+ "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16", -+ "threads with Intel Sandy Bridge or newer CPUs. 
Otherwise MPI-only", -+ "parallelization is used (except with GPUs, see below).", -+ "[PAR]", -+ "To quickly test the performance of the new Verlet cut-off scheme", -+ "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use", -+ "the [TT]-testverlet[tt] option. This should not be used for production,", -+ "since it can slightly modify potentials and it will remove charge groups", -+ "making analysis difficult, as the [TT].tpr[tt] file will still contain", -+ "charge groups. For production simulations it is highly recommended", -+ "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.", -+ "[PAR]", -+ "With GPUs (only supported with the Verlet cut-off scheme), the number", -+ "of GPUs should match the number of particle-particle ranks, i.e.", -+ "excluding PME-only ranks. With thread-MPI, unless set on the command line, the number", -+ "of MPI threads will automatically be set to the number of GPUs detected.", -+ "To use a subset of the available GPUs, or to manually provide a mapping of", -+ "GPUs to PP ranks, you can use the [TT]-gpu_id[tt] option. The argument of [TT]-gpu_id[tt] is", -+ "a string of digits (without delimiter) representing device id-s of the GPUs to be used.", -+ "For example, \"[TT]02[tt]\" specifies using GPUs 0 and 2 in the first and second PP ranks per compute node", -+ "respectively. To select different sets of GPU-s", -+ "on different nodes of a compute cluster, use the [TT]GMX_GPU_ID[tt] environment", -+ "variable instead. The format for [TT]GMX_GPU_ID[tt] is identical to ", -+ "[TT]-gpu_id[tt], with the difference that an environment variable can have", -+ "different values on different compute nodes. Multiple MPI ranks on each node", -+ "can share GPUs. This is accomplished by specifying the id(s) of the GPU(s)", -+ "multiple times, e.g. \"[TT]0011[tt]\" for four ranks sharing two GPUs in this node.", -+ "This works within a single simulation, or a multi-simulation, with any form of MPI.", -+ "[PAR]", -+ "With the Verlet cut-off scheme and verlet-buffer-tolerance set,", -+ "the pair-list update interval nstlist can be chosen freely with", -+ "the option [TT]-nstlist[tt]. [TT]mdrun[tt] will then adjust", -+ "the pair-list cut-off to maintain accuracy, and not adjust nstlist.", -+ "Otherwise, by default, [TT]mdrun[tt] will try to increase the", -+ "value of nstlist set in the [TT].mdp[tt] file to improve the", -+ "performance. For CPU-only runs, nstlist might increase to 20, for", -+ "GPU runs up to 40. For medium to high parallelization or with", -+ "fast GPUs, a (user-supplied) larger nstlist value can give much", -+ "better performance.", -+ "[PAR]", -+ "When using PME with separate PME ranks or with a GPU, the two major", -+ "compute tasks, the non-bonded force calculation and the PME calculation", -+ "run on different compute resources. If this load is not balanced,", -+ "some of the resources will be idle part of time. With the Verlet", -+ "cut-off scheme this load is automatically balanced when the PME load", -+ "is too high (but not when it is too low). This is done by scaling", -+ "the Coulomb cut-off and PME grid spacing by the same amount. In the first", -+ "few hundred steps different settings are tried and the fastest is chosen", -+ "for the rest of the simulation. This does not affect the accuracy of", -+ "the results, but it does affect the decomposition of the Coulomb energy", -+ "into particle and mesh contributions. 
The auto-tuning can be turned off", -+ "with the option [TT]-notunepme[tt].", -+ "[PAR]", -+ "[TT]mdrun[tt] pins (sets affinity of) threads to specific cores,", -+ "when all (logical) cores on a compute node are used by [TT]mdrun[tt],", -+ "even when no multi-threading is used,", -+ "as this usually results in significantly better performance.", -+ "If the queuing systems or the OpenMP library pinned threads, we honor", -+ "this and don't pin again, even though the layout may be sub-optimal.", -+ "If you want to have [TT]mdrun[tt] override an already set thread affinity", -+ "or pin threads when using less cores, use [TT]-pin on[tt].", -+ "With SMT (simultaneous multithreading), e.g. Intel Hyper-Threading,", -+ "there are multiple logical cores per physical core.", -+ "The option [TT]-pinstride[tt] sets the stride in logical cores for", -+ "pinning consecutive threads. Without SMT, 1 is usually the best choice.", -+ "With Intel Hyper-Threading 2 is best when using half or less of the", -+ "logical cores, 1 otherwise. The default value of 0 do exactly that:", -+ "it minimizes the threads per logical core, to optimize performance.", -+ "If you want to run multiple [TT]mdrun[tt] jobs on the same physical node," -+ "you should set [TT]-pinstride[tt] to 1 when using all logical cores.", -+ "When running multiple [TT]mdrun[tt] (or other) simulations on the same physical", -+ "node, some simulations need to start pinning from a non-zero core", -+ "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify", -+ "the offset in logical cores for pinning.", -+ "[PAR]", -+ "When [TT]mdrun[tt] is started with more than 1 rank,", -+ "parallelization with domain decomposition is used.", -+ "[PAR]", -+ "With domain decomposition, the spatial decomposition can be set", -+ "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.", -+ "The user only needs to change this when the system is very inhomogeneous.", -+ "Dynamic load balancing is set with the option [TT]-dlb[tt],", -+ "which can give a significant performance improvement,", -+ "especially for inhomogeneous systems. The only disadvantage of", -+ "dynamic load balancing is that runs are no longer binary reproducible,", -+ "but in most cases this is not important.", -+ "By default the dynamic load balancing is automatically turned on", -+ "when the measured performance loss due to load imbalance is 5% or more.", -+ "At low parallelization these are the only important options", -+ "for domain decomposition.", -+ "At high parallelization the options in the next two sections", -+ "could be important for increasing the performace.", -+ "[PAR]", -+ "When PME is used with domain decomposition, separate ranks can", -+ "be assigned to do only the PME mesh calculation;", -+ "this is computationally more efficient starting at about 12 ranks,", -+ "or even fewer when OpenMP parallelization is used.", -+ "The number of PME ranks is set with option [TT]-npme[tt],", -+ "but this cannot be more than half of the ranks.", -+ "By default [TT]mdrun[tt] makes a guess for the number of PME", -+ "ranks when the number of ranks is larger than 16. With GPUs,", -+ "using separate PME ranks is not selected automatically,", -+ "since the optimal setup depends very much on the details", -+ "of the hardware. In all cases, you might gain performance", -+ "by optimizing [TT]-npme[tt]. 
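/* Editor's sketch, not part of the patch: pinning a thread to a logical
 * core chosen from an offset and a stride, in the spirit of the
 * -pinoffset/-pinstride options described above.  Linux-specific
 * (pthread_setaffinity_np); the helper name pin_thread_to_core is
 * hypothetical.  Build with: gcc -pthread pin.c -o pin
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static int pin_thread_to_core(int core)
{
    cpu_set_t set;

    CPU_ZERO(&set);
    CPU_SET(core, &set);
    /* Pin the calling thread; returns 0 on success, an errno value otherwise. */
    return pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &set);
}

int main(void)
{
    const int offset   = 0;  /* cf. -pinoffset: first logical core to use     */
    const int stride   = 2;  /* cf. -pinstride: 2 uses every other logical core */
    const int nthreads = 4;

    /* Print the logical core each thread of this process would get. */
    for (int t = 0; t < nthreads; t++)
    {
        printf("thread %d -> logical core %d\n", t, offset + t * stride);
    }

    /* Actually pin the calling (main) thread to its slot as a demonstration. */
    if (pin_thread_to_core(offset) != 0)
    {
        fprintf(stderr, "pinning failed\n");
    }
    return 0;
}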
Performance statistics on this issue", -+ "are written at the end of the log file.", -+ "For good load balancing at high parallelization, the PME grid x and y", -+ "dimensions should be divisible by the number of PME ranks", -+ "(the simulation will run correctly also when this is not the case).", -+ "[PAR]", -+ "This section lists all options that affect the domain decomposition.", -+ "[PAR]", -+ "Option [TT]-rdd[tt] can be used to set the required maximum distance", -+ "for inter charge-group bonded interactions.", -+ "Communication for two-body bonded interactions below the non-bonded", -+ "cut-off distance always comes for free with the non-bonded communication.", -+ "Atoms beyond the non-bonded cut-off are only communicated when they have", -+ "missing bonded interactions; this means that the extra cost is minor", -+ "and nearly indepedent of the value of [TT]-rdd[tt].", -+ "With dynamic load balancing option [TT]-rdd[tt] also sets", -+ "the lower limit for the domain decomposition cell sizes.", -+ "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on", -+ "the initial coordinates. The chosen value will be a balance", -+ "between interaction range and communication cost.", -+ "[PAR]", -+ "When inter charge-group bonded interactions are beyond", -+ "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.", -+ "For pair interactions and tabulated bonds", -+ "that do not generate exclusions, this check can be turned off", -+ "with the option [TT]-noddcheck[tt].", -+ "[PAR]", -+ "When constraints are present, option [TT]-rcon[tt] influences", -+ "the cell size limit as well.", -+ "Atoms connected by NC constraints, where NC is the LINCS order plus 1,", -+ "should not be beyond the smallest cell size. A error message is", -+ "generated when this happens and the user should change the decomposition", -+ "or decrease the LINCS order and increase the number of LINCS iterations.", -+ "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS", -+ "in a conservative fashion. For high parallelization it can be useful", -+ "to set the distance required for P-LINCS with the option [TT]-rcon[tt].", -+ "[PAR]", -+ "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling", -+ "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that", -+ "the cells can scale down by at least this factor. This option is used", -+ "for the automated spatial decomposition (when not using [TT]-dd[tt])", -+ "as well as for determining the number of grid pulses, which in turn", -+ "sets the minimum allowed cell size. Under certain circumstances", -+ "the value of [TT]-dds[tt] might need to be adjusted to account for", -+ "high or low spatial inhomogeneity of the system.", -+ "[PAR]", -+ "The option [TT]-gcom[tt] can be used to only do global communication", -+ "every n steps.", -+ "This can improve performance for highly parallel simulations", -+ "where this global communication step becomes the bottleneck.", -+ "For a global thermostat and/or barostat the temperature", -+ "and/or pressure will also only be updated every [TT]-gcom[tt] steps.", -+ "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]", -+ "With [TT]-rerun[tt] an input trajectory can be given for which ", -+ "forces and energies will be (re)calculated. 
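/* Editor's sketch, not part of the patch: the load-balancing rule of
 * thumb stated above -- for good PME load balance the grid x and y
 * dimensions should be divisible by the number of PME ranks, while the
 * run remains correct either way.  Purely illustrative; the grid sizes
 * below are made up.
 */
#include <stdio.h>

static void check_pme_grid(int nx, int ny, int npme)
{
    if (npme > 0 && (nx % npme != 0 || ny % npme != 0))
    {
        printf("grid %dx%d: not divisible by %d PME ranks "
               "(run is still correct, balance may suffer)\n", nx, ny, npme);
    }
    else
    {
        printf("grid %dx%d: evenly divisible over %d PME ranks\n",
               nx, ny, npme);
    }
}

int main(void)
{
    check_pme_grid(96, 96, 12);    /* divisible: good balance     */
    check_pme_grid(100, 96, 12);   /* x not divisible: warn       */
    return 0;
}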
Neighbor searching will be", -+ "performed for every frame, unless [TT]nstlist[tt] is zero", -+ "(see the [TT].mdp[tt] file).[PAR]", -+ "ED (essential dynamics) sampling and/or additional flooding potentials", -+ "are switched on by using the [TT]-ei[tt] flag followed by an [TT].edi[tt]", -+ "file. The [TT].edi[tt] file can be produced with the [TT]make_edi[tt] tool", -+ "or by using options in the essdyn menu of the WHAT IF program.", -+ "[TT]mdrun[tt] produces a [TT].xvg[tt] output file that", -+ "contains projections of positions, velocities and forces onto selected", -+ "eigenvectors.[PAR]", -+ "When user-defined potential functions have been selected in the", -+ "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]", -+ "a formatted table with potential functions. The file is read from", -+ "either the current directory or from the [TT]GMXLIB[tt] directory.", -+ "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,", -+ "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with", -+ "normal Coulomb.", -+ "When pair interactions are present, a separate table for pair interaction", -+ "functions is read using the [TT]-tablep[tt] option.[PAR]", -+ "When tabulated bonded functions are present in the topology,", -+ "interaction functions are read using the [TT]-tableb[tt] option.", -+ "For each different tabulated interaction type the table file name is", -+ "modified in a different way: before the file extension an underscore is", -+ "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals", -+ "and finally the table number of the interaction type.[PAR]", -+ "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM", -+ "coordinates and forces when pulling is selected", -+ "in the [TT].mdp[tt] file.[PAR]", -+ "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ", -+ "simulated in parallel.", -+ "As many input files/directories are required as the number of systems. ", -+ "The [TT]-multidir[tt] option takes a list of directories (one for each ", -+ "system) and runs in each of them, using the input/output file names, ", -+ "such as specified by e.g. the [TT]-s[tt] option, relative to these ", -+ "directories.", -+ "With [TT]-multi[tt], the system number is appended to the run input ", -+ "and each output filename, for instance [TT]topol.tpr[tt] becomes", -+ "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.", -+ "The number of ranks per system is the total number of ranks", -+ "divided by the number of systems.", -+ "One use of this option is for NMR refinement: when distance", -+ "or orientation restraints are present these can be ensemble averaged", -+ "over all the systems.[PAR]", -+ "With [TT]-replex[tt] replica exchange is attempted every given number", -+ "of steps. The number of replicas is set with the [TT]-multi[tt] or ", -+ "[TT]-multidir[tt] option, described above.", -+ "All run input files should use a different coupling temperature,", -+ "the order of the files is not important. The random seed is set with", -+ "[TT]-reseed[tt]. The velocities are scaled and neighbor searching", -+ "is performed after every exchange.[PAR]", -+ "Finally some experimental algorithms can be tested when the", -+ "appropriate options have been given. Currently under", -+ "investigation are: polarizability.", -+ "[PAR]", -+ "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed", -+ "a protein into a membrane. The data file should contain the options", -+ "that where passed to g_membed before. 
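/* Editor's sketch, not part of the patch: the -tableb naming convention
 * described above -- an underscore, a type letter ('b' for bonds, 'a'
 * for angles, 'd' for dihedrals) and the table number are inserted
 * before the file extension.  The helper name make_table_name is
 * hypothetical.
 */
#include <stdio.h>
#include <string.h>

static void make_table_name(const char *base, char type, int number,
                            char *out, size_t outlen)
{
    const char *dot  = strrchr(base, '.');
    size_t      stem = dot ? (size_t)(dot - base) : strlen(base);

    /* e.g. "table.xvg", 'd', 3 -> "table_d3.xvg" */
    snprintf(out, outlen, "%.*s_%c%d%s",
             (int)stem, base, type, number, dot ? dot : "");
}

int main(void)
{
    char name[128];

    make_table_name("table.xvg", 'b', 0, name, sizeof(name));
    printf("%s\n", name);   /* table_b0.xvg */

    make_table_name("table.xvg", 'd', 3, name, sizeof(name));
    printf("%s\n", name);   /* table_d3.xvg */
    return 0;
}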
The [TT]-mn[tt] and [TT]-mp[tt]", -+ "both apply to this as well.", -+ "[PAR]", -+ "The option [TT]-pforce[tt] is useful when you suspect a simulation", -+ "crashes due to too large forces. With this option coordinates and", -+ "forces of atoms with a force larger than a certain value will", -+ "be printed to stderr.", -+ "[PAR]", -+ "Checkpoints containing the complete state of the system are written", -+ "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],", -+ "unless option [TT]-cpt[tt] is set to -1.", -+ "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to", -+ "make sure that a recent state of the system is always available,", -+ "even when the simulation is terminated while writing a checkpoint.", -+ "With [TT]-cpnum[tt] all checkpoint files are kept and appended", -+ "with the step number.", -+ "A simulation can be continued by reading the full state from file", -+ "with option [TT]-cpi[tt]. This option is intelligent in the way that", -+ "if no checkpoint file is found, Gromacs just assumes a normal run and", -+ "starts from the first step of the [TT].tpr[tt] file. By default the output", -+ "will be appending to the existing output files. The checkpoint file", -+ "contains checksums of all output files, such that you will never", -+ "loose data when some output files are modified, corrupt or removed.", -+ "There are three scenarios with [TT]-cpi[tt]:[PAR]", -+ "[TT]*[tt] no files with matching names are present: new output files are written[PAR]", -+ "[TT]*[tt] all files are present with names and checksums matching those stored", -+ "in the checkpoint file: files are appended[PAR]", -+ "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]", -+ "With [TT]-noappend[tt] new output files are opened and the simulation", -+ "part number is added to all output file names.", -+ "Note that in all cases the checkpoint file itself is not renamed", -+ "and will be overwritten, unless its name does not match", -+ "the [TT]-cpo[tt] option.", -+ "[PAR]", -+ "With checkpointing the output is appended to previously written", -+ "output files, unless [TT]-noappend[tt] is used or none of the previous", -+ "output files are present (except for the checkpoint file).", -+ "The integrity of the files to be appended is verified using checksums", -+ "which are stored in the checkpoint file. This ensures that output can", -+ "not be mixed up or corrupted due to file appending. When only some", -+ "of the previous output files are present, a fatal error is generated", -+ "and no old output files are modified and no new output files are opened.", -+ "The result with appending will be the same as from a single run.", -+ "The contents will be binary identical, unless you use a different number", -+ "of ranks or dynamic load balancing or the FFT library uses optimizations", -+ "through timing.", -+ "[PAR]", -+ "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint", -+ "file is written at the first neighbor search step where the run time", -+ "exceeds [TT]-maxh[tt]*0.99 hours.", -+ "[PAR]", -+ "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current", -+ "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. 
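/* Editor's sketch, not part of the patch: the three -cpi checkpoint
 * scenarios listed above, written out as a small decision helper.  The
 * enum and function names are hypothetical, purely to make the rule
 * explicit: no matching output files -> new files; all present with
 * matching checksums -> append; anything else -> fatal error, nothing
 * is touched.
 */
#include <stdio.h>
#include <stdbool.h>

typedef enum {
    CPI_NEW_OUTPUT,   /* no matching output files: start new files      */
    CPI_APPEND,       /* all files present, checksums match: append     */
    CPI_FATAL         /* anything else: refuse to touch existing files  */
} cpi_action_t;

static cpi_action_t decide_cpi_action(int nfiles_expected,
                                      int nfiles_present,
                                      bool checksums_match)
{
    if (nfiles_present == 0)
    {
        return CPI_NEW_OUTPUT;
    }
    if (nfiles_present == nfiles_expected && checksums_match)
    {
        return CPI_APPEND;
    }
    return CPI_FATAL;
}

int main(void)
{
    printf("%d %d %d\n",
           decide_cpi_action(4, 0, false),   /* 0: new output files */
           decide_cpi_action(4, 4, true),    /* 1: append           */
           decide_cpi_action(4, 2, true));   /* 2: fatal error      */
    return 0;
}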
when ctrl+C is", -+ "pressed), it will stop after the next neighbor search step ", -+ "(with nstlist=0 at the next step).", -+ "In both cases all the usual output will be written to file.", -+ "When running with MPI, a signal to one of the [TT]mdrun[tt] ranks", -+ "is sufficient, this signal should not be sent to mpirun or", -+ "the [TT]mdrun[tt] process that is the parent of the others.", -+ "[PAR]", -+ "Interactive molecular dynamics (IMD) can be activated by using at least one", -+ "of the three IMD switches: The [TT]-imdterm[tt] switch allows to terminate the", -+ "simulation from the molecular viewer (e.g. VMD). With [TT]-imdwait[tt],", -+ "[TT]mdrun[tt] pauses whenever no IMD client is connected. Pulling from the", -+ "IMD remote can be turned on by [TT]-imdpull[tt].", -+ "The port [TT]mdrun[tt] listens to can be altered by [TT]-imdport[tt].The", -+ "file pointed to by [TT]-if[tt] contains atom indices and forces if IMD", -+ "pulling is used." -+ "[PAR]", -+ "When [TT]mdrun[tt] is started with MPI, it does not run niced by default." -+ }; -+ t_commrec *cr; -+ t_filenm fnm[] = { -+ { efTPX, NULL, NULL, ffREAD }, -+ { efTRN, "-o", NULL, ffWRITE }, -+ { efCOMPRESSED, "-x", NULL, ffOPTWR }, -+ { efCPT, "-cpi", NULL, ffOPTRD }, -+ { efCPT, "-cpo", NULL, ffOPTWR }, -+ { efSTO, "-c", "confout", ffWRITE }, -+ { efEDR, "-e", "ener", ffWRITE }, -+ { efLOG, "-g", "md", ffWRITE }, -+ { efXVG, "-dhdl", "dhdl", ffOPTWR }, -+ { efXVG, "-field", "field", ffOPTWR }, -+ { efXVG, "-table", "table", ffOPTRD }, -+ { efXVG, "-tabletf", "tabletf", ffOPTRD }, -+ { efXVG, "-tablep", "tablep", ffOPTRD }, -+ { efXVG, "-tableb", "table", ffOPTRD }, -+ { efTRX, "-rerun", "rerun", ffOPTRD }, -+ { efXVG, "-tpi", "tpi", ffOPTWR }, -+ { efXVG, "-tpid", "tpidist", ffOPTWR }, -+ { efEDI, "-ei", "sam", ffOPTRD }, -+ { efXVG, "-eo", "edsam", ffOPTWR }, -+ { efXVG, "-devout", "deviatie", ffOPTWR }, -+ { efXVG, "-runav", "runaver", ffOPTWR }, -+ { efXVG, "-px", "pullx", ffOPTWR }, -+ { efXVG, "-pf", "pullf", ffOPTWR }, -+ { efXVG, "-ro", "rotation", ffOPTWR }, -+ { efLOG, "-ra", "rotangles", ffOPTWR }, -+ { efLOG, "-rs", "rotslabs", ffOPTWR }, -+ { efLOG, "-rt", "rottorque", ffOPTWR }, -+ { efMTX, "-mtx", "nm", ffOPTWR }, -+ { efNDX, "-dn", "dipole", ffOPTWR }, -+ { efRND, "-multidir", NULL, ffOPTRDMULT}, -+ { efDAT, "-membed", "membed", ffOPTRD }, -+ { efTOP, "-mp", "membed", ffOPTRD }, -+ { efNDX, "-mn", "membed", ffOPTRD }, -+ { efXVG, "-if", "imdforces", ffOPTWR }, -+ { efXVG, "-swap", "swapions", ffOPTWR } -+ }; -+#define NFILE asize(fnm) -+ -+ /* Command line options ! 
*/ -+ gmx_bool bDDBondCheck = TRUE; -+ gmx_bool bDDBondComm = TRUE; -+ gmx_bool bTunePME = TRUE; -+ gmx_bool bTestVerlet = FALSE; -+ gmx_bool bVerbose = FALSE; -+ gmx_bool bCompact = TRUE; -+ gmx_bool bSepPot = FALSE; -+ gmx_bool bRerunVSite = FALSE; -+ gmx_bool bConfout = TRUE; -+ gmx_bool bReproducible = FALSE; -+ gmx_bool bIMDwait = FALSE; -+ gmx_bool bIMDterm = FALSE; -+ gmx_bool bIMDpull = FALSE; -+ -+ int npme = -1; -+ int nstlist = 0; -+ int nmultisim = 0; -+ int nstglobalcomm = -1; -+ int repl_ex_nst = 0; -+ int repl_ex_seed = -1; -+ int repl_ex_nex = 0; -+ int nstepout = 100; -+ int resetstep = -1; -+ gmx_int64_t nsteps = -2; /* the value -2 means that the mdp option will be used */ -+ int imdport = 8888; /* can be almost anything, 8888 is easy to remember */ -+ -+ rvec realddxyz = {0, 0, 0}; -+ const char *ddno_opt[ddnoNR+1] = -+ { NULL, "interleave", "pp_pme", "cartesian", NULL }; -+ const char *dddlb_opt[] = -+ { NULL, "auto", "no", "yes", NULL }; -+ const char *thread_aff_opt[threadaffNR+1] = -+ { NULL, "auto", "on", "off", NULL }; -+ const char *nbpu_opt[] = -+ { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL }; -+ real rdd = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1; -+ char *ddcsx = NULL, *ddcsy = NULL, *ddcsz = NULL; -+ real cpt_period = 15.0, max_hours = -1; -+ gmx_bool bAppendFiles = TRUE; -+ gmx_bool bKeepAndNumCPT = FALSE; -+ gmx_bool bResetCountersHalfWay = FALSE; -+ output_env_t oenv = NULL; -+ const char *deviceOptions = ""; -+ -+ /* Non transparent initialization of a complex gmx_hw_opt_t struct. -+ * But unfortunately we are not allowed to call a function here, -+ * since declarations follow below. -+ */ -+ gmx_hw_opt_t hw_opt = { -+ 0, 0, 0, 0, threadaffSEL, 0, 0, -+ { NULL, FALSE, 0, NULL } -+ }; -+ -+ t_pargs pa[] = { -+ -+ { "-dd", FALSE, etRVEC, {&realddxyz}, -+ "Domain decomposition grid, 0 is optimize" }, -+ { "-ddorder", FALSE, etENUM, {ddno_opt}, -+ "DD rank order" }, -+ { "-npme", FALSE, etINT, {&npme}, -+ "Number of separate ranks to be used for PME, -1 is guess" }, -+ { "-nt", FALSE, etINT, {&hw_opt.nthreads_tot}, -+ "Total number of threads to start (0 is guess)" }, -+ { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, -+ "Number of thread-MPI threads to start (0 is guess)" }, -+ { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, -+ "Number of OpenMP threads per MPI rank to start (0 is guess)" }, -+ { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, -+ "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" }, -+ { "-pin", FALSE, etENUM, {thread_aff_opt}, -+ "Set thread affinities" }, -+ { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset}, -+ "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" }, -+ { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, -+ "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, -+ { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id}, -+ "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" }, -+ { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck}, -+ "Check for all bonded interactions with DD" }, -+ { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm}, -+ "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" }, -+ { "-rdd", FALSE, etREAL, {&rdd}, -+ "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" }, -+ { "-rcon", FALSE, etREAL, {&rconstr}, -+ "Maximum distance for 
P-LINCS (nm), 0 is estimate" }, -+ { "-dlb", FALSE, etENUM, {dddlb_opt}, -+ "Dynamic load balancing (with DD)" }, -+ { "-dds", FALSE, etREAL, {&dlb_scale}, -+ "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to " -+ "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." }, -+ { "-ddcsx", FALSE, etSTR, {&ddcsx}, -+ "HIDDENA string containing a vector of the relative sizes in the x " -+ "direction of the corresponding DD cells. Only effective with static " -+ "load balancing." }, -+ { "-ddcsy", FALSE, etSTR, {&ddcsy}, -+ "HIDDENA string containing a vector of the relative sizes in the y " -+ "direction of the corresponding DD cells. Only effective with static " -+ "load balancing." }, -+ { "-ddcsz", FALSE, etSTR, {&ddcsz}, -+ "HIDDENA string containing a vector of the relative sizes in the z " -+ "direction of the corresponding DD cells. Only effective with static " -+ "load balancing." }, -+ { "-gcom", FALSE, etINT, {&nstglobalcomm}, -+ "Global communication frequency" }, -+ { "-nb", FALSE, etENUM, {&nbpu_opt}, -+ "Calculate non-bonded interactions on" }, -+ { "-nstlist", FALSE, etINT, {&nstlist}, -+ "Set nstlist when using a Verlet buffer tolerance (0 is guess)" }, -+ { "-tunepme", FALSE, etBOOL, {&bTunePME}, -+ "Optimize PME load between PP/PME ranks or GPU/CPU" }, -+ { "-testverlet", FALSE, etBOOL, {&bTestVerlet}, -+ "Test the Verlet non-bonded scheme" }, -+ { "-v", FALSE, etBOOL, {&bVerbose}, -+ "Be loud and noisy" }, -+ { "-compact", FALSE, etBOOL, {&bCompact}, -+ "Write a compact log file" }, -+ { "-seppot", FALSE, etBOOL, {&bSepPot}, -+ "Write separate V and dVdl terms for each interaction type and rank to the log file(s)" }, -+ { "-pforce", FALSE, etREAL, {&pforce}, -+ "Print all forces larger than this (kJ/mol nm)" }, -+ { "-reprod", FALSE, etBOOL, {&bReproducible}, -+ "Try to avoid optimizations that affect binary reproducibility" }, -+ { "-cpt", FALSE, etREAL, {&cpt_period}, -+ "Checkpoint interval (minutes)" }, -+ { "-cpnum", FALSE, etBOOL, {&bKeepAndNumCPT}, -+ "Keep and number checkpoint files" }, -+ { "-append", FALSE, etBOOL, {&bAppendFiles}, -+ "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" }, -+ { "-nsteps", FALSE, etINT64, {&nsteps}, -+ "Run this number of steps, overrides .mdp file option" }, -+ { "-maxh", FALSE, etREAL, {&max_hours}, -+ "Terminate after 0.99 times this time (hours)" }, -+ { "-multi", FALSE, etINT, {&nmultisim}, -+ "Do multiple simulations in parallel" }, -+ { "-replex", FALSE, etINT, {&repl_ex_nst}, -+ "Attempt replica exchange periodically with this period (steps)" }, -+ { "-nex", FALSE, etINT, {&repl_ex_nex}, -+ "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). -nex zero or not specified gives neighbor replica exchange." 
}, -+ { "-reseed", FALSE, etINT, {&repl_ex_seed}, -+ "Seed for replica exchange, -1 is generate a seed" }, -+ { "-imdport", FALSE, etINT, {&imdport}, -+ "HIDDENIMD listening port" }, -+ { "-imdwait", FALSE, etBOOL, {&bIMDwait}, -+ "HIDDENPause the simulation while no IMD client is connected" }, -+ { "-imdterm", FALSE, etBOOL, {&bIMDterm}, -+ "HIDDENAllow termination of the simulation from IMD client" }, -+ { "-imdpull", FALSE, etBOOL, {&bIMDpull}, -+ "HIDDENAllow pulling in the simulation from IMD client" }, -+ { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite}, -+ "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" }, -+ { "-confout", FALSE, etBOOL, {&bConfout}, -+ "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" }, -+ { "-stepout", FALSE, etINT, {&nstepout}, -+ "HIDDENFrequency of writing the remaining wall clock time for the run" }, -+ { "-resetstep", FALSE, etINT, {&resetstep}, -+ "HIDDENReset cycle counters after these many time steps" }, -+ { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay}, -+ "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" } -+ }; -+ unsigned long Flags, PCA_Flags; -+ ivec ddxyz; -+ int dd_node_order; -+ gmx_bool bAddPart; -+ FILE *fplog, *fpmulti; -+ int sim_part, sim_part_fn; -+ const char *part_suffix = ".part"; -+ char suffix[STRLEN]; -+ int rc; -+ char **multidir = NULL; -+ -+ -+ cr = init_commrec(); -+ -+ PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET)); -+ -+ /* Comment this in to do fexist calls only on master -+ * works not with rerun or tables at the moment -+ * also comment out the version of init_forcerec in md.c -+ * with NULL instead of opt2fn -+ */ -+ /* -+ if (!MASTER(cr)) -+ { -+ PCA_Flags |= PCA_NOT_READ_NODE; -+ } -+ */ -+ -+ if (!parse_common_args(&argc, argv, PCA_Flags, NFILE, fnm, asize(pa), pa, -+ asize(desc), desc, 0, NULL, &oenv)) -+ { -+ return 0; -+ } -+ -+ -+ /* we set these early because they might be used in init_multisystem() -+ Note that there is the potential for npme>nnodes until the number of -+ threads is set later on, if there's thread parallelization. That shouldn't -+ lead to problems. */ -+ dd_node_order = nenum(ddno_opt); -+ cr->npmenodes = npme; -+ -+ hw_opt.thread_affinity = nenum(thread_aff_opt); -+ -+ /* now check the -multi and -multidir option */ -+ if (opt2bSet("-multidir", NFILE, fnm)) -+ { -+ if (nmultisim > 0) -+ { -+ gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive."); -+ } -+ nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm); -+ } -+ -+ -+ if (repl_ex_nst != 0 && nmultisim < 2) -+ { -+ gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multi)"); -+ } -+ -+ if (repl_ex_nex < 0) -+ { -+ gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive"); -+ } -+ -+ if (nmultisim > 1) -+ { -+#ifndef GMX_THREAD_MPI -+ gmx_bool bParFn = (multidir == NULL); -+ init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn); -+#else -+ gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library. 
" -+ "Please compile GROMACS with MPI support"); -+#endif -+ } -+ -+ bAddPart = !bAppendFiles; -+ -+ /* Check if there is ANY checkpoint file available */ -+ sim_part = 1; -+ sim_part_fn = sim_part; -+ if (opt2bSet("-cpi", NFILE, fnm)) -+ { -+ if (bSepPot && bAppendFiles) -+ { -+ gmx_fatal(FARGS, "Output file appending is not supported with -seppot"); -+ } -+ -+ bAppendFiles = -+ read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE, -+ fnm, cr), -+ &sim_part_fn, NULL, cr, -+ bAppendFiles, NFILE, fnm, -+ part_suffix, &bAddPart); -+ if (sim_part_fn == 0 && MULTIMASTER(cr)) -+ { -+ fprintf(stdout, "No previous checkpoint file present, assuming this is a new run.\n"); -+ } -+ else -+ { -+ sim_part = sim_part_fn + 1; -+ } -+ -+ if (MULTISIM(cr) && MASTER(cr)) -+ { -+ if (MULTIMASTER(cr)) -+ { -+ /* Log file is not yet available, so if there's a -+ * problem we can only write to stderr. */ -+ fpmulti = stderr; -+ } -+ else -+ { -+ fpmulti = NULL; -+ } -+ check_multi_int(fpmulti, cr->ms, sim_part, "simulation part", TRUE); -+ } -+ } -+ else -+ { -+ bAppendFiles = FALSE; -+ } -+ -+ if (!bAppendFiles) -+ { -+ sim_part_fn = sim_part; -+ } -+ -+ if (bAddPart) -+ { -+ /* Rename all output files (except checkpoint files) */ -+ /* create new part name first (zero-filled) */ -+ sprintf(suffix, "%s%04d", part_suffix, sim_part_fn); -+ -+ add_suffix_to_output_names(fnm, NFILE, suffix); -+ if (MULTIMASTER(cr)) -+ { -+ fprintf(stdout, "Checkpoint file is from part %d, new output files will be suffixed '%s'.\n", sim_part-1, suffix); -+ } -+ } -+ -+ Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0; -+ Flags = Flags | (bSepPot ? MD_SEPPOT : 0); -+ Flags = Flags | (bDDBondCheck ? MD_DDBONDCHECK : 0); -+ Flags = Flags | (bDDBondComm ? MD_DDBONDCOMM : 0); -+ Flags = Flags | (bTunePME ? MD_TUNEPME : 0); -+ Flags = Flags | (bTestVerlet ? MD_TESTVERLET : 0); -+ Flags = Flags | (bConfout ? MD_CONFOUT : 0); -+ Flags = Flags | (bRerunVSite ? MD_RERUN_VSITE : 0); -+ Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0); -+ Flags = Flags | (bAppendFiles ? MD_APPENDFILES : 0); -+ Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0); -+ Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0); -+ Flags = Flags | (sim_part > 1 ? MD_STARTFROMCPT : 0); -+ Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0); -+ Flags = Flags | (bIMDwait ? MD_IMDWAIT : 0); -+ Flags = Flags | (bIMDterm ? MD_IMDTERM : 0); -+ Flags = Flags | (bIMDpull ? MD_IMDPULL : 0); -+ -+ /* We postpone opening the log file if we are appending, so we can -+ first truncate the old log file and append to the correct position -+ there instead. 
*/ -+ if ((MASTER(cr) || bSepPot) && !bAppendFiles) -+ { -+ gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, -+ !bSepPot, Flags & MD_APPENDFILES, &fplog); -+ please_cite(fplog, "Hess2008b"); -+ please_cite(fplog, "Spoel2005a"); -+ please_cite(fplog, "Lindahl2001a"); -+ please_cite(fplog, "Berendsen95a"); -+ } -+ else if (!MASTER(cr) && bSepPot) -+ { -+ gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, !bSepPot, Flags, &fplog); -+ } -+ else -+ { -+ fplog = NULL; -+ } -+ -+ ddxyz[XX] = (int)(realddxyz[XX] + 0.5); -+ ddxyz[YY] = (int)(realddxyz[YY] + 0.5); -+ ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); -+ -+ rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact, -+ nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr, -+ dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz, -+ nbpu_opt[0], nstlist, -+ nsteps, nstepout, resetstep, -+ nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed, -+ pforce, cpt_period, max_hours, deviceOptions, imdport, Flags); -+ -+ /* Log file has to be closed in mdrunner if we are appending to it -+ (fplog not set here) */ -+ if (MASTER(cr) && !bAppendFiles) -+ { -+ gmx_log_close(fplog); -+ } -+ -+ return rc; -+} -diff --git a/src/programs/mdrun/repl_ex.c b/src/programs/mdrun/repl_ex.c -index 46a9bc0..cfb0b7f 100644 ---- a/src/programs/mdrun/repl_ex.c -+++ b/src/programs/mdrun/repl_ex.c -@@ -51,6 +51,12 @@ - #include "domdec.h" - #include "gromacs/random/random.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+/* END PLUMED */ -+ - #define PROBABILITYCUTOFF 100 - /* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ - -@@ -112,14 +118,16 @@ static gmx_bool repl_quantity(const gmx_multisim_t *ms, - qall[re->repl] = q; - gmx_sum_sim(ms->nsim, qall, ms); - -- bDiff = FALSE; -- for (s = 1; s < ms->nsim; s++) -- { -- if (qall[s] != qall[0]) -- { -+ /* PLUMED */ -+ //bDiff = FALSE; -+ //for (s = 1; s < ms->nsim; s++) -+ //{ -+ // if (qall[s] != qall[0]) -+ // { - bDiff = TRUE; -- } -- } -+ // } -+ //} -+ /* END PLUMED */ - - if (bDiff) - { -@@ -269,6 +277,10 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - re->ind[i] = i; - } - -+ /* PLUMED */ -+ // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD) -+ // in those cases replicas can share the same temperature. -+ /* - if (re->type < ereENDSINGLE) - { - -@@ -277,11 +289,12 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - for (j = i+1; j < re->nrepl; j++) - { - if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) -- { -+ {*/ - /* Unordered replicas are supposed to work, but there - * is still an issues somewhere. - * Note that at this point still re->ind[i]=i. - */ -+ /* - gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", - i, j, - erename[re->type], -@@ -299,6 +312,8 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - } - } - } -+ */ -+ /* END PLUMED */ - - /* keep track of all the swaps, starting with the initial placement. 
*/ - snew(re->allswaps, re->nrepl); -@@ -982,6 +997,10 @@ test_for_replica_exchange(FILE *fplog, - pind[i] = re->ind[i]; - } - -+ /* PLUMED */ -+ int plumed_test_exchange_pattern=0; -+ /* END PLUMED */ -+ - if (bMultiEx) - { - /* multiple random switch exchange */ -@@ -1057,6 +1076,31 @@ test_for_replica_exchange(FILE *fplog, - /* standard nearest neighbor replica exchange */ - - m = (step / re->nst) % 2; -+ /* PLUMED */ -+ if(plumedswitch){ -+ int partner=re->repl; -+ plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern); -+ if(plumed_test_exchange_pattern>0){ -+ int *list; -+ snew(list,re->nrepl); -+ plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl)); -+ plumed_cmd(plumedmain,"getExchangesList",list); -+ for(i=0; inrepl; i++) re->ind[i]=list[i]; -+ sfree(list); -+ } -+ -+ for(i=1; inrepl; i++) { -+ if (i % 2 != m) continue; -+ a = re->ind[i-1]; -+ b = re->ind[i]; -+ if(re->repl==a) partner=b; -+ if(re->repl==b) partner=a; -+ } -+ plumed_cmd(plumedmain,"GREX setPartner",&partner); -+ plumed_cmd(plumedmain,"GREX calculate",NULL); -+ plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL); -+ } -+ /* END PLUMED */ - for (i = 1; i < re->nrepl; i++) - { - a = re->ind[i-1]; -@@ -1066,6 +1110,18 @@ test_for_replica_exchange(FILE *fplog, - if (i % 2 == m) - { - delta = calc_delta(fplog, bPrint, re, a, b, a, b); -+ /* PLUMED */ -+ if(plumedswitch){ -+ real adb,bdb,dplumed; -+ char buf[300]; -+ sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb); -+ sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb); -+ dplumed=adb*re->beta[a]+bdb*re->beta[b]; -+ delta+=dplumed; -+ if (bPrint) -+ fprintf(fplog,"dplumed = %10.3e dE_Term = %10.3e (kT)\n",dplumed,delta); -+ } -+ /* END PLUMED */ - if (delta <= 0) - { - /* accepted */ -@@ -1092,11 +1148,22 @@ test_for_replica_exchange(FILE *fplog, - - if (bEx[i]) - { -+ /* PLUMED */ -+ if(!plumed_test_exchange_pattern) { -+ /* standard neighbour swapping */ - /* swap these two */ - tmp = pind[i-1]; - pind[i-1] = pind[i]; - pind[i] = tmp; - re->nexchange[i]++; /* statistics for back compatibility */ -+ } else { -+ /* alternative swapping patterns */ -+ tmp = pind[a]; -+ pind[a] = pind[b]; -+ pind[b] = tmp; -+ re->nexchange[i]++; /* statistics for back compatibility */ -+ } -+ /* END PLUMED */ - } - } - else -@@ -1112,6 +1179,15 @@ test_for_replica_exchange(FILE *fplog, - re->nattempt[m]++; - } - -+ /* PLUMED */ -+ if(plumed_test_exchange_pattern>0) { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->ind[i] = i; -+ } -+ } -+ /* END PLUMED */ -+ - /* record which moves were made and accepted */ - for (i = 0; i < re->nrepl; i++) - { -@@ -1316,6 +1392,10 @@ gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex * - /* The order in which multiple exchanges will occur. */ - gmx_bool bThisReplicaExchanged = FALSE; - -+ /* PLUMED */ -+ if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL); -+ /* END PLUMED */ -+ - if (MASTER(cr)) - { - replica_id = re->repl; -diff --git a/src/programs/mdrun/repl_ex.c.preplumed b/src/programs/mdrun/repl_ex.c.preplumed -new file mode 100644 -index 0000000..46a9bc0 ---- /dev/null -+++ b/src/programs/mdrun/repl_ex.c.preplumed -@@ -0,0 +1,1439 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. 
-+ * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include "repl_ex.h" -+#include "network.h" -+#include "gromacs/random/random.h" -+#include "gromacs/utility/smalloc.h" -+#include "physics.h" -+#include "copyrite.h" -+#include "macros.h" -+#include "vec.h" -+#include "names.h" -+#include "domdec.h" -+#include "gromacs/random/random.h" -+ -+#define PROBABILITYCUTOFF 100 -+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ -+ -+enum { -+ ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR -+}; -+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"}; -+/* end_single_marker merely notes the end of single variable replica exchange. All types higher than -+ it are multiple replica exchange methods */ -+/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?; -+ Let's wait until we feel better about the pressure control methods giving exact ensembles. 
Right now, we assume constant pressure */ -+ -+typedef struct gmx_repl_ex -+{ -+ int repl; -+ int nrepl; -+ real temp; -+ int type; -+ real **q; -+ gmx_bool bNPT; -+ real *pres; -+ int *ind; -+ int *allswaps; -+ int nst; -+ int nex; -+ int seed; -+ int nattempt[2]; -+ real *prob_sum; -+ int **nmoves; -+ int *nexchange; -+ gmx_rng_t rng; -+ -+ /* these are helper arrays for replica exchange; allocated here so they -+ don't have to be allocated each time */ -+ int *destinations; -+ int **cyclic; -+ int **order; -+ int *tmpswap; -+ gmx_bool *incycle; -+ gmx_bool *bEx; -+ -+ /* helper arrays to hold the quantities that are exchanged */ -+ real *prob; -+ real *Epot; -+ real *beta; -+ real *Vol; -+ real **de; -+ -+} t_gmx_repl_ex; -+ -+static gmx_bool repl_quantity(const gmx_multisim_t *ms, -+ struct gmx_repl_ex *re, int ere, real q) -+{ -+ real *qall; -+ gmx_bool bDiff; -+ int i, s; -+ -+ snew(qall, ms->nsim); -+ qall[re->repl] = q; -+ gmx_sum_sim(ms->nsim, qall, ms); -+ -+ bDiff = FALSE; -+ for (s = 1; s < ms->nsim; s++) -+ { -+ if (qall[s] != qall[0]) -+ { -+ bDiff = TRUE; -+ } -+ } -+ -+ if (bDiff) -+ { -+ /* Set the replica exchange type and quantities */ -+ re->type = ere; -+ -+ snew(re->q[ere], re->nrepl); -+ for (s = 0; s < ms->nsim; s++) -+ { -+ re->q[ere][s] = qall[s]; -+ } -+ } -+ sfree(qall); -+ return bDiff; -+} -+ -+gmx_repl_ex_t init_replica_exchange(FILE *fplog, -+ const gmx_multisim_t *ms, -+ const t_state *state, -+ const t_inputrec *ir, -+ int nst, int nex, int init_seed) -+{ -+ real temp, pres; -+ int i, j, k; -+ struct gmx_repl_ex *re; -+ gmx_bool bTemp; -+ gmx_bool bLambda = FALSE; -+ -+ fprintf(fplog, "\nInitializing Replica Exchange\n"); -+ -+ if (ms == NULL || ms->nsim == 1) -+ { -+ gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?"); -+ } -+ if (!EI_DYNAMICS(ir->eI)) -+ { -+ gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations"); -+ /* Note that PAR(cr) is defined by cr->nnodes > 1, which is -+ * distinct from MULTISIM(cr). A multi-simulation only runs -+ * with real MPI parallelism, but this does not imply PAR(cr) -+ * is true! -+ * -+ * Since we are using a dynamical integrator, the only -+ * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are -+ * synonymous. The only way for cr->nnodes > 1 to be true is -+ * if we are using DD. 
*/ -+ } -+ -+ snew(re, 1); -+ -+ re->repl = ms->sim; -+ re->nrepl = ms->nsim; -+ snew(re->q, ereENDSINGLE); -+ -+ fprintf(fplog, "Repl There are %d replicas:\n", re->nrepl); -+ -+ check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE); -+ check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE); -+ check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE); -+ check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst, -+ "first exchange step: init_step/-replex", FALSE); -+ check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE); -+ check_multi_int(fplog, ms, ir->opts.ngtc, -+ "the number of temperature coupling groups", FALSE); -+ check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE); -+ check_multi_int(fplog, ms, ir->efep, "free energy", FALSE); -+ check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE); -+ -+ re->temp = ir->opts.ref_t[0]; -+ for (i = 1; (i < ir->opts.ngtc); i++) -+ { -+ if (ir->opts.ref_t[i] != re->temp) -+ { -+ fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); -+ fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); -+ } -+ } -+ -+ re->type = -1; -+ bTemp = repl_quantity(ms, re, ereTEMP, re->temp); -+ if (ir->efep != efepNO) -+ { -+ bLambda = repl_quantity(ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state); -+ } -+ if (re->type == -1) /* nothing was assigned */ -+ { -+ gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl); -+ } -+ if (bLambda && bTemp) -+ { -+ re->type = ereTL; -+ } -+ -+ if (bTemp) -+ { -+ please_cite(fplog, "Sugita1999a"); -+ if (ir->epc != epcNO) -+ { -+ re->bNPT = TRUE; -+ fprintf(fplog, "Repl Using Constant Pressure REMD.\n"); -+ please_cite(fplog, "Okabe2001a"); -+ } -+ if (ir->etc == etcBERENDSEN) -+ { -+ gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead", -+ ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE)); -+ } -+ } -+ if (bLambda) -+ { -+ if (ir->fepvals->delta_lambda != 0) /* check this? */ -+ { -+ gmx_fatal(FARGS, "delta_lambda is not zero"); -+ } -+ } -+ if (re->bNPT) -+ { -+ snew(re->pres, re->nrepl); -+ if (ir->epct == epctSURFACETENSION) -+ { -+ pres = ir->ref_p[ZZ][ZZ]; -+ } -+ else -+ { -+ pres = 0; -+ j = 0; -+ for (i = 0; i < DIM; i++) -+ { -+ if (ir->compress[i][i] != 0) -+ { -+ pres += ir->ref_p[i][i]; -+ j++; -+ } -+ } -+ pres /= j; -+ } -+ re->pres[re->repl] = pres; -+ gmx_sum_sim(re->nrepl, re->pres, ms); -+ } -+ -+ /* Make an index for increasing replica order */ -+ /* only makes sense if one or the other is varying, not both! -+ if both are varying, we trust the order the person gave. */ -+ snew(re->ind, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->ind[i] = i; -+ } -+ -+ if (re->type < ereENDSINGLE) -+ { -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = i+1; j < re->nrepl; j++) -+ { -+ if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) -+ { -+ /* Unordered replicas are supposed to work, but there -+ * is still an issues somewhere. -+ * Note that at this point still re->ind[i]=i. 
-+ */ -+ gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", -+ i, j, -+ erename[re->type], -+ re->q[re->type][i], re->q[re->type][j], -+ erename[re->type]); -+ -+ k = re->ind[i]; -+ re->ind[i] = re->ind[j]; -+ re->ind[j] = k; -+ } -+ else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]]) -+ { -+ gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]); -+ } -+ } -+ } -+ } -+ -+ /* keep track of all the swaps, starting with the initial placement. */ -+ snew(re->allswaps, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->allswaps[i] = re->ind[i]; -+ } -+ -+ switch (re->type) -+ { -+ case ereTEMP: -+ fprintf(fplog, "\nReplica exchange in temperature\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ case ereLAMBDA: -+ fprintf(fplog, "\nReplica exchange in lambda\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ case ereTL: -+ fprintf(fplog, "\nReplica exchange in temperature and lambda state\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ default: -+ gmx_incons("Unknown replica exchange quantity"); -+ } -+ if (re->bNPT) -+ { -+ fprintf(fplog, "\nRepl p"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.2f", re->pres[re->ind[i]]); -+ } -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]])) -+ { -+ fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); -+ fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); -+ } -+ } -+ } -+ re->nst = nst; -+ if (init_seed == -1) -+ { -+ if (MASTERSIM(ms)) -+ { -+ re->seed = (int)gmx_rng_make_seed(); -+ } -+ else -+ { -+ re->seed = 0; -+ } -+ gmx_sumi_sim(1, &(re->seed), ms); -+ } -+ else -+ { -+ re->seed = init_seed; -+ } -+ fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst); -+ fprintf(fplog, "\nReplica random seed: %d\n", re->seed); -+ re->rng = gmx_rng_init(re->seed); -+ -+ re->nattempt[0] = 0; -+ re->nattempt[1] = 0; -+ -+ snew(re->prob_sum, re->nrepl); -+ snew(re->nexchange, re->nrepl); -+ snew(re->nmoves, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->nmoves[i], re->nrepl); -+ } -+ fprintf(fplog, "Replica exchange information below: x=exchange, pr=probability\n"); -+ -+ /* generate space for the helper functions so we don't have to snew each time */ -+ -+ snew(re->destinations, re->nrepl); -+ snew(re->incycle, re->nrepl); -+ snew(re->tmpswap, re->nrepl); -+ snew(re->cyclic, re->nrepl); -+ snew(re->order, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->cyclic[i], re->nrepl); -+ snew(re->order[i], re->nrepl); -+ } -+ /* allocate space for the functions storing the data for the replicas */ -+ /* not all of these arrays needed in all cases, but they don't take -+ up much space, since the max size is nrepl**2 */ -+ snew(re->prob, re->nrepl); -+ snew(re->bEx, re->nrepl); -+ snew(re->beta, re->nrepl); -+ snew(re->Vol, re->nrepl); -+ snew(re->Epot, re->nrepl); -+ snew(re->de, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->de[i], re->nrepl); 
-+ } -+ re->nex = nex; -+ return re; -+} -+ -+static void exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n) -+{ -+ real *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+ -+static void exchange_ints(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, int *v, int n) -+{ -+ int *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n) -+{ -+ double *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n) -+{ -+ rvec *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v[0], n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, -+ buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ copy_rvec(buf[i], v[i]); -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state) -+{ -+ /* When t_state changes, this code should be updated. 
*/ -+ int ngtc, nnhpres; -+ ngtc = state->ngtc * state->nhchainlength; -+ nnhpres = state->nnhpres* state->nhchainlength; -+ exchange_rvecs(ms, b, state->box, DIM); -+ exchange_rvecs(ms, b, state->box_rel, DIM); -+ exchange_rvecs(ms, b, state->boxv, DIM); -+ exchange_reals(ms, b, &(state->veta), 1); -+ exchange_reals(ms, b, &(state->vol0), 1); -+ exchange_rvecs(ms, b, state->svir_prev, DIM); -+ exchange_rvecs(ms, b, state->fvir_prev, DIM); -+ exchange_rvecs(ms, b, state->pres_prev, DIM); -+ exchange_doubles(ms, b, state->nosehoover_xi, ngtc); -+ exchange_doubles(ms, b, state->nosehoover_vxi, ngtc); -+ exchange_doubles(ms, b, state->nhpres_xi, nnhpres); -+ exchange_doubles(ms, b, state->nhpres_vxi, nnhpres); -+ exchange_doubles(ms, b, state->therm_integral, state->ngtc); -+ exchange_rvecs(ms, b, state->x, state->natoms); -+ exchange_rvecs(ms, b, state->v, state->natoms); -+ exchange_rvecs(ms, b, state->sd_X, state->natoms); -+} -+ -+static void copy_rvecs(rvec *s, rvec *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ copy_rvec(s[i], d[i]); -+ } -+ } -+} -+ -+static void copy_doubles(const double *s, double *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+static void copy_reals(const real *s, real *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+static void copy_ints(const int *s, int *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+#define scopy_rvecs(v, n) copy_rvecs(state->v, state_local->v, n); -+#define scopy_doubles(v, n) copy_doubles(state->v, state_local->v, n); -+#define scopy_reals(v, n) copy_reals(state->v, state_local->v, n); -+#define scopy_ints(v, n) copy_ints(state->v, state_local->v, n); -+ -+static void copy_state_nonatomdata(t_state *state, t_state *state_local) -+{ -+ /* When t_state changes, this code should be updated. 
*/ -+ int ngtc, nnhpres; -+ ngtc = state->ngtc * state->nhchainlength; -+ nnhpres = state->nnhpres* state->nhchainlength; -+ scopy_rvecs(box, DIM); -+ scopy_rvecs(box_rel, DIM); -+ scopy_rvecs(boxv, DIM); -+ state_local->veta = state->veta; -+ state_local->vol0 = state->vol0; -+ scopy_rvecs(svir_prev, DIM); -+ scopy_rvecs(fvir_prev, DIM); -+ scopy_rvecs(pres_prev, DIM); -+ scopy_doubles(nosehoover_xi, ngtc); -+ scopy_doubles(nosehoover_vxi, ngtc); -+ scopy_doubles(nhpres_xi, nnhpres); -+ scopy_doubles(nhpres_vxi, nnhpres); -+ scopy_doubles(therm_integral, state->ngtc); -+ scopy_rvecs(x, state->natoms); -+ scopy_rvecs(v, state->natoms); -+ scopy_rvecs(sd_X, state->natoms); -+ copy_ints(&(state->fep_state), &(state_local->fep_state), 1); -+ scopy_reals(lambda, efptNR); -+} -+ -+static void scale_velocities(t_state *state, real fac) -+{ -+ int i; -+ -+ if (state->v) -+ { -+ for (i = 0; i < state->natoms; i++) -+ { -+ svmul(fac, state->v[i], state->v[i]); -+ } -+ } -+} -+ -+static void print_transition_matrix(FILE *fplog, int n, int **nmoves, int *nattempt) -+{ -+ int i, j, ntot; -+ float Tprint; -+ -+ ntot = nattempt[0] + nattempt[1]; -+ fprintf(fplog, "\n"); -+ fprintf(fplog, "Repl"); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, " "); /* put the title closer to the center */ -+ } -+ fprintf(fplog, "Empirical Transition Matrix\n"); -+ -+ fprintf(fplog, "Repl"); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%8d", (i+1)); -+ } -+ fprintf(fplog, "\n"); -+ -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "Repl"); -+ for (j = 0; j < n; j++) -+ { -+ Tprint = 0.0; -+ if (nmoves[i][j] > 0) -+ { -+ Tprint = nmoves[i][j]/(2.0*ntot); -+ } -+ fprintf(fplog, "%8.4f", Tprint); -+ } -+ fprintf(fplog, "%3d\n", i); -+ } -+} -+ -+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx) -+{ -+ int i; -+ -+ fprintf(fplog, "Repl %2s %2d", leg, ind[0]); -+ for (i = 1; i < n; i++) -+ { -+ fprintf(fplog, " %c %2d", (bEx != 0 && bEx[i]) ? 'x' : ' ', ind[i]); -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap) -+{ -+ int i; -+ -+ for (i = 0; i < n; i++) -+ { -+ tmpswap[i] = allswaps[i]; -+ } -+ for (i = 0; i < n; i++) -+ { -+ allswaps[i] = tmpswap[pind[i]]; -+ } -+ -+ fprintf(fplog, "\nAccepted Exchanges: "); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%d ", pind[i]); -+ } -+ fprintf(fplog, "\n"); -+ -+ /* the "Order After Exchange" is the state label corresponding to the configuration that -+ started in state listed in order, i.e. -+ -+ 3 0 1 2 -+ -+ means that the: -+ configuration starting in simulation 3 is now in simulation 0, -+ configuration starting in simulation 0 is now in simulation 1, -+ configuration starting in simulation 1 is now in simulation 2, -+ configuration starting in simulation 2 is now in simulation 3 -+ */ -+ fprintf(fplog, "Order After Exchange: "); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%d ", allswaps[i]); -+ } -+ fprintf(fplog, "\n\n"); -+} -+ -+static void print_prob(FILE *fplog, const char *leg, int n, real *prob) -+{ -+ int i; -+ char buf[8]; -+ -+ fprintf(fplog, "Repl %2s ", leg); -+ for (i = 1; i < n; i++) -+ { -+ if (prob[i] >= 0) -+ { -+ sprintf(buf, "%4.2f", prob[i]); -+ fprintf(fplog, " %3s", buf[0] == '1' ? 
"1.0" : buf+1); -+ } -+ else -+ { -+ fprintf(fplog, " "); -+ } -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static void print_count(FILE *fplog, const char *leg, int n, int *count) -+{ -+ int i; -+ -+ fprintf(fplog, "Repl %2s ", leg); -+ for (i = 1; i < n; i++) -+ { -+ fprintf(fplog, " %4d", count[i]); -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp) -+{ -+ -+ real ediff, dpV, delta = 0; -+ real *Epot = re->Epot; -+ real *Vol = re->Vol; -+ real **de = re->de; -+ real *beta = re->beta; -+ -+ /* Two cases; we are permuted and not. In all cases, setting ap = a and bp = b will reduce -+ to the non permuted case */ -+ -+ switch (re->type) -+ { -+ case ereTEMP: -+ /* -+ * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439 -+ */ -+ ediff = Epot[b] - Epot[a]; -+ delta = -(beta[bp] - beta[ap])*ediff; -+ break; -+ case ereLAMBDA: -+ /* two cases: when we are permuted, and not. */ -+ /* non-permuted: -+ ediff = E_new - E_old -+ = [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)] -+ = [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)] -+ = de[b][a] + de[a][b] */ -+ -+ /* permuted: -+ ediff = E_new - E_old -+ = [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)] -+ = [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)] -+ = [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)] -+ = [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)] -+ = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */ -+ /* but, in the current code implementation, we flip configurations, not indices . . . -+ So let's examine that. -+ = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)] -+ = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_pb)] -+ = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp] -+ So, if we exchange b<=> bp and a<=> ap, we return to the same result. -+ So the simple solution is to flip the -+ position of perturbed and original indices in the tests. -+ */ -+ -+ ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]); -+ delta = ediff*beta[a]; /* assume all same temperature in this case */ -+ break; -+ case ereTL: -+ /* not permuted: */ -+ /* delta = reduced E_new - reduced E_old -+ = [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)] -+ = [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)] -+ = [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] + -+ [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)] -+ = [beta_b dH_b(x_a) + [beta_a dH_a(x_b) + -+ beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b)) -+ = beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */ -+ /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */ -+ /* permuted (big breath!) 
*/ -+ /* delta = reduced E_new - reduced E_old -+ = [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)] -+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] -+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] -+ - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a) -+ - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b) -+ = [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] + -+ [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))] -+ + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b) -+ = [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] + -+ [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))] -+ + beta_pb (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b)) -+ = ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b] - beta_bp de[bp][b]) -+ + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b)) */ -+ delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]); -+ break; -+ default: -+ gmx_incons("Unknown replica exchange quantity"); -+ } -+ if (bPrint) -+ { -+ fprintf(fplog, "Repl %d <-> %d dE_term = %10.3e (kT)\n", a, b, delta); -+ } -+ if (re->bNPT) -+ { -+ /* revist the calculation for 5.0. Might be some improvements. */ -+ dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC; -+ if (bPrint) -+ { -+ fprintf(fplog, " dpV = %10.3e d = %10.3e\n", dpV, delta + dpV); -+ } -+ delta += dpV; -+ } -+ return delta; -+} -+ -+static void -+test_for_replica_exchange(FILE *fplog, -+ const gmx_multisim_t *ms, -+ struct gmx_repl_ex *re, -+ gmx_enerdata_t *enerd, -+ real vol, -+ gmx_int64_t step, -+ real time) -+{ -+ int m, i, j, a, b, ap, bp, i0, i1, tmp; -+ real ediff = 0, delta = 0, dpV = 0; -+ gmx_bool bPrint, bMultiEx; -+ gmx_bool *bEx = re->bEx; -+ real *prob = re->prob; -+ int *pind = re->destinations; /* permuted index */ -+ gmx_bool bEpot = FALSE; -+ gmx_bool bDLambda = FALSE; -+ gmx_bool bVol = FALSE; -+ gmx_rng_t rng; -+ -+ bMultiEx = (re->nex > 1); /* multiple exchanges at each state */ -+ fprintf(fplog, "Replica exchange at step " "%"GMX_PRId64 " time %.5f\n", step, time); -+ -+ if (re->bNPT) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->Vol[i] = 0; -+ } -+ bVol = TRUE; -+ re->Vol[re->repl] = vol; -+ } -+ if ((re->type == ereTEMP || re->type == ereTL)) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->Epot[i] = 0; -+ } -+ bEpot = TRUE; -+ re->Epot[re->repl] = enerd->term[F_EPOT]; -+ /* temperatures of different states*/ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ); -+ } -+ } -+ else -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->beta[i] = 1.0/(re->temp*BOLTZ); /* we have a single temperature */ -+ } -+ } -+ if (re->type == ereLAMBDA || re->type == ereTL) -+ { -+ bDLambda = TRUE; -+ /* lambda differences. 
*/ -+ /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian -+ minus the energy of the jth simulation in the jth Hamiltonian */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = 0; j < re->nrepl; j++) -+ { -+ re->de[i][j] = 0; -+ } -+ } -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]); -+ } -+ } -+ -+ /* now actually do the communication */ -+ if (bVol) -+ { -+ gmx_sum_sim(re->nrepl, re->Vol, ms); -+ } -+ if (bEpot) -+ { -+ gmx_sum_sim(re->nrepl, re->Epot, ms); -+ } -+ if (bDLambda) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ gmx_sum_sim(re->nrepl, re->de[i], ms); -+ } -+ } -+ -+ /* make a duplicate set of indices for shuffling */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ pind[i] = re->ind[i]; -+ } -+ -+ if (bMultiEx) -+ { -+ /* multiple random switch exchange */ -+ int nself = 0; -+ for (i = 0; i < re->nex + nself; i++) -+ { -+ double rnd[2]; -+ -+ gmx_rng_cycle_2uniform(step, i*2, re->seed, RND_SEED_REPLEX, rnd); -+ /* randomly select a pair */ -+ /* in theory, could reduce this by identifying only which switches had a nonneglibible -+ probability of occurring (log p > -100) and only operate on those switches */ -+ /* find out which state it is from, and what label that state currently has. Likely -+ more work that useful. */ -+ i0 = (int)(re->nrepl*rnd[0]); -+ i1 = (int)(re->nrepl*rnd[1]); -+ if (i0 == i1) -+ { -+ nself++; -+ continue; /* self-exchange, back up and do it again */ -+ } -+ -+ a = re->ind[i0]; /* what are the indices of these states? */ -+ b = re->ind[i1]; -+ ap = pind[i0]; -+ bp = pind[i1]; -+ -+ bPrint = FALSE; /* too noisy */ -+ /* calculate the energy difference */ -+ /* if the code changes to flip the STATES, rather than the configurations, -+ use the commented version of the code */ -+ /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */ -+ delta = calc_delta(fplog, bPrint, re, ap, bp, a, b); -+ -+ /* we actually only use the first space in the prob and bEx array, -+ since there are actually many switches between pairs. 
*/ -+ -+ if (delta <= 0) -+ { -+ /* accepted */ -+ prob[0] = 1; -+ bEx[0] = TRUE; -+ } -+ else -+ { -+ if (delta > PROBABILITYCUTOFF) -+ { -+ prob[0] = 0; -+ } -+ else -+ { -+ prob[0] = exp(-delta); -+ } -+ /* roll a number to determine if accepted */ -+ gmx_rng_cycle_2uniform(step, i*2+1, re->seed, RND_SEED_REPLEX, rnd); -+ bEx[0] = rnd[0] < prob[0]; -+ } -+ re->prob_sum[0] += prob[0]; -+ -+ if (bEx[0]) -+ { -+ /* swap the states */ -+ tmp = pind[i0]; -+ pind[i0] = pind[i1]; -+ pind[i1] = tmp; -+ } -+ } -+ re->nattempt[0]++; /* keep track of total permutation trials here */ -+ print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap); -+ } -+ else -+ { -+ /* standard nearest neighbor replica exchange */ -+ -+ m = (step / re->nst) % 2; -+ for (i = 1; i < re->nrepl; i++) -+ { -+ a = re->ind[i-1]; -+ b = re->ind[i]; -+ -+ bPrint = (re->repl == a || re->repl == b); -+ if (i % 2 == m) -+ { -+ delta = calc_delta(fplog, bPrint, re, a, b, a, b); -+ if (delta <= 0) -+ { -+ /* accepted */ -+ prob[i] = 1; -+ bEx[i] = TRUE; -+ } -+ else -+ { -+ double rnd[2]; -+ -+ if (delta > PROBABILITYCUTOFF) -+ { -+ prob[i] = 0; -+ } -+ else -+ { -+ prob[i] = exp(-delta); -+ } -+ /* roll a number to determine if accepted */ -+ gmx_rng_cycle_2uniform(step, i, re->seed, RND_SEED_REPLEX, rnd); -+ bEx[i] = rnd[0] < prob[i]; -+ } -+ re->prob_sum[i] += prob[i]; -+ -+ if (bEx[i]) -+ { -+ /* swap these two */ -+ tmp = pind[i-1]; -+ pind[i-1] = pind[i]; -+ pind[i] = tmp; -+ re->nexchange[i]++; /* statistics for back compatibility */ -+ } -+ } -+ else -+ { -+ prob[i] = -1; -+ bEx[i] = FALSE; -+ } -+ } -+ /* print some statistics */ -+ print_ind(fplog, "ex", re->nrepl, re->ind, bEx); -+ print_prob(fplog, "pr", re->nrepl, prob); -+ fprintf(fplog, "\n"); -+ re->nattempt[m]++; -+ } -+ -+ /* record which moves were made and accepted */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->nmoves[re->ind[i]][pind[i]] += 1; -+ re->nmoves[pind[i]][re->ind[i]] += 1; -+ } -+ fflush(fplog); /* make sure we can see what the last exchange was */ -+} -+ -+static void write_debug_x(t_state *state) -+{ -+ int i; -+ -+ if (debug) -+ { -+ for (i = 0; i < state->natoms; i += 10) -+ { -+ fprintf(debug, "dx %5d %10.5f %10.5f %10.5f\n", i, state->x[i][XX], state->x[i][YY], state->x[i][ZZ]); -+ } -+ } -+} -+ -+static void -+cyclic_decomposition(const int *destinations, -+ int **cyclic, -+ gmx_bool *incycle, -+ const int nrepl, -+ int *nswap) -+{ -+ -+ int i, j, c, p; -+ int maxlen = 1; -+ for (i = 0; i < nrepl; i++) -+ { -+ incycle[i] = FALSE; -+ } -+ for (i = 0; i < nrepl; i++) /* one cycle for each replica */ -+ { -+ if (incycle[i]) -+ { -+ cyclic[i][0] = -1; -+ continue; -+ } -+ cyclic[i][0] = i; -+ incycle[i] = TRUE; -+ c = 1; -+ p = i; -+ for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */ -+ { -+ p = destinations[p]; /* start permuting */ -+ if (p == i) -+ { -+ cyclic[i][c] = -1; -+ if (c > maxlen) -+ { -+ maxlen = c; -+ } -+ break; /* we've reached the original element, the cycle is complete, and we marked the end. 
*/ -+ } -+ else -+ { -+ cyclic[i][c] = p; /* each permutation gives a new member of the cycle */ -+ incycle[p] = TRUE; -+ c++; -+ } -+ } -+ } -+ *nswap = maxlen - 1; -+ -+ if (debug) -+ { -+ for (i = 0; i < nrepl; i++) -+ { -+ fprintf(debug, "Cycle %d:", i); -+ for (j = 0; j < nrepl; j++) -+ { -+ if (cyclic[i][j] < 0) -+ { -+ break; -+ } -+ fprintf(debug, "%2d", cyclic[i][j]); -+ } -+ fprintf(debug, "\n"); -+ } -+ fflush(debug); -+ } -+} -+ -+static void -+compute_exchange_order(FILE *fplog, -+ int **cyclic, -+ int **order, -+ const int nrepl, -+ const int maxswap) -+{ -+ int i, j; -+ -+ for (j = 0; j < maxswap; j++) -+ { -+ for (i = 0; i < nrepl; i++) -+ { -+ if (cyclic[i][j+1] >= 0) -+ { -+ order[cyclic[i][j+1]][j] = cyclic[i][j]; -+ order[cyclic[i][j]][j] = cyclic[i][j+1]; -+ } -+ } -+ for (i = 0; i < nrepl; i++) -+ { -+ if (order[i][j] < 0) -+ { -+ order[i][j] = i; /* if it's not exchanging, it should stay this round*/ -+ } -+ } -+ } -+ -+ if (debug) -+ { -+ fprintf(fplog, "Replica Exchange Order\n"); -+ for (i = 0; i < nrepl; i++) -+ { -+ fprintf(fplog, "Replica %d:", i); -+ for (j = 0; j < maxswap; j++) -+ { -+ if (order[i][j] < 0) -+ { -+ break; -+ } -+ fprintf(debug, "%2d", order[i][j]); -+ } -+ fprintf(fplog, "\n"); -+ } -+ fflush(fplog); -+ } -+} -+ -+static void -+prepare_to_do_exchange(FILE *fplog, -+ struct gmx_repl_ex *re, -+ const int replica_id, -+ int *maxswap, -+ gmx_bool *bThisReplicaExchanged) -+{ -+ int i, j; -+ /* Hold the cyclic decomposition of the (multiple) replica -+ * exchange. */ -+ gmx_bool bAnyReplicaExchanged = FALSE; -+ *bThisReplicaExchanged = FALSE; -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ if (re->destinations[i] != re->ind[i]) -+ { -+ /* only mark as exchanged if the index has been shuffled */ -+ bAnyReplicaExchanged = TRUE; -+ break; -+ } -+ } -+ if (bAnyReplicaExchanged) -+ { -+ /* reinitialize the placeholder arrays */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = 0; j < re->nrepl; j++) -+ { -+ re->cyclic[i][j] = -1; -+ re->order[i][j] = -1; -+ } -+ } -+ -+ /* Identify the cyclic decomposition of the permutation (very -+ * fast if neighbor replica exchange). */ -+ cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap); -+ -+ /* Now translate the decomposition into a replica exchange -+ * order at each step. */ -+ compute_exchange_order(fplog, re->cyclic, re->order, re->nrepl, *maxswap); -+ -+ /* Did this replica do any exchange at any point? */ -+ for (j = 0; j < *maxswap; j++) -+ { -+ if (replica_id != re->order[replica_id][j]) -+ { -+ *bThisReplicaExchanged = TRUE; -+ break; -+ } -+ } -+ } -+} -+ -+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re, -+ t_state *state, gmx_enerdata_t *enerd, -+ t_state *state_local, gmx_int64_t step, real time) -+{ -+ int i, j; -+ int replica_id = 0; -+ int exchange_partner; -+ int maxswap = 0; -+ /* Number of rounds of exchanges needed to deal with any multiple -+ * exchanges. */ -+ /* Where each replica ends up after the exchange attempt(s). */ -+ /* The order in which multiple exchanges will occur. */ -+ gmx_bool bThisReplicaExchanged = FALSE; -+ -+ if (MASTER(cr)) -+ { -+ replica_id = re->repl; -+ test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time); -+ prepare_to_do_exchange(fplog, re, replica_id, &maxswap, &bThisReplicaExchanged); -+ } -+ /* Do intra-simulation broadcast so all processors belonging to -+ * each simulation know whether they need to participate in -+ * collecting the state. 
Otherwise, they might as well get on with -+ * the next thing to do. */ -+ if (DOMAINDECOMP(cr)) -+ { -+#ifdef GMX_MPI -+ MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), -+ cr->mpi_comm_mygroup); -+#endif -+ } -+ -+ if (bThisReplicaExchanged) -+ { -+ /* Exchange the states */ -+ /* Collect the global state on the master node */ -+ if (DOMAINDECOMP(cr)) -+ { -+ dd_collect_state(cr->dd, state_local, state); -+ } -+ else -+ { -+ copy_state_nonatomdata(state_local, state); -+ } -+ -+ if (MASTER(cr)) -+ { -+ /* There will be only one swap cycle with standard replica -+ * exchange, but there may be multiple swap cycles if we -+ * allow multiple swaps. */ -+ -+ for (j = 0; j < maxswap; j++) -+ { -+ exchange_partner = re->order[replica_id][j]; -+ -+ if (exchange_partner != replica_id) -+ { -+ /* Exchange the global states between the master nodes */ -+ if (debug) -+ { -+ fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner); -+ } -+ exchange_state(cr->ms, exchange_partner, state); -+ } -+ } -+ /* For temperature-type replica exchange, we need to scale -+ * the velocities. */ -+ if (re->type == ereTEMP || re->type == ereTL) -+ { -+ scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]])); -+ } -+ -+ } -+ -+ /* With domain decomposition the global state is distributed later */ -+ if (!DOMAINDECOMP(cr)) -+ { -+ /* Copy the global state to the local state data structure */ -+ copy_state_nonatomdata(state, state_local); -+ } -+ } -+ -+ return bThisReplicaExchanged; -+} -+ -+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re) -+{ -+ int i; -+ -+ fprintf(fplog, "\nReplica exchange statistics\n"); -+ -+ if (re->nex == 0) -+ { -+ fprintf(fplog, "Repl %d attempts, %d odd, %d even\n", -+ re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]); -+ -+ fprintf(fplog, "Repl average probabilities:\n"); -+ for (i = 1; i < re->nrepl; i++) -+ { -+ if (re->nattempt[i%2] == 0) -+ { -+ re->prob[i] = 0; -+ } -+ else -+ { -+ re->prob[i] = re->prob_sum[i]/re->nattempt[i%2]; -+ } -+ } -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_prob(fplog, "", re->nrepl, re->prob); -+ -+ fprintf(fplog, "Repl number of exchanges:\n"); -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_count(fplog, "", re->nrepl, re->nexchange); -+ -+ fprintf(fplog, "Repl average number of exchanges:\n"); -+ for (i = 1; i < re->nrepl; i++) -+ { -+ if (re->nattempt[i%2] == 0) -+ { -+ re->prob[i] = 0; -+ } -+ else -+ { -+ re->prob[i] = ((real)re->nexchange[i])/re->nattempt[i%2]; -+ } -+ } -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_prob(fplog, "", re->nrepl, re->prob); -+ -+ fprintf(fplog, "\n"); -+ } -+ /* print the transition matrix */ -+ print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt); -+} diff --git a/g/GROMACS/gromacs-5.0.4-plumed-2.1.3.patch b/g/GROMACS/gromacs-5.0.4-plumed-2.1.3.patch deleted file mode 100644 index 9faba3c3..00000000 --- a/g/GROMACS/gromacs-5.0.4-plumed-2.1.3.patch +++ /dev/null @@ -1,9575 +0,0 @@ -diff --git a/Plumed.cmake b/Plumed.cmake -new file mode 100644 -index 0000000..f66e115 ---- /dev/null -+++ b/Plumed.cmake -@@ -0,0 +1,3 @@ -+# PLUMED: shared installation -+set(PLUMED_LOAD /home/jas02/software/PLUMED/2.1.3-foss-2015b/lib/plumed///src/lib/libplumed.so -ldl ) -+set(PLUMED_DEPENDENCIES /home/jas02/software/PLUMED/2.1.3-foss-2015b/lib/plumed///src/lib/libplumed.so) -diff --git a/Plumed.h b/Plumed.h -new file mode 100644 -index 0000000..16da74a 
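For orientation, the exchange test implemented by calc_delta() and test_for_replica_exchange() above reduces, for plain temperature exchange (ereTEMP), to a Metropolis criterion on delta = -(beta_b - beta_a)*(E_b - E_a), with exp(-delta) skipped once delta exceeds PROBABILITYCUTOFF. The standalone sketch below restates that rule outside GROMACS; the helper names, the stand-in RNG and the numeric constants are illustrative assumptions, not part of this patch.

    /* Minimal sketch of the temperature replica-exchange acceptance rule
     * (Okabe et al., Chem. Phys. Lett. 335 (2001) 435-439), assuming GROMACS
     * units (kJ/mol, K). accept_swap() and uniform01() are illustrative names. */
    #include <math.h>
    #include <stdlib.h>

    static double uniform01(void)
    {
        return rand() / (RAND_MAX + 1.0);   /* stand-in for the counter-based RNG used above */
    }

    /* Return 1 if two replicas with potential energies Ea/Eb at temperatures
     * Ta/Tb should swap configurations. */
    static int accept_swap(double Ea, double Eb, double Ta, double Tb)
    {
        const double kB    = 0.0083144621;                        /* kJ/(mol K), cf. BOLTZ */
        double       delta = -(1.0/(kB*Tb) - 1.0/(kB*Ta)) * (Eb - Ea);

        if (delta <= 0)
        {
            return 1;                                             /* downhill: always accept */
        }
        if (delta > 100.0)                                        /* cf. PROBABILITYCUTOFF */
        {
            return 0;
        }
        return uniform01() < exp(-delta);                         /* Metropolis step */
    }

After an accepted swap, replica_exchange() above rescales the velocities of the incoming configuration via scale_velocities() by the square root of the temperature ratio, so that they match the local thermostat temperature.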
---- /dev/null -+++ b/Plumed.h -@@ -0,0 +1,494 @@ -+/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -+ Copyright (c) 2011-2014 The plumed team -+ (see the PEOPLE file at the root of the distribution for a list of names) -+ -+ See http://www.plumed-code.org for more information. -+ -+ This file is part of plumed, version 2. -+ -+ plumed is free software: you can redistribute it and/or modify -+ it under the terms of the GNU Lesser General Public License as published by -+ the Free Software Foundation, either version 3 of the License, or -+ (at your option) any later version. -+ -+ plumed is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public License -+ along with plumed. If not, see . -++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ -+#ifndef __PLUMED_wrapper_Plumed_h -+#define __PLUMED_wrapper_Plumed_h -+ -+/** -+\page ReferencePlumedH Reference for interfacing MD codes with PLUMED -+ -+ Plumed.h and Plumed.c contain the external plumed interface, which is used to -+ integrate it with MD engines. This interface is very general, and is expected -+ not to change across plumed versions. Plumed.c also implements a dummy version -+ of the interface, so as to allow a code to be fully linked even if the plumed -+ library is not available yet. These files could be directly included in the official -+ host MD distribution. In this manner, it will be sufficient to link the plumed -+ library at link time (on all systems) or directly at runtime (on system where -+ dynamic loading is enabled) to include plumed features. -+ -+ Why is Plumed.c written in C and not C++? The reason is that the resulting Plumed.o -+ needs to be linked with the host MD code immediately (whereas the rest of plumed -+ could be linked a posteriori). Imagine the MD code is written in FORTRAN: when we -+ link the Plumed.o file we would like not to need any C++ library linked. In this -+ manner, we do not need to know which C++ compiler will be used to compile plumed. -+ The C++ library is only linked to the "rest" of plumed, which actually use it. -+ Anyway, Plumed.c is written in such a manner to allow its compilation also in C++ -+ (C++ is a bit stricter than C; compatibility is checked when PlumedStatic.cpp, -+ which basically includes Plumed.c, is compiled with the C++ compiler). This will -+ allow e.g. MD codes written in C++ to just incorporate Plumed.c (maybe renamed into -+ Plumed.cpp), without the need of configuring a plain C compiler. -+ -+ Plumed interface can be used from C, C++ and FORTRAN. Everything concerning plumed -+ is hidden inside a single object type, which is described in C by a structure -+ (struct \ref plumed), in C++ by a class (PLMD::Plumed) and in FORTRAN by a -+ fixed-length string (CHARACTER(LEN=32)). Obviously C++ can use both struct -+ and class interfaces, but the first should be preferred. The reference interface -+ is the C one, whereas FORTRAN and C++ interfaces are implemented as wrappers -+ around it. -+ -+ In the C++ interface, all the routines are implemented as methods of PLMD::Plumed. -+ In the C and FORTRAN interfaces, all the routines are named plumed_*, to -+ avoid potential name clashes. 
Notice that the entire plumed library -+ is implemented in C++, and it is hidden inside the PLMD namespace. -+ -+ Handlers to the plumed object can be converted among different representations, -+ to allow inter-operability among languages. In C, there are tools to convert -+ to/from FORTRAN, whereas in C++ there are tools to convert to/from FORTRAN and C. -+ -+ These handlers only contain a pointer to the real structure, so that -+ when a plumed object is brought from one language to another, -+ it brings a reference to the same environment. -+ -+ Moreover, to simplify life in all cases where a single Plumed object is -+ required for the entire simulation (which covers most of the practical -+ applications with conventional MD codes) it is possible to take advantage -+ of a global interface, which is implicitly referring to a unique global instance. -+ The global object should still be initialized and finalized properly. -+ -+ The basic method to send a message to plumed is -+\verbatim -+ (C) plumed_cmd -+ (C++) PLMD::Plumed::cmd -+ (FORTRAN) PLUMED_F_CMD -+\endverbatim -+ -+ To initialize a plumed object, use: -+\verbatim -+ (C) plumed_create -+ (C++) (constructor of PLMD::Plumed) -+ (FORTRAN) PLUMED_F_CREATE -+\endverbatim -+ -+ To finalize it, use -+\verbatim -+ (C) plumed_finalize -+ (C++) (destructor of PLMD::Plumed) -+ (FORTRAN) PLUMED_F_FINALIZE -+\endverbatim -+ -+ To access to the global-object, use -+\verbatim -+ (C) plumed_gcreate, plumed_gfinalize, plumed_gcmd -+ (C++) PLMD::Plumed::gcreate, PLMD::Plumed::gfinalize, PLMD::Plumed::gcmd -+ (FORTRAN) PLUMED_F_GCREATE, PLUMED_F_GFINALIZE, PLUMED_F_GCMD -+\endverbatim -+ -+ To check if the global object has been initialized, use -+\verbatim -+ (C) plumed_ginitialized -+ (C++) PLMD::Plumed::ginitialized -+ (FORTRAN) PLUMED_F_GINITIALIZED -+\endverbatim -+ -+ To check if plumed library is available (this is useful for runtime linking), use -+\verbatim -+ (C) plumed_installed -+ (C++) PLMD::Plumed::installed -+ (FORTRAN) PLUMED_F_INSTALLED -+\endverbatim -+ -+ To convert handlers use -+\verbatim -+ (C) plumed_c2f (C to FORTRAN) -+ (C) plumed_f2c (FORTRAN to C) -+ (C++) Plumed(plumed) constructor (C to C++) -+ (C++) operator plumed() cast (C++ to C) -+ (C++) Plumed(char*) constructor (FORTRAN to C++) -+ (C++) toFortran(char*) (C++ to FORTRAN) -+\endverbatim -+ -+\verbatim -+ FORTRAN interface -+ SUBROUTINE PLUMED_F_INSTALLED(i) -+ INTEGER, INTENT(OUT) :: i -+ SUBROUTINE PLUMED_F_GINITIALIZED(i) -+ INTEGER, INTENT(OUT) :: i -+ SUBROUTINE PLUMED_F_GCREATE() -+ SUBROUTINE PLUMED_F_GCMD(key,val) -+ CHARACTER(LEN=*), INTENT(IN) :: key -+ UNSPECIFIED_TYPE, INTENT(INOUT) :: val(*) -+ SUBROUTINE PLUMED_F_GFINALIZE() -+ SUBROUTINE PLUMED_F_GLOBAL(p) -+ CHARACTER(LEN=32), INTENT(OUT) :: p -+ SUBROUTINE PLUMED_F_CREATE(p) -+ CHARACTER(LEN=32), INTENT(OUT) :: p -+ SUBROUTINE PLUMED_F_CMD(p,key,val) -+ CHARACTER(LEN=32), INTENT(IN) :: p -+ CHARACTER(LEN=*), INTENT(IN) :: key -+ UNSPECIFIED_TYPE, INTENT(INOUT) :: val(*) -+ SUBROUTINE PLUMED_F_FINALIZE(p) -+ CHARACTER(LEN=32), INTENT(IN) :: p -+\endverbatim -+ -+ The main routine is "cmd", which accepts two arguments: -+ key is a string containing the name of the command -+ val is the argument. it is declared const so as to use allow passing const objects, but in practice plumed -+ is going to modify val in several cases (using a const_cast). -+ In some cases val can be omitted: just pass a NULL pointer (in C++, val is optional and can be omitted). 
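A compact usage sketch of the C interface documented here: create one plumed object, drive it exclusively through cmd(), and finalize it at the end. The command strings below ("setNatoms", "setTimestep", "init") follow PLUMED's usual cmd vocabulary but are assumptions in this context, not taken from this patch; the global plumed_g* variants follow the same pattern without an explicit handle.

    /* Hypothetical minimal driver for the C interface declared in this header.
     * Command keys are assumed from PLUMED's documented vocabulary. */
    #include <stdio.h>
    #include "Plumed.h"

    int plumed_demo(int natoms, double timestep_ps)
    {
        if (!plumed_installed())                      /* runtime-binding check */
        {
            fprintf(stderr, "no PLUMED kernel available\n");
            return 1;
        }
        plumed p = plumed_create();                   /* constructor */
        plumed_cmd(p, "setNatoms",   &natoms);        /* hand over setup data ... */
        plumed_cmd(p, "setTimestep", &timestep_ps);
        plumed_cmd(p, "init",        NULL);           /* ... then initialize */
        /* per MD step: setStep/setPositions/setForces/setBox ..., then "calc" */
        plumed_finalize(p);                           /* destructor */
        return 0;
    }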
-+ The set of possible keys is the real API of the plumed library, and will be expanded with time. -+ New commands will be added, but backward compatibility will be retained as long as possible. -+ -+ To pass plumed a callback function use the following syntax (not available in FORTRAN yet) -+\verbatim -+ plumed_function_holder ff; -+ ff.p=your_function; -+ plumed_cmd(plumed,"xxxx",&ff); -+\endverbatim -+ (this is passing the your_function() function to the "xxxx" command) -+*/ -+ -+#ifdef __cplusplus -+ extern "C" { -+#endif -+ -+/* Generic function pointer */ -+typedef void (*plumed_function_pointer)(void); -+ -+/** -+ \brief Holder for function pointer. -+ -+ To pass plumed a callback function use the following syntax: -+\verbatim -+ plumed_function_holder ff; -+ ff.p=your_function; -+ plumed_cmd(plumed,"xxxx",&ff); -+\endverbatim -+ (this is going to pass the your_function() function to the "xxxx" command) -+*/ -+ -+typedef struct { -+ plumed_function_pointer p; -+} plumed_function_holder; -+ -+/** -+ \brief Main plumed object -+ -+ This is an object containing a Plumed instance, which should be used in -+ the MD engine. It should first be initialized with plumed_create(), -+ then it communicates with the MD engine using plumed_cmd(). Finally, -+ before the termination, it should be deallocated with plumed_finalize(). -+ Its interface is very simple and general, and is expected -+ not to change across plumed versions. See \ref ReferencePlumedH. -+*/ -+typedef struct { -+/** -+ \private -+ \brief Void pointer holding the real PlumedMain structure -+*/ -+ void*p; -+} plumed; -+ -+/** \relates plumed -+ \brief Constructor -+ -+ \return The constructed plumed object -+*/ -+plumed plumed_create(void); -+ -+/** \relates plumed -+ \brief Tells p to execute a command -+ -+ \param p The plumed object on which command is acting -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like plumed_cmd(p,"A","B"), -+ but for some choice of key it can change the content -+*/ -+void plumed_cmd(plumed p,const char*key,const void*val); -+ -+/** \relates plumed -+ \brief Destructor -+ -+ \param p The plumed object to be deallocated -+*/ -+void plumed_finalize(plumed p); -+ -+/** \relates plumed -+ \brief Check if plumed is installed (for runtime binding) -+ -+ \return 1 if plumed is installed, to 0 otherwise -+*/ -+int plumed_installed(void); -+ -+/** \relates plumed -+ \brief Retrieves an handler to the global structure. -+*/ -+plumed plumed_global(void); -+ -+/** \relates plumed -+ \brief Check if the global interface has been initialized -+ -+ \return 1 if plumed has been initialized, 0 otherwise -+*/ -+int plumed_ginitialized(void); -+ -+/* global C interface, working on a global object */ -+ -+/** \relates plumed -+ \brief Constructor for the global interface. -+ -+ \note Equivalent to plumed_create(), but initialize a static global plumed object -+*/ -+void plumed_gcreate(void); -+ -+/** \relates plumed -+ \brief Tells to the global interface to execute a command. -+ -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like plumed_gcmd("A","B"), -+ but for some choice of key it can change the content -+ -+ \note Equivalent to plumed_cmd(), but skipping the plumed argument -+*/ -+void plumed_gcmd(const char* key,const void* val); -+ -+/** \relates plumed -+ \brief Destructor for the global interface. 
-+ -+ \note Equivalent to plumed_finalize(), but skipping the plumed argument -+*/ -+void plumed_gfinalize(void); -+ -+/* routines to convert char handler from/to plumed objects */ -+ -+/** \related plumed -+ \brief Converts a C handler to a FORTRAN handler -+ -+ \param p The C handler -+ \param c The FORTRAN handler (a char[32]) -+*/ -+void plumed_c2f(plumed p,char* c); -+ -+/** \related plumed -+ \brief Converts a FORTRAN handler to a C handler -+ \param c The FORTRAN handler (a char[32]) -+ \return The C handler -+*/ -+plumed plumed_f2c(const char* c); -+ -+#ifdef __cplusplus -+ } -+#endif -+ -+#ifdef __cplusplus -+ -+/* this is to include the NULL pointer */ -+#include -+ -+/* C++ interface is hidden in PLMD namespace (same as plumed library) */ -+namespace PLMD { -+ -+/** -+ C++ wrapper for \ref plumed. -+ -+ This class provides a C++ interface to PLUMED. -+*/ -+ -+class Plumed{ -+ plumed main; -+/** -+ keeps track if the object was created from scratch using -+ the defaults destructor (cloned=false) or if it was imported -+ from C or FORTRAN (cloned-true). In the latter case, the -+ plumed_finalize() method is not called when destructing the object, -+ since it is expected to be finalized in the C/FORTRAN code -+*/ -+ bool cloned; -+public: -+/** -+ Check if plumed is installed (for runtime binding) -+ \return true if plumed is installed, false otherwise -+*/ -+ static bool installed(); -+/** -+ Check if global-plumed has been initialized -+ \return true if global plumed object (see global()) is initialized (i.e. if gcreate() has been -+ called), false otherwise. -+*/ -+ static bool ginitialized(); -+/** -+ Initialize global-plumed -+*/ -+ static void gcreate(); -+/** -+ Send a command to global-plumed -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like gcmd("A","B"), -+ but for some choice of key it can change the content -+*/ -+ static void gcmd(const char* key,const void* val); -+/** -+ Finalize global-plumed -+*/ -+ static void gfinalize(); -+/** -+ Returns the Plumed global object -+ \return The Plumed global object -+*/ -+ static Plumed global(); -+/** -+ Constructor -+*/ -+ Plumed(); -+/** -+ Clone a Plumed object from a FORTRAN char* handler -+ \param c The FORTRAN handler (a char[32]). -+ -+ \attention The Plumed object created in this manner -+ will not finalize the corresponding plumed structure. -+ It is expected that the FORTRAN code calls plumed_c_finalize for it -+*/ -+ Plumed(const char*c); -+/** -+ Clone a Plumed object from a C plumed structure -+ \param p The C plumed structure. -+ -+ \attention The Plumed object created in this manner -+ will not finalize the corresponding plumed structure. -+ It is expected that the C code calls plumed_finalize for it -+*/ -+ Plumed(plumed p); -+private: -+/** Copy constructor is disabled (private and unimplemented) -+ The problem here is that after copying it will not be clear who is -+ going to finalize the corresponding plumed structure. -+*/ -+ Plumed(const Plumed&); -+/** Assignment operator is disabled (private and unimplemented) -+ The problem here is that after copying it will not be clear who is -+ going to finalize the corresponding plumed structure. -+*/ -+ Plumed&operator=(const Plumed&); -+public: -+/** -+ Retrieve the C plumed structure for this object -+*/ -+ operator plumed()const; -+/** -+ Retrieve a FORTRAN handler for this object -+ \param c The FORTRAN handler (a char[32]). 
-+*/ -+ void toFortran(char*c)const; -+/** -+ Send a command to this plumed object -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like p.cmd("A","B"), -+ but for some choice of key it can change the content -+*/ -+ void cmd(const char*key,const void*val=NULL); -+/** -+ Destructor -+ -+ Destructor is virtual so as to allow correct inheritance from Plumed object. -+ To avoid linking problems with g++, I specify "inline" also here (in principle -+ it should be enough to specify it down in the definition of the function, but -+ for some reason that I do not understand g++ does not inline it properly in that -+ case and complains when Plumed.h is included but Plumed.o is not linked. Anyway, the -+ way it is done here seems to work properly). -+*/ -+ inline virtual ~Plumed(); -+}; -+ -+/* All methods are inlined so as to avoid the compilation of an extra c++ file */ -+ -+inline -+bool Plumed::installed(){ -+ return plumed_installed(); -+} -+ -+inline -+Plumed::Plumed(): -+ main(plumed_create()), -+ cloned(false) -+{} -+ -+inline -+Plumed::Plumed(const char*c): -+ main(plumed_f2c(c)), -+ cloned(true) -+{} -+ -+inline -+Plumed::Plumed(plumed p): -+ main(p), -+ cloned(true) -+{} -+ -+inline -+Plumed::operator plumed()const{ -+ return main; -+} -+ -+inline -+void Plumed::toFortran(char*c)const{ -+ plumed_c2f(main,c); -+} -+ -+inline -+void Plumed::cmd(const char*key,const void*val){ -+ plumed_cmd(main,key,val); -+} -+ -+inline -+Plumed::~Plumed(){ -+ if(!cloned)plumed_finalize(main); -+} -+ -+inline -+bool Plumed::ginitialized(){ -+ return plumed_ginitialized(); -+} -+ -+inline -+void Plumed::gcreate(){ -+ plumed_gcreate(); -+} -+ -+inline -+void Plumed::gcmd(const char* key,const void* val){ -+ plumed_gcmd(key,val); -+} -+ -+inline -+void Plumed::gfinalize(){ -+ plumed_gfinalize(); -+} -+ -+inline -+Plumed Plumed::global(){ -+ return plumed_global(); -+} -+ -+} -+ -+#endif -+ -+ -+#endif -diff --git a/Plumed.inc b/Plumed.inc -new file mode 100644 -index 0000000..cd6097a ---- /dev/null -+++ b/Plumed.inc -@@ -0,0 +1,3 @@ -+# PLUMED: shared installation -+PLUMED_LOAD= /home/jas02/software/PLUMED/2.1.3-foss-2015b/lib/plumed///src/lib/libplumed.so -ldl -+PLUMED_DEPENDENCIES= /home/jas02/software/PLUMED/2.1.3-foss-2015b/lib/plumed///src/lib/libplumed.so -diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt -index 6db37e2..cc97aa8 100644 ---- a/src/gromacs/CMakeLists.txt -+++ b/src/gromacs/CMakeLists.txt -@@ -32,6 +32,8 @@ - # To help us fund GROMACS development, we humbly ask that you cite - # the research papers on the package. Check out http://www.gromacs.org. - -+include(${CMAKE_SOURCE_DIR}/Plumed.cmake) -+ - set(LIBGROMACS_SOURCES) - - function (gmx_install_headers DESTINATION) -@@ -189,7 +191,7 @@ target_link_libraries(libgromacs - ${TNG_IO_LIBRARIES} - ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} - ${XML_LIBRARIES} -- ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}) -+ ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${PLUMED_LOAD}) - set_target_properties(libgromacs PROPERTIES - OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" - SOVERSION ${LIBRARY_SOVERSION} -diff --git a/src/gromacs/CMakeLists.txt.preplumed b/src/gromacs/CMakeLists.txt.preplumed -new file mode 100644 -index 0000000..6db37e2 ---- /dev/null -+++ b/src/gromacs/CMakeLists.txt.preplumed -@@ -0,0 +1,232 @@ -+# -+# This file is part of the GROMACS molecular simulation package. 
-+# -+# Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by -+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+# and including many others, as listed in the AUTHORS file in the -+# top-level source directory and at http://www.gromacs.org. -+# -+# GROMACS is free software; you can redistribute it and/or -+# modify it under the terms of the GNU Lesser General Public License -+# as published by the Free Software Foundation; either version 2.1 -+# of the License, or (at your option) any later version. -+# -+# GROMACS is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+# Lesser General Public License for more details. -+# -+# You should have received a copy of the GNU Lesser General Public -+# License along with GROMACS; if not, see -+# http://www.gnu.org/licenses, or write to the Free Software Foundation, -+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+# -+# If you want to redistribute modifications to GROMACS, please -+# consider that scientific software is very special. Version -+# control is crucial - bugs must be traceable. We will be happy to -+# consider code for inclusion in the official distribution, but -+# derived work must not be called official GROMACS. Details are found -+# in the README & COPYING files - if they are missing, get the -+# official version at http://www.gromacs.org. -+# -+# To help us fund GROMACS development, we humbly ask that you cite -+# the research papers on the package. Check out http://www.gromacs.org. -+ -+set(LIBGROMACS_SOURCES) -+ -+function (gmx_install_headers DESTINATION) -+ if (NOT GMX_BUILD_MDRUN_ONLY) -+ if (DESTINATION) -+ set(DESTINATION ${INCL_INSTALL_DIR}/gromacs/${DESTINATION}) -+ else() -+ set(DESTINATION ${INCL_INSTALL_DIR}/gromacs) -+ endif() -+ install(FILES ${ARGN} DESTINATION ${DESTINATION} COMPONENT development) -+ endif() -+endfunction () -+ -+if(GMX_USE_TNG) -+ option(GMX_EXTERNAL_TNG "Use external TNG instead of compiling the version shipped with GROMACS." -+ OFF) -+ # Detect TNG if GMX_EXTERNAL_TNG is explicitly ON -+ if(GMX_EXTERNAL_TNG) -+ find_package(TNG_IO 1.6.0) -+ if(NOT TNG_IO_FOUND) -+ message(FATAL_ERROR -+ "TNG >= 1.6.0 not found. " -+ "You can set GMX_EXTERNAL_TNG=OFF to compile TNG.") -+ endif() -+ include_directories(${TNG_IO_INCLUDE_DIRS}) -+ endif() -+ if(NOT GMX_EXTERNAL_TNG) -+ include(${CMAKE_SOURCE_DIR}/src/external/tng_io/BuildTNG.cmake) -+ tng_get_source_list(TNG_SOURCES TNG_IO_DEFINITIONS) -+ list(APPEND LIBGROMACS_SOURCES ${TNG_SOURCES}) -+ tng_set_source_properties(WITH_ZLIB ${HAVE_ZLIB}) -+ -+ if (HAVE_ZLIB) -+ list(APPEND GMX_EXTRA_LIBRARIES ${ZLIB_LIBRARIES}) -+ include_directories(${ZLIB_INCLUDE_DIRS}) -+ endif() -+ endif() -+else() -+ # We still need to get tng/tng_io_fwd.h from somewhere! 
-+ include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/tng_io/include) -+endif() -+ -+add_subdirectory(gmxlib) -+add_subdirectory(mdlib) -+add_subdirectory(gmxpreprocess) -+add_subdirectory(commandline) -+add_subdirectory(fft) -+add_subdirectory(linearalgebra) -+add_subdirectory(math) -+add_subdirectory(random) -+add_subdirectory(onlinehelp) -+add_subdirectory(options) -+add_subdirectory(timing) -+add_subdirectory(utility) -+add_subdirectory(fileio) -+add_subdirectory(swap) -+add_subdirectory(essentialdynamics) -+add_subdirectory(pulling) -+add_subdirectory(simd) -+add_subdirectory(imd) -+if (NOT GMX_BUILD_MDRUN_ONLY) -+ add_subdirectory(legacyheaders) -+ add_subdirectory(gmxana) -+ add_subdirectory(statistics) -+ add_subdirectory(analysisdata) -+ add_subdirectory(selection) -+ add_subdirectory(trajectoryanalysis) -+ add_subdirectory(tools) -+endif() -+ -+list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES}) -+ -+# This would be the standard way to include thread_mpi, but -+# we want libgromacs to link the functions directly -+#if(GMX_THREAD_MPI) -+# add_subdirectory(thread_mpi) -+#endif() -+#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB}) -+ -+tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src) -+list(APPEND LIBGROMACS_SOURCES ${THREAD_MPI_SOURCES}) -+ -+file(GLOB LIBGROMACS_HEADERS *.h) -+configure_file(version.h.cmakein version.h) -+gmx_install_headers("" ${LIBGROMACS_HEADERS}) -+gmx_install_headers("" ${CMAKE_CURRENT_BINARY_DIR}/version.h) -+ -+# Add target that generates baseversion-gen.c every time make is run -+# if git version info is requested, or create it statically. -+# This code is here instead of utility/CMakeLists.txt because CMake -+# ignores set_source_file_properties from subdirectories. -+set(GENERATED_VERSION_FILE -+ ${CMAKE_CURRENT_BINARY_DIR}/utility/baseversion-gen.c) -+set(GENERATED_VERSION_FILE_SOURCE -+ ${CMAKE_CURRENT_SOURCE_DIR}/utility/baseversion-gen.c.cmakein) -+if (GMX_GIT_VERSION_INFO) -+ add_custom_target(gmx-version ALL -+ COMMAND ${CMAKE_COMMAND} -+ -D GIT_EXECUTABLE="${GIT_EXECUTABLE}" -+ -D PROJECT_VERSION="${PROJECT_VERSION}" -+ -D PROJECT_SOURCE_DIR="${PROJECT_SOURCE_DIR}" -+ -D VERSION_CMAKEIN=${GENERATED_VERSION_FILE_SOURCE} -+ -D VERSION_OUT=${GENERATED_VERSION_FILE} -+ -P ${CMAKE_SOURCE_DIR}/cmake/gmxGenerateVersionInfo.cmake -+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} -+ DEPENDS ${GENERATED_VERSION_FILE_SOURCE} -+ COMMENT "Generating git version information") -+ set_source_files_properties(${GENERATED_VERSION_FILE} -+ PROPERTIES GENERATED true) -+else() -+ set(GMX_PROJECT_VERSION_STR ${PROJECT_VERSION}) -+ configure_file(${GENERATED_VERSION_FILE_SOURCE} ${GENERATED_VERSION_FILE}) -+endif() -+list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE}) -+ -+# apply gcc 4.4.x bug workaround -+if(GMX_USE_GCC44_BUG_WORKAROUND) -+ include(gmxGCC44O3BugWorkaround) -+ gmx_apply_gcc44_bug_workaround("gmxlib/bondfree.c") -+ gmx_apply_gcc44_bug_workaround("mdlib/force.c") -+ gmx_apply_gcc44_bug_workaround("mdlib/constr.c") -+endif() -+ -+add_library(libgromacs ${LIBGROMACS_SOURCES}) -+if (GMX_GIT_VERSION_INFO) -+ add_dependencies(libgromacs gmx-version) -+endif() -+ -+# Recent versions of gcc and clang give warnings on scanner.cpp, which -+# is a generated source file. These are awkward to suppress inline, so -+# we do it in the compilation command (after testing that the compiler -+# supports the suppressions). 
Setting the properties only works after -+# the related target has been created, e.g. after when the file is -+# used with add_library(). -+include(CheckCXXCompilerFlag) -+check_cxx_compiler_flag(-Wno-unused-parameter HAS_NO_UNUSED_PARAMETER) -+if (HAS_NO_UNUSED_PARAMETER) -+ set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-unused-parameter") -+endif() -+check_cxx_compiler_flag(-Wno-deprecated-register HAS_NO_DEPRECATED_REGISTER) -+if (HAS_NO_DEPRECATED_REGISTER) -+ set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated-register") -+else() -+ check_cxx_compiler_flag(-Wno-deprecated HAS_NO_DEPRECATED) -+ if (HAS_NO_DEPRECATED) -+ set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated") -+ endif() -+endif() -+set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}") -+ -+target_link_libraries(libgromacs -+ ${EXTRAE_LIBRARIES} -+ ${GMX_GPU_LIBRARIES} -+ ${GMX_EXTRA_LIBRARIES} -+ ${TNG_IO_LIBRARIES} -+ ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} -+ ${XML_LIBRARIES} -+ ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}) -+set_target_properties(libgromacs PROPERTIES -+ OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" -+ SOVERSION ${LIBRARY_SOVERSION} -+ VERSION ${LIBRARY_VERSION} -+ COMPILE_FLAGS "${OpenMP_C_FLAGS}") -+ -+# Only install the library in mdrun-only mode if it is actually necessary -+# for the binary -+if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS) -+ install(TARGETS libgromacs -+ LIBRARY DESTINATION ${LIB_INSTALL_DIR} -+ RUNTIME DESTINATION ${BIN_INSTALL_DIR} -+ ARCHIVE DESTINATION ${LIB_INSTALL_DIR} -+ COMPONENT libraries) -+endif() -+ -+if (NOT GMX_BUILD_MDRUN_ONLY) -+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgromacs.pc.cmakein -+ ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc @ONLY) -+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc -+ DESTINATION ${LIB_INSTALL_DIR}/pkgconfig -+ RENAME "libgromacs${GMX_LIBS_SUFFIX}.pc" -+ COMPONENT development) -+endif() -+ -+if (INSTALL_CUDART_LIB) #can be set manual by user -+ if (GMX_GPU) -+ foreach(CUDA_LIB ${CUDA_LIBRARIES}) -+ string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB}) -+ if(IS_CUDART) #libcuda should not be installed -+ #install also name-links (linker uses those) -+ file(GLOB CUDA_LIBS ${CUDA_LIB}*) -+ install(FILES ${CUDA_LIBS} DESTINATION -+ ${LIB_INSTALL_DIR} COMPONENT libraries) -+ endif() -+ endforeach() -+ else() -+ message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU") -+ endif() -+endif() -diff --git a/src/gromacs/mdlib/force.c b/src/gromacs/mdlib/force.c -index 5230983..8227d5b 100644 ---- a/src/gromacs/mdlib/force.c -+++ b/src/gromacs/mdlib/force.c -@@ -67,6 +67,14 @@ - #include "gromacs/timing/wallcycle.h" - #include "gmx_fatal.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+int plumedswitch=0; -+plumed plumedmain; -+void(*plumedcmd)(plumed,const char*,const void*)=NULL; -+/* END PLUMED */ -+ -+ - void ns(FILE *fp, - t_forcerec *fr, - matrix box, -@@ -737,6 +745,13 @@ void do_force_lowlevel(FILE *fplog, gmx_int64_t step, - pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); - } - -+ /* PLUMED */ -+ if(plumedswitch){ -+ int plumedNeedsEnergy; -+ (*plumedcmd)(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ if(!plumedNeedsEnergy) (*plumedcmd)(plumedmain,"performCalc",NULL); -+ } -+ /* END PLUMED */ - } - - void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) -diff --git a/src/gromacs/mdlib/force.c.preplumed b/src/gromacs/mdlib/force.c.preplumed -new file mode 
100644 -index 0000000..5230983 ---- /dev/null -+++ b/src/gromacs/mdlib/force.c.preplumed -@@ -0,0 +1,1018 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. -+ * Copyright (c) 2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. 
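The force.c hunk above is the actual coupling point of this patch: it introduces the globals plumedswitch, plumedmain and plumedcmd (initialized elsewhere in the patched mdrun, outside this excerpt) and, at the end of do_force_lowlevel(), asks PLUMED whether the bias needs the potential energy; if not, the bias forces are computed immediately via "performCalc". The branch where the energy is needed is handled later by the patched caller. The sketch below shows that deferred pattern under the assumption that PLUMED's "setEnergy" key is used to pass the total potential energy; that part is not visible in this excerpt.

    /* Sketch of the deferred branch of the hook shown above; the function name
     * and the "setEnergy" key are assumptions, the other keys mirror the hunk. */
    #include "Plumed.h"    /* adjust path as in the hunk above */

    extern int    plumedswitch;
    extern plumed plumedmain;
    extern void (*plumedcmd)(plumed, const char *, const void *);

    static void plumed_calc_with_energy(double epot)
    {
        if (plumedswitch)
        {
            int needs_energy = 0;
            (*plumedcmd)(plumedmain, "isEnergyNeeded", &needs_energy);
            if (needs_energy)
            {
                (*plumedcmd)(plumedmain, "setEnergy",   &epot);   /* assumed key */
                (*plumedcmd)(plumedmain, "performCalc", NULL);
            }
        }
    }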
-+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include -+#include -+#include "sysstuff.h" -+#include "typedefs.h" -+#include "macros.h" -+#include "gromacs/utility/smalloc.h" -+#include "macros.h" -+#include "physics.h" -+#include "force.h" -+#include "nonbonded.h" -+#include "names.h" -+#include "network.h" -+#include "pbc.h" -+#include "ns.h" -+#include "nrnb.h" -+#include "bondf.h" -+#include "mshift.h" -+#include "txtdump.h" -+#include "coulomb.h" -+#include "pme.h" -+#include "mdrun.h" -+#include "domdec.h" -+#include "qmmm.h" -+#include "gmx_omp_nthreads.h" -+ -+#include "gromacs/timing/wallcycle.h" -+#include "gmx_fatal.h" -+ -+void ns(FILE *fp, -+ t_forcerec *fr, -+ matrix box, -+ gmx_groups_t *groups, -+ gmx_localtop_t *top, -+ t_mdatoms *md, -+ t_commrec *cr, -+ t_nrnb *nrnb, -+ gmx_bool bFillGrid, -+ gmx_bool bDoLongRangeNS) -+{ -+ char *ptr; -+ int nsearch; -+ -+ -+ if (!fr->ns.nblist_initialized) -+ { -+ init_neighbor_list(fp, fr, md->homenr); -+ } -+ -+ if (fr->bTwinRange) -+ { -+ fr->nlr = 0; -+ } -+ -+ nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md, -+ bFillGrid, bDoLongRangeNS); -+ if (debug) -+ { -+ fprintf(debug, "nsearch = %d\n", nsearch); -+ } -+ -+ /* Check whether we have to do dynamic load balancing */ -+ /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0)) -+ count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr, -+ &(top->idef),opts->ngener); -+ */ -+ if (fr->ns.dump_nl > 0) -+ { -+ dump_nblist(fp, cr, fr, fr->ns.dump_nl); -+ } -+} -+ -+static void reduce_thread_forces(int n, rvec *f, -+ tensor vir_q, tensor vir_lj, -+ real *Vcorr_q, real *Vcorr_lj, -+ real *dvdl_q, real *dvdl_lj, -+ int nthreads, f_thread_t *f_t) -+{ -+ int t, i; -+ int nthreads_loop gmx_unused; -+ -+ /* This reduction can run over any number of threads */ -+ nthreads_loop = gmx_omp_nthreads_get(emntBonded); -+#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static) -+ for (i = 0; i < n; i++) -+ { -+ for (t = 1; t < nthreads; t++) -+ { -+ rvec_inc(f[i], f_t[t].f[i]); -+ } -+ } -+ for (t = 1; t < nthreads; t++) -+ { -+ *Vcorr_q += f_t[t].Vcorr_q; -+ *Vcorr_lj += f_t[t].Vcorr_lj; -+ *dvdl_q += f_t[t].dvdl[efptCOUL]; -+ *dvdl_lj += f_t[t].dvdl[efptVDW]; -+ m_add(vir_q, f_t[t].vir_q, vir_q); -+ m_add(vir_lj, f_t[t].vir_lj, vir_lj); -+ } -+} -+ -+void gmx_print_sepdvdl(FILE *fplog, const char *s, real v, real dvdlambda) -+{ -+ fprintf(fplog, " %-30s V %12.5e dVdl %12.5e\n", s, v, dvdlambda); -+} -+ -+void do_force_lowlevel(FILE *fplog, gmx_int64_t step, -+ t_forcerec *fr, t_inputrec *ir, -+ t_idef *idef, t_commrec *cr, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ t_mdatoms *md, -+ rvec x[], history_t *hist, -+ rvec f[], -+ rvec f_longrange[], -+ gmx_enerdata_t *enerd, -+ t_fcdata *fcd, -+ gmx_localtop_t *top, -+ gmx_genborn_t *born, -+ t_atomtypes *atype, -+ gmx_bool bBornRadii, -+ matrix box, -+ t_lambda *fepvals, -+ real *lambda, -+ t_graph *graph, -+ t_blocka *excl, -+ rvec mu_tot[], -+ int flags, -+ float *cycles_pme) -+{ -+ int i, j; -+ int donb_flags; -+ gmx_bool bDoEpot, bSepDVDL, bSB; -+ int pme_flags; -+ matrix boxs; -+ rvec box_size; -+ t_pbc pbc; -+ char buf[22]; -+ double clam_i, vlam_i; -+ real dvdl_dum[efptNR], dvdl_nb[efptNR], lam_i[efptNR]; -+ real dvdl_q, dvdl_lj; -+ -+#ifdef GMX_MPI -+ double t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */ -+#endif -+ -+#define PRINT_SEPDVDL(s, v, dvdlambda) if (bSepDVDL) { gmx_print_sepdvdl(fplog, s, v, dvdlambda); } -+ -+ set_pbc(&pbc, fr->ePBC, box); -+ -+ /* reset 
free energy components */ -+ for (i = 0; i < efptNR; i++) -+ { -+ dvdl_nb[i] = 0; -+ dvdl_dum[i] = 0; -+ } -+ -+ /* Reset box */ -+ for (i = 0; (i < DIM); i++) -+ { -+ box_size[i] = box[i][i]; -+ } -+ -+ bSepDVDL = (fr->bSepDVDL && do_per_step(step, ir->nstlog)); -+ debug_gmx(); -+ -+ /* do QMMM first if requested */ -+ if (fr->bQMMM) -+ { -+ enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr); -+ } -+ -+ if (bSepDVDL) -+ { -+ fprintf(fplog, "Step %s: non-bonded V and dVdl for rank %d:\n", -+ gmx_step_str(step, buf), cr->nodeid); -+ } -+ -+ /* Call the short range functions all in one go. */ -+ -+#ifdef GMX_MPI -+ /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/ -+#define TAKETIME FALSE -+ if (TAKETIME) -+ { -+ MPI_Barrier(cr->mpi_comm_mygroup); -+ t0 = MPI_Wtime(); -+ } -+#endif -+ -+ if (ir->nwall) -+ { -+ /* foreign lambda component for walls */ -+ real dvdl_walls = do_walls(ir, fr, box, md, x, f, lambda[efptVDW], -+ enerd->grpp.ener[egLJSR], nrnb); -+ PRINT_SEPDVDL("Walls", 0.0, dvdl_walls); -+ enerd->dvdl_lin[efptVDW] += dvdl_walls; -+ } -+ -+ /* If doing GB, reset dvda and calculate the Born radii */ -+ if (ir->implicit_solvent) -+ { -+ wallcycle_sub_start(wcycle, ewcsNONBONDED); -+ -+ for (i = 0; i < born->nr; i++) -+ { -+ fr->dvda[i] = 0; -+ } -+ -+ if (bBornRadii) -+ { -+ calc_gb_rad(cr, fr, ir, top, x, &(fr->gblist), born, md, nrnb); -+ } -+ -+ wallcycle_sub_stop(wcycle, ewcsNONBONDED); -+ } -+ -+ where(); -+ /* We only do non-bonded calculation with group scheme here, the verlet -+ * calls are done from do_force_cutsVERLET(). */ -+ if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED)) -+ { -+ donb_flags = 0; -+ /* Add short-range interactions */ -+ donb_flags |= GMX_NONBONDED_DO_SR; -+ -+ /* Currently all group scheme kernels always calculate (shift-)forces */ -+ if (flags & GMX_FORCE_FORCES) -+ { -+ donb_flags |= GMX_NONBONDED_DO_FORCE; -+ } -+ if (flags & GMX_FORCE_VIRIAL) -+ { -+ donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; -+ } -+ if (flags & GMX_FORCE_ENERGY) -+ { -+ donb_flags |= GMX_NONBONDED_DO_POTENTIAL; -+ } -+ if (flags & GMX_FORCE_DO_LR) -+ { -+ donb_flags |= GMX_NONBONDED_DO_LR; -+ } -+ -+ wallcycle_sub_start(wcycle, ewcsNONBONDED); -+ do_nonbonded(fr, x, f, f_longrange, md, excl, -+ &enerd->grpp, nrnb, -+ lambda, dvdl_nb, -1, -1, donb_flags); -+ -+ /* If we do foreign lambda and we have soft-core interactions -+ * we have to recalculate the (non-linear) energies contributions. -+ */ -+ if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) -+ { -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ for (j = 0; j < efptNR; j++) -+ { -+ lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); -+ } -+ reset_foreign_enerdata(enerd); -+ do_nonbonded(fr, x, f, f_longrange, md, excl, -+ &(enerd->foreign_grpp), nrnb, -+ lam_i, dvdl_dum, -1, -1, -+ (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA); -+ sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); -+ enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; -+ } -+ } -+ wallcycle_sub_stop(wcycle, ewcsNONBONDED); -+ where(); -+ } -+ -+ /* If we are doing GB, calculate bonded forces and apply corrections -+ * to the solvation forces */ -+ /* MRS: Eventually, many need to include free energy contribution here! 
*/ -+ if (ir->implicit_solvent) -+ { -+ wallcycle_sub_start(wcycle, ewcsBONDED); -+ calc_gb_forces(cr, md, born, top, x, f, fr, idef, -+ ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd); -+ wallcycle_sub_stop(wcycle, ewcsBONDED); -+ } -+ -+#ifdef GMX_MPI -+ if (TAKETIME) -+ { -+ t1 = MPI_Wtime(); -+ fr->t_fnbf += t1-t0; -+ } -+#endif -+ -+ if (fepvals->sc_alpha != 0) -+ { -+ enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; -+ } -+ else -+ { -+ enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; -+ } -+ -+ if (fepvals->sc_alpha != 0) -+ -+ /* even though coulomb part is linear, we already added it, beacuse we -+ need to go through the vdw calculation anyway */ -+ { -+ enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; -+ } -+ else -+ { -+ enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; -+ } -+ -+ if (bSepDVDL) -+ { -+ real V_short_range = 0; -+ real dvdl_short_range = 0; -+ -+ for (i = 0; i < enerd->grpp.nener; i++) -+ { -+ V_short_range += -+ (fr->bBHAM ? -+ enerd->grpp.ener[egBHAMSR][i] : -+ enerd->grpp.ener[egLJSR][i]) -+ + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i]; -+ } -+ dvdl_short_range = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL]; -+ PRINT_SEPDVDL("VdW and Coulomb SR particle-p.", -+ V_short_range, -+ dvdl_short_range); -+ } -+ debug_gmx(); -+ -+ -+ if (debug) -+ { -+ pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS); -+ } -+ -+ /* Shift the coordinates. Must be done before bonded forces and PPPM, -+ * but is also necessary for SHAKE and update, therefore it can NOT -+ * go when no bonded forces have to be evaluated. -+ */ -+ -+ /* Here sometimes we would not need to shift with NBFonly, -+ * but we do so anyhow for consistency of the returned coordinates. -+ */ -+ if (graph) -+ { -+ shift_self(graph, box, x); -+ if (TRICLINIC(box)) -+ { -+ inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes); -+ } -+ else -+ { -+ inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes); -+ } -+ } -+ /* Check whether we need to do bondeds or correct for exclusions */ -+ if (fr->bMolPBC && -+ ((flags & GMX_FORCE_BONDED) -+ || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))) -+ { -+ /* Since all atoms are in the rectangular or triclinic unit-cell, -+ * only single box vector shifts (2 in x) are required. -+ */ -+ set_pbc_dd(&pbc, fr->ePBC, cr->dd, TRUE, box); -+ } -+ debug_gmx(); -+ -+ if (flags & GMX_FORCE_BONDED) -+ { -+ wallcycle_sub_start(wcycle, ewcsBONDED); -+ calc_bonds(fplog, cr->ms, -+ idef, x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd, -+ DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born, -+ flags, -+ fr->bSepDVDL && do_per_step(step, ir->nstlog), step); -+ -+ /* Check if we have to determine energy differences -+ * at foreign lambda's. -+ */ -+ if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && -+ idef->ilsort != ilsortNO_FE) -+ { -+ if (idef->ilsort != ilsortFE_SORTED) -+ { -+ gmx_incons("The bonded interactions are not sorted for free energy"); -+ } -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ reset_foreign_enerdata(enerd); -+ for (j = 0; j < efptNR; j++) -+ { -+ lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); -+ } -+ calc_bonds_lambda(fplog, idef, x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md, -+ fcd, DOMAINDECOMP(cr) ? 
cr->dd->gatindex : NULL); -+ sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); -+ enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; -+ } -+ } -+ debug_gmx(); -+ -+ wallcycle_sub_stop(wcycle, ewcsBONDED); -+ } -+ -+ where(); -+ -+ *cycles_pme = 0; -+ if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)) -+ { -+ real Vlr = 0, Vcorr = 0; -+ real dvdl_long_range = 0; -+ int status = 0; -+ -+ bSB = (ir->nwall == 2); -+ if (bSB) -+ { -+ copy_mat(box, boxs); -+ svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]); -+ box_size[ZZ] *= ir->wall_ewald_zfac; -+ } -+ } -+ -+ /* Do long-range electrostatics and/or LJ-PME, including related short-range -+ * corrections. -+ */ -+ -+ clear_mat(fr->vir_el_recip); -+ clear_mat(fr->vir_lj_recip); -+ -+ if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)) -+ { -+ real Vlr_q = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0; -+ real dvdl_long_range_q = 0, dvdl_long_range_lj = 0; -+ int status = 0; -+ -+ if (EEL_PME_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype)) -+ { -+ real dvdl_long_range_correction_q = 0; -+ real dvdl_long_range_correction_lj = 0; -+ /* With the Verlet scheme exclusion forces are calculated -+ * in the non-bonded kernel. -+ */ -+ /* The TPI molecule does not have exclusions with the rest -+ * of the system and no intra-molecular PME grid -+ * contributions will be calculated in -+ * gmx_pme_calc_energy. -+ */ -+ if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) || -+ ir->ewald_geometry != eewg3D || -+ ir->epsilon_surface != 0) -+ { -+ int nthreads, t; -+ -+ wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); -+ -+ if (fr->n_tpi > 0) -+ { -+ gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); -+ } -+ -+ nthreads = gmx_omp_nthreads_get(emntBonded); -+#pragma omp parallel for num_threads(nthreads) schedule(static) -+ for (t = 0; t < nthreads; t++) -+ { -+ int s, e, i; -+ rvec *fnv; -+ tensor *vir_q, *vir_lj; -+ real *Vcorrt_q, *Vcorrt_lj, *dvdlt_q, *dvdlt_lj; -+ if (t == 0) -+ { -+ fnv = fr->f_novirsum; -+ vir_q = &fr->vir_el_recip; -+ vir_lj = &fr->vir_lj_recip; -+ Vcorrt_q = &Vcorr_q; -+ Vcorrt_lj = &Vcorr_lj; -+ dvdlt_q = &dvdl_long_range_correction_q; -+ dvdlt_lj = &dvdl_long_range_correction_lj; -+ } -+ else -+ { -+ fnv = fr->f_t[t].f; -+ vir_q = &fr->f_t[t].vir_q; -+ vir_lj = &fr->f_t[t].vir_lj; -+ Vcorrt_q = &fr->f_t[t].Vcorr_q; -+ Vcorrt_lj = &fr->f_t[t].Vcorr_lj; -+ dvdlt_q = &fr->f_t[t].dvdl[efptCOUL]; -+ dvdlt_lj = &fr->f_t[t].dvdl[efptVDW]; -+ for (i = 0; i < fr->natoms_force; i++) -+ { -+ clear_rvec(fnv[i]); -+ } -+ clear_mat(*vir_q); -+ clear_mat(*vir_lj); -+ } -+ *dvdlt_q = 0; -+ *dvdlt_lj = 0; -+ -+ ewald_LRcorrection(fr->excl_load[t], fr->excl_load[t+1], -+ cr, t, fr, -+ md->chargeA, md->chargeB, -+ md->sqrt_c6A, md->sqrt_c6B, -+ md->sigmaA, md->sigmaB, -+ md->sigma3A, md->sigma3B, -+ md->nChargePerturbed || md->nTypePerturbed, -+ ir->cutoff_scheme != ecutsVERLET, -+ excl, x, bSB ? 
boxs : box, mu_tot, -+ ir->ewald_geometry, -+ ir->epsilon_surface, -+ fnv, *vir_q, *vir_lj, -+ Vcorrt_q, Vcorrt_lj, -+ lambda[efptCOUL], lambda[efptVDW], -+ dvdlt_q, dvdlt_lj); -+ } -+ if (nthreads > 1) -+ { -+ reduce_thread_forces(fr->natoms_force, fr->f_novirsum, -+ fr->vir_el_recip, fr->vir_lj_recip, -+ &Vcorr_q, &Vcorr_lj, -+ &dvdl_long_range_correction_q, -+ &dvdl_long_range_correction_lj, -+ nthreads, fr->f_t); -+ } -+ wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); -+ } -+ -+ if (EEL_PME_EWALD(fr->eeltype) && fr->n_tpi == 0) -+ { -+ Vcorr_q += ewald_charge_correction(cr, fr, lambda[efptCOUL], box, -+ &dvdl_long_range_correction_q, -+ fr->vir_el_recip); -+ } -+ -+ PRINT_SEPDVDL("Ewald excl./charge/dip. corr.", Vcorr_q, dvdl_long_range_correction_q); -+ PRINT_SEPDVDL("Ewald excl. corr. LJ", Vcorr_lj, dvdl_long_range_correction_lj); -+ enerd->dvdl_lin[efptCOUL] += dvdl_long_range_correction_q; -+ enerd->dvdl_lin[efptVDW] += dvdl_long_range_correction_lj; -+ } -+ -+ if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))) -+ { -+ if (cr->duty & DUTY_PME) -+ { -+ /* Do reciprocal PME for Coulomb and/or LJ. */ -+ assert(fr->n_tpi >= 0); -+ if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) -+ { -+ pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE; -+ if (EEL_PME(fr->eeltype)) -+ { -+ pme_flags |= GMX_PME_DO_COULOMB; -+ } -+ if (EVDW_PME(fr->vdwtype)) -+ { -+ pme_flags |= GMX_PME_DO_LJ; -+ } -+ if (flags & GMX_FORCE_FORCES) -+ { -+ pme_flags |= GMX_PME_CALC_F; -+ } -+ if (flags & GMX_FORCE_VIRIAL) -+ { -+ pme_flags |= GMX_PME_CALC_ENER_VIR; -+ } -+ if (fr->n_tpi > 0) -+ { -+ /* We don't calculate f, but we do want the potential */ -+ pme_flags |= GMX_PME_CALC_POT; -+ } -+ wallcycle_start(wcycle, ewcPMEMESH); -+ status = gmx_pme_do(fr->pmedata, -+ 0, md->homenr - fr->n_tpi, -+ x, fr->f_novirsum, -+ md->chargeA, md->chargeB, -+ md->sqrt_c6A, md->sqrt_c6B, -+ md->sigmaA, md->sigmaB, -+ bSB ? boxs : box, cr, -+ DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0, -+ DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0, -+ nrnb, wcycle, -+ fr->vir_el_recip, fr->ewaldcoeff_q, -+ fr->vir_lj_recip, fr->ewaldcoeff_lj, -+ &Vlr_q, &Vlr_lj, -+ lambda[efptCOUL], lambda[efptVDW], -+ &dvdl_long_range_q, &dvdl_long_range_lj, pme_flags); -+ *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH); -+ if (status != 0) -+ { -+ gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status); -+ } -+ /* We should try to do as little computation after -+ * this as possible, because parallel PME synchronizes -+ * the nodes, so we want all load imbalance of the -+ * rest of the force calculation to be before the PME -+ * call. DD load balancing is done on the whole time -+ * of the force call (without PME). -+ */ -+ } -+ if (fr->n_tpi > 0) -+ { -+ if (EVDW_PME(ir->vdwtype)) -+ { -+ -+ gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME"); -+ } -+ /* Determine the PME grid energy of the test molecule -+ * with the PME grid potential of the other charges. 
-+ */ -+ gmx_pme_calc_energy(fr->pmedata, fr->n_tpi, -+ x + md->homenr - fr->n_tpi, -+ md->chargeA + md->homenr - fr->n_tpi, -+ &Vlr_q); -+ } -+ PRINT_SEPDVDL("PME mesh", Vlr_q + Vlr_lj, dvdl_long_range_q+dvdl_long_range_lj); -+ } -+ } -+ -+ if (!EEL_PME(fr->eeltype) && EEL_PME_EWALD(fr->eeltype)) -+ { -+ Vlr_q = do_ewald(ir, x, fr->f_novirsum, -+ md->chargeA, md->chargeB, -+ box_size, cr, md->homenr, -+ fr->vir_el_recip, fr->ewaldcoeff_q, -+ lambda[efptCOUL], &dvdl_long_range_q, fr->ewald_table); -+ PRINT_SEPDVDL("Ewald long-range", Vlr_q, dvdl_long_range_q); -+ } -+ -+ /* Note that with separate PME nodes we get the real energies later */ -+ enerd->dvdl_lin[efptCOUL] += dvdl_long_range_q; -+ enerd->dvdl_lin[efptVDW] += dvdl_long_range_lj; -+ enerd->term[F_COUL_RECIP] = Vlr_q + Vcorr_q; -+ enerd->term[F_LJ_RECIP] = Vlr_lj + Vcorr_lj; -+ if (debug) -+ { -+ fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n", -+ Vlr_q, Vcorr_q, enerd->term[F_COUL_RECIP]); -+ pr_rvecs(debug, 0, "vir_el_recip after corr", fr->vir_el_recip, DIM); -+ pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS); -+ fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n", -+ Vlr_lj, Vcorr_lj, enerd->term[F_LJ_RECIP]); -+ pr_rvecs(debug, 0, "vir_lj_recip after corr", fr->vir_lj_recip, DIM); -+ } -+ } -+ else -+ { -+ /* Is there a reaction-field exclusion correction needed? */ -+ if (EEL_RF(fr->eeltype) && eelRF_NEC != fr->eeltype) -+ { -+ /* With the Verlet scheme, exclusion forces are calculated -+ * in the non-bonded kernel. -+ */ -+ if (ir->cutoff_scheme != ecutsVERLET) -+ { -+ real dvdl_rf_excl = 0; -+ enerd->term[F_RF_EXCL] = -+ RF_excl_correction(fr, graph, md, excl, x, f, -+ fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl); -+ -+ enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl; -+ PRINT_SEPDVDL("RF exclusion correction", -+ enerd->term[F_RF_EXCL], dvdl_rf_excl); -+ } -+ } -+ } -+ where(); -+ debug_gmx(); -+ -+ if (debug) -+ { -+ print_nrnb(debug, nrnb); -+ } -+ debug_gmx(); -+ -+#ifdef GMX_MPI -+ if (TAKETIME) -+ { -+ t2 = MPI_Wtime(); -+ MPI_Barrier(cr->mpi_comm_mygroup); -+ t3 = MPI_Wtime(); -+ fr->t_wait += t3-t2; -+ if (fr->timesteps == 11) -+ { -+ fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n", -+ cr->nodeid, gmx_step_str(fr->timesteps, buf), -+ 100*fr->t_wait/(fr->t_wait+fr->t_fnbf), -+ (fr->t_fnbf+fr->t_wait)/fr->t_fnbf); -+ } -+ fr->timesteps++; -+ } -+#endif -+ -+ if (debug) -+ { -+ pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); -+ } -+ -+} -+ -+void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) -+{ -+ int i, n2; -+ -+ for (i = 0; i < F_NRE; i++) -+ { -+ enerd->term[i] = 0; -+ enerd->foreign_term[i] = 0; -+ } -+ -+ -+ for (i = 0; i < efptNR; i++) -+ { -+ enerd->dvdl_lin[i] = 0; -+ enerd->dvdl_nonlin[i] = 0; -+ } -+ -+ n2 = ngener*ngener; -+ if (debug) -+ { -+ fprintf(debug, "Creating %d sized group matrix for energies\n", n2); -+ } -+ enerd->grpp.nener = n2; -+ enerd->foreign_grpp.nener = n2; -+ for (i = 0; (i < egNR); i++) -+ { -+ snew(enerd->grpp.ener[i], n2); -+ snew(enerd->foreign_grpp.ener[i], n2); -+ } -+ -+ if (n_lambda) -+ { -+ enerd->n_lambda = 1 + n_lambda; -+ snew(enerd->enerpart_lambda, enerd->n_lambda); -+ } -+ else -+ { -+ enerd->n_lambda = 0; -+ } -+} -+ -+void destroy_enerdata(gmx_enerdata_t *enerd) -+{ -+ int i; -+ -+ for (i = 0; (i < egNR); i++) -+ { -+ sfree(enerd->grpp.ener[i]); -+ } -+ -+ for (i = 0; (i < egNR); i++) -+ { -+ 
sfree(enerd->foreign_grpp.ener[i]); -+ } -+ -+ if (enerd->n_lambda) -+ { -+ sfree(enerd->enerpart_lambda); -+ } -+} -+ -+static real sum_v(int n, real v[]) -+{ -+ real t; -+ int i; -+ -+ t = 0.0; -+ for (i = 0; (i < n); i++) -+ { -+ t = t + v[i]; -+ } -+ -+ return t; -+} -+ -+void sum_epot(gmx_grppairener_t *grpp, real *epot) -+{ -+ int i; -+ -+ /* Accumulate energies */ -+ epot[F_COUL_SR] = sum_v(grpp->nener, grpp->ener[egCOULSR]); -+ epot[F_LJ] = sum_v(grpp->nener, grpp->ener[egLJSR]); -+ epot[F_LJ14] = sum_v(grpp->nener, grpp->ener[egLJ14]); -+ epot[F_COUL14] = sum_v(grpp->nener, grpp->ener[egCOUL14]); -+ epot[F_COUL_LR] = sum_v(grpp->nener, grpp->ener[egCOULLR]); -+ epot[F_LJ_LR] = sum_v(grpp->nener, grpp->ener[egLJLR]); -+ /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */ -+ epot[F_GBPOL] += sum_v(grpp->nener, grpp->ener[egGB]); -+ -+/* lattice part of LR doesnt belong to any group -+ * and has been added earlier -+ */ -+ epot[F_BHAM] = sum_v(grpp->nener, grpp->ener[egBHAMSR]); -+ epot[F_BHAM_LR] = sum_v(grpp->nener, grpp->ener[egBHAMLR]); -+ -+ epot[F_EPOT] = 0; -+ for (i = 0; (i < F_EPOT); i++) -+ { -+ if (i != F_DISRESVIOL && i != F_ORIRESDEV) -+ { -+ epot[F_EPOT] += epot[i]; -+ } -+ } -+} -+ -+void sum_dhdl(gmx_enerdata_t *enerd, real *lambda, t_lambda *fepvals) -+{ -+ int i, j, index; -+ double dlam; -+ -+ enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW]; /* include dispersion correction */ -+ enerd->term[F_DVDL] = 0.0; -+ for (i = 0; i < efptNR; i++) -+ { -+ if (fepvals->separate_dvdl[i]) -+ { -+ /* could this be done more readably/compactly? */ -+ switch (i) -+ { -+ case (efptMASS): -+ index = F_DKDL; -+ break; -+ case (efptCOUL): -+ index = F_DVDL_COUL; -+ break; -+ case (efptVDW): -+ index = F_DVDL_VDW; -+ break; -+ case (efptBONDED): -+ index = F_DVDL_BONDED; -+ break; -+ case (efptRESTRAINT): -+ index = F_DVDL_RESTRAINT; -+ break; -+ default: -+ index = F_DVDL; -+ break; -+ } -+ enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; -+ if (debug) -+ { -+ fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n", -+ efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); -+ } -+ } -+ else -+ { -+ enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; -+ if (debug) -+ { -+ fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n", -+ efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); -+ } -+ } -+ } -+ -+ /* Notes on the foreign lambda free energy difference evaluation: -+ * Adding the potential and ekin terms that depend linearly on lambda -+ * as delta lam * dvdl to the energy differences is exact. -+ * For the constraints this is not exact, but we have no other option -+ * without literally changing the lengths and reevaluating the energies at each step. -+ * (try to remedy this post 4.6 - MRS) -+ * For the non-bonded LR term we assume that the soft-core (if present) -+ * no longer affects the energy beyond the short-range cut-off, -+ * which is a very good approximation (except for exotic settings). -+ * (investigate how to overcome this post 4.6 - MRS) -+ */ -+ if (fepvals->separate_dvdl[efptBONDED]) -+ { -+ enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR]; -+ } -+ else -+ { -+ enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR]; -+ } -+ enerd->term[F_DVDL_CONSTR] = 0; -+ -+ for (i = 0; i < fepvals->n_lambda; i++) -+ { -+ /* note we are iterating over fepvals here! 
-+ For the current lam, dlam = 0 automatically, -+ so we don't need to add anything to the -+ enerd->enerpart_lambda[0] */ -+ -+ /* we don't need to worry about dvdl_lin contributions to dE at -+ current lambda, because the contributions to the current -+ lambda are automatically zeroed */ -+ -+ for (j = 0; j < efptNR; j++) -+ { -+ /* Note that this loop is over all dhdl components, not just the separated ones */ -+ dlam = (fepvals->all_lambda[j][i]-lambda[j]); -+ enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j]; -+ if (debug) -+ { -+ fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n", -+ fepvals->all_lambda[j][i], efpt_names[j], -+ (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]), -+ dlam, enerd->dvdl_lin[j]); -+ } -+ } -+ } -+} -+ -+ -+void reset_foreign_enerdata(gmx_enerdata_t *enerd) -+{ -+ int i, j; -+ -+ /* First reset all foreign energy components. Foreign energies always called on -+ neighbor search steps */ -+ for (i = 0; (i < egNR); i++) -+ { -+ for (j = 0; (j < enerd->grpp.nener); j++) -+ { -+ enerd->foreign_grpp.ener[i][j] = 0.0; -+ } -+ } -+ -+ /* potential energy components */ -+ for (i = 0; (i <= F_EPOT); i++) -+ { -+ enerd->foreign_term[i] = 0.0; -+ } -+} -+ -+void reset_enerdata(t_forcerec *fr, gmx_bool bNS, -+ gmx_enerdata_t *enerd, -+ gmx_bool bMaster) -+{ -+ gmx_bool bKeepLR; -+ int i, j; -+ -+ /* First reset all energy components, except for the long range terms -+ * on the master at non neighbor search steps, since the long range -+ * terms have already been summed at the last neighbor search step. -+ */ -+ bKeepLR = (fr->bTwinRange && !bNS); -+ for (i = 0; (i < egNR); i++) -+ { -+ if (!(bKeepLR && bMaster && (i == egCOULLR || i == egLJLR))) -+ { -+ for (j = 0; (j < enerd->grpp.nener); j++) -+ { -+ enerd->grpp.ener[i][j] = 0.0; -+ } -+ } -+ } -+ for (i = 0; i < efptNR; i++) -+ { -+ enerd->dvdl_lin[i] = 0.0; -+ enerd->dvdl_nonlin[i] = 0.0; -+ } -+ -+ /* Normal potential energy components */ -+ for (i = 0; (i <= F_EPOT); i++) -+ { -+ enerd->term[i] = 0.0; -+ } -+ /* Initialize the dVdlambda term with the long range contribution */ -+ /* Initialize the dvdl term with the long range contribution */ -+ enerd->term[F_DVDL] = 0.0; -+ enerd->term[F_DVDL_COUL] = 0.0; -+ enerd->term[F_DVDL_VDW] = 0.0; -+ enerd->term[F_DVDL_BONDED] = 0.0; -+ enerd->term[F_DVDL_RESTRAINT] = 0.0; -+ enerd->term[F_DKDL] = 0.0; -+ if (enerd->n_lambda > 0) -+ { -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ enerd->enerpart_lambda[i] = 0.0; -+ } -+ } -+ /* reset foreign energy data - separate function since we also call it elsewhere */ -+ reset_foreign_enerdata(enerd); -+} -diff --git a/src/gromacs/mdlib/minimize.c b/src/gromacs/mdlib/minimize.c -index 69008f5..5114fa0 100644 ---- a/src/gromacs/mdlib/minimize.c -+++ b/src/gromacs/mdlib/minimize.c -@@ -80,6 +80,13 @@ - #include "gromacs/timing/walltime_accounting.h" - #include "gromacs/imd/imd.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+extern void(*plumedcmd)(plumed,const char*,const void*); -+/* END PLUMED */ -+ - typedef struct { - t_state s; - rvec *f; -@@ -442,6 +449,43 @@ void init_em(FILE *fplog, const char *title, - - clear_rvec(mu_tot); - calc_shifts(ems->s.box, fr->shift_vec); -+ -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(cr->ms && cr->ms->nsim>1) { -+ if(MASTER(cr)) (*plumedcmd) (plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"GREX 
setMPIIntracomm",&cr->dd->mpi_comm_all); -+ }else{ -+ (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); -+ } -+ } -+ (*plumedcmd) (plumedmain,"GREX init",NULL); -+ } -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); -+ }else{ -+ (*plumedcmd) (plumedmain,"setMPIComm",&cr->mpi_comm_mysim); -+ } -+ } -+ (*plumedcmd) (plumedmain,"setNatoms",&top_global->natoms); -+ (*plumedcmd) (plumedmain,"setMDEngine","gromacs"); -+ (*plumedcmd) (plumedmain,"setLog",fplog); -+ real real_delta_t; -+ real_delta_t=ir->delta_t; -+ (*plumedcmd) (plumedmain,"setTimestep",&real_delta_t); -+ (*plumedcmd) (plumedmain,"init",NULL); -+ -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ } -+ } -+ /* END PLUMED */ - } - - static void finish_em(t_commrec *cr, gmx_mdoutf_t outf, -@@ -737,12 +781,34 @@ static void evaluate_energy(FILE *fplog, t_commrec *cr, - em_dd_partition_system(fplog, count, cr, top_global, inputrec, - ems, top, mdatoms, fr, vsite, constr, - nrnb, wcycle); -+ /* PLUMED */ -+ if(plumedswitch){ -+ (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ /* END PLUMED */ - } - - /* Calc force & energy on new trial position */ - /* do_force always puts the charge groups in the box and shifts again - * We do not unshift, so molecules are always whole in congrad.c - */ -+ /* PLUMED */ -+ int plumedNeedsEnergy=0; -+ matrix plumed_vir; -+ if(plumedswitch){ -+ long int lstep=count; (*plumedcmd)(plumedmain,"setStepLong",&count); -+ (*plumedcmd) (plumedmain,"setPositions",&ems->s.x[0][0]); -+ (*plumedcmd) (plumedmain,"setMasses",&mdatoms->massT[0]); -+ (*plumedcmd) (plumedmain,"setCharges",&mdatoms->chargeA[0]); -+ (*plumedcmd) (plumedmain,"setBox",&ems->s.box[0][0]); -+ (*plumedcmd) (plumedmain,"prepareCalc",NULL); -+ (*plumedcmd) (plumedmain,"setForces",&ems->f[0][0]); -+ (*plumedcmd) (plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ clear_mat(plumed_vir); -+ (*plumedcmd) (plumedmain,"setVirial",&plumed_vir[0][0]); -+ } -+ /* END PLUMED */ - do_force(fplog, cr, inputrec, - count, nrnb, wcycle, top, &top_global->groups, - ems->s.box, ems->s.x, &ems->s.hist, -@@ -751,6 +817,19 @@ static void evaluate_energy(FILE *fplog, t_commrec *cr, - GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | - GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | - (bNS ? GMX_FORCE_NS | GMX_FORCE_DO_LR : 0)); -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(plumedNeedsEnergy) { -+ msmul(force_vir,2.0,plumed_vir); -+ (*plumedcmd) (plumedmain,"setEnergy",&enerd->term[F_EPOT]); -+ (*plumedcmd) (plumedmain,"performCalc",NULL); -+ msmul(plumed_vir,0.5,force_vir); -+ } else { -+ msmul(plumed_vir,0.5,plumed_vir); -+ m_add(force_vir,plumed_vir,force_vir); -+ } -+ } -+ /* END PLUMED */ - - /* Clear the unused shake virial and pressure */ - clear_mat(shake_vir); -diff --git a/src/gromacs/mdlib/minimize.c.preplumed b/src/gromacs/mdlib/minimize.c.preplumed -new file mode 100644 -index 0000000..69008f5 ---- /dev/null -+++ b/src/gromacs/mdlib/minimize.c.preplumed -@@ -0,0 +1,2906 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. 
-+ * Copyright (c) 2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#ifdef HAVE_CONFIG_H -+#include <config.h> -+#endif -+ -+#include <string.h> -+#include <time.h> -+#include <math.h> -+#include "sysstuff.h" -+#include "gromacs/utility/cstringutil.h" -+#include "network.h" -+#include "gromacs/utility/smalloc.h" -+#include "nrnb.h" -+#include "main.h" -+#include "force.h" -+#include "macros.h" -+#include "names.h" -+#include "gmx_fatal.h" -+#include "txtdump.h" -+#include "typedefs.h" -+#include "update.h" -+#include "constr.h" -+#include "vec.h" -+#include "tgroup.h" -+#include "mdebin.h" -+#include "vsite.h" -+#include "force.h" -+#include "mdrun.h" -+#include "md_support.h" -+#include "sim_util.h" -+#include "domdec.h" -+#include "mdatoms.h" -+#include "ns.h" -+#include "mtop_util.h" -+#include "pme.h" -+#include "bondf.h" -+#include "gmx_omp_nthreads.h" -+#include "md_logging.h" -+ -+#include "gromacs/fileio/confio.h" -+#include "gromacs/fileio/trajectory_writing.h" -+#include "gromacs/linearalgebra/mtxio.h" -+#include "gromacs/linearalgebra/sparsematrix.h" -+#include "gromacs/timing/wallcycle.h" -+#include "gromacs/timing/walltime_accounting.h" -+#include "gromacs/imd/imd.h" -+ -+typedef struct { -+ t_state s; -+ rvec *f; -+ real epot; -+ real fnorm; -+ real fmax; -+ int a_fmax; -+} em_state_t; -+ -+static em_state_t *init_em_state() -+{ -+ em_state_t *ems; -+ -+ snew(ems, 1); -+ -+ /* does this need to be here? Should the array be declared differently (staticaly)in the state definition?
*/ -+ snew(ems->s.lambda, efptNR); -+ -+ return ems; -+} -+ -+static void print_em_start(FILE *fplog, -+ t_commrec *cr, -+ gmx_walltime_accounting_t walltime_accounting, -+ gmx_wallcycle_t wcycle, -+ const char *name) -+{ -+ walltime_accounting_start(walltime_accounting); -+ wallcycle_start(wcycle, ewcRUN); -+ print_start(fplog, cr, walltime_accounting, name); -+} -+static void em_time_end(gmx_walltime_accounting_t walltime_accounting, -+ gmx_wallcycle_t wcycle) -+{ -+ wallcycle_stop(wcycle, ewcRUN); -+ -+ walltime_accounting_end(walltime_accounting); -+} -+ -+static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) -+{ -+ fprintf(out, "\n"); -+ fprintf(out, "%s:\n", minimizer); -+ fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); -+ fprintf(out, " Number of steps = %12d\n", nsteps); -+} -+ -+static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain) -+{ -+ char buffer[2048]; -+ if (bLastStep) -+ { -+ sprintf(buffer, -+ "\nEnergy minimization reached the maximum number " -+ "of steps before the forces reached the requested " -+ "precision Fmax < %g.\n", ftol); -+ } -+ else -+ { -+ sprintf(buffer, -+ "\nEnergy minimization has stopped, but the forces have " -+ "not converged to the requested precision Fmax < %g (which " -+ "may not be possible for your system). It stopped " -+ "because the algorithm tried to make a new step whose size " -+ "was too small, or there was no change in the energy since " -+ "last step. Either way, we regard the minimization as " -+ "converged to within the available machine precision, " -+ "given your starting configuration and EM parameters.\n%s%s", -+ ftol, -+ sizeof(real) < sizeof(double) ? -+ "\nDouble precision normally gives you higher accuracy, but " -+ "this is often not needed for preparing to run molecular " -+ "dynamics.\n" : -+ "", -+ bConstrain ? -+ "You might need to increase your constraint accuracy, or turn\n" -+ "off constraints altogether (set constraints = none in mdp file)\n" : -+ ""); -+ } -+ fputs(wrap_lines(buffer, 78, 0, FALSE), fp); -+} -+ -+ -+ -+static void print_converged(FILE *fp, const char *alg, real ftol, -+ gmx_int64_t count, gmx_bool bDone, gmx_int64_t nsteps, -+ real epot, real fmax, int nfmax, real fnorm) -+{ -+ char buf[STEPSTRSIZE]; -+ -+ if (bDone) -+ { -+ fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", -+ alg, ftol, gmx_step_str(count, buf)); -+ } -+ else if (count < nsteps) -+ { -+ fprintf(fp, "\n%s converged to machine precision in %s steps,\n" -+ "but did not reach the requested Fmax < %g.\n", -+ alg, gmx_step_str(count, buf), ftol); -+ } -+ else -+ { -+ fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", -+ alg, ftol, gmx_step_str(count, buf)); -+ } -+ -+#ifdef GMX_DOUBLE -+ fprintf(fp, "Potential Energy = %21.14e\n", epot); -+ fprintf(fp, "Maximum force = %21.14e on atom %d\n", fmax, nfmax+1); -+ fprintf(fp, "Norm of force = %21.14e\n", fnorm); -+#else -+ fprintf(fp, "Potential Energy = %14.7e\n", epot); -+ fprintf(fp, "Maximum force = %14.7e on atom %d\n", fmax, nfmax+1); -+ fprintf(fp, "Norm of force = %14.7e\n", fnorm); -+#endif -+} -+ -+static void get_f_norm_max(t_commrec *cr, -+ t_grpopts *opts, t_mdatoms *mdatoms, rvec *f, -+ real *fnorm, real *fmax, int *a_fmax) -+{ -+ double fnorm2, *sum; -+ real fmax2, fmax2_0, fam; -+ int la_max, a_max, start, end, i, m, gf; -+ -+ /* This routine finds the largest force and returns it. -+ * On parallel machines the global max is taken. 
-+ */ -+ fnorm2 = 0; -+ fmax2 = 0; -+ la_max = -1; -+ gf = 0; -+ start = 0; -+ end = mdatoms->homenr; -+ if (mdatoms->cFREEZE) -+ { -+ for (i = start; i < end; i++) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ fam = 0; -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ fam += sqr(f[i][m]); -+ } -+ } -+ fnorm2 += fam; -+ if (fam > fmax2) -+ { -+ fmax2 = fam; -+ la_max = i; -+ } -+ } -+ } -+ else -+ { -+ for (i = start; i < end; i++) -+ { -+ fam = norm2(f[i]); -+ fnorm2 += fam; -+ if (fam > fmax2) -+ { -+ fmax2 = fam; -+ la_max = i; -+ } -+ } -+ } -+ -+ if (la_max >= 0 && DOMAINDECOMP(cr)) -+ { -+ a_max = cr->dd->gatindex[la_max]; -+ } -+ else -+ { -+ a_max = la_max; -+ } -+ if (PAR(cr)) -+ { -+ snew(sum, 2*cr->nnodes+1); -+ sum[2*cr->nodeid] = fmax2; -+ sum[2*cr->nodeid+1] = a_max; -+ sum[2*cr->nnodes] = fnorm2; -+ gmx_sumd(2*cr->nnodes+1, sum, cr); -+ fnorm2 = sum[2*cr->nnodes]; -+ /* Determine the global maximum */ -+ for (i = 0; i < cr->nnodes; i++) -+ { -+ if (sum[2*i] > fmax2) -+ { -+ fmax2 = sum[2*i]; -+ a_max = (int)(sum[2*i+1] + 0.5); -+ } -+ } -+ sfree(sum); -+ } -+ -+ if (fnorm) -+ { -+ *fnorm = sqrt(fnorm2); -+ } -+ if (fmax) -+ { -+ *fmax = sqrt(fmax2); -+ } -+ if (a_fmax) -+ { -+ *a_fmax = a_max; -+ } -+} -+ -+static void get_state_f_norm_max(t_commrec *cr, -+ t_grpopts *opts, t_mdatoms *mdatoms, -+ em_state_t *ems) -+{ -+ get_f_norm_max(cr, opts, mdatoms, ems->f, &ems->fnorm, &ems->fmax, &ems->a_fmax); -+} -+ -+void init_em(FILE *fplog, const char *title, -+ t_commrec *cr, t_inputrec *ir, -+ t_state *state_global, gmx_mtop_t *top_global, -+ em_state_t *ems, gmx_localtop_t **top, -+ rvec **f, rvec **f_global, -+ t_nrnb *nrnb, rvec mu_tot, -+ t_forcerec *fr, gmx_enerdata_t **enerd, -+ t_graph **graph, t_mdatoms *mdatoms, gmx_global_stat_t *gstat, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int nfile, const t_filenm fnm[], -+ gmx_mdoutf_t *outf, t_mdebin **mdebin, -+ int imdport, unsigned long gmx_unused Flags, -+ gmx_wallcycle_t wcycle) -+{ -+ int i; -+ real dvdl_constr; -+ -+ if (fplog) -+ { -+ fprintf(fplog, "Initiating %s\n", title); -+ } -+ -+ state_global->ngtc = 0; -+ -+ /* Initialize lambda variables */ -+ initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, NULL); -+ -+ init_nrnb(nrnb); -+ -+ /* Interactive molecular dynamics */ -+ init_IMD(ir, cr, top_global, fplog, 1, state_global->x, -+ nfile, fnm, NULL, imdport, Flags); -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ *top = dd_init_local_top(top_global); -+ -+ dd_init_local_state(cr->dd, state_global, &ems->s); -+ -+ *f = NULL; -+ -+ /* Distribute the charge groups over the nodes from the master node */ -+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ &ems->s, &ems->f, mdatoms, *top, -+ fr, vsite, NULL, constr, -+ nrnb, NULL, FALSE); -+ dd_store_state(cr->dd, &ems->s); -+ -+ if (ir->nstfout) -+ { -+ snew(*f_global, top_global->natoms); -+ } -+ else -+ { -+ *f_global = NULL; -+ } -+ *graph = NULL; -+ } -+ else -+ { -+ snew(*f, top_global->natoms); -+ -+ /* Just copy the state */ -+ ems->s = *state_global; -+ snew(ems->s.x, ems->s.nalloc); -+ snew(ems->f, ems->s.nalloc); -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(state_global->x[i], ems->s.x[i]); -+ } -+ copy_mat(state_global->box, ems->s.box); -+ -+ *top = gmx_mtop_generate_local_top(top_global, ir); -+ *f_global = *f; -+ -+ forcerec_set_excl_load(fr, *top); -+ -+ setup_bonded_threading(fr, &(*top)->idef); -+ -+ if (ir->ePBC != epbcNONE && !fr->bMolPBC) -+ { -+ *graph = 
mk_graph(fplog, &((*top)->idef), 0, top_global->natoms, FALSE, FALSE); -+ } -+ else -+ { -+ *graph = NULL; -+ } -+ -+ atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms); -+ update_mdatoms(mdatoms, state_global->lambda[efptFEP]); -+ -+ if (vsite) -+ { -+ set_vsite_top(vsite, *top, mdatoms, cr); -+ } -+ } -+ -+ if (constr) -+ { -+ if (ir->eConstrAlg == econtSHAKE && -+ gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) -+ { -+ gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", -+ econstr_names[econtSHAKE], econstr_names[econtLINCS]); -+ } -+ -+ if (!DOMAINDECOMP(cr)) -+ { -+ set_constraints(constr, *top, ir, mdatoms, cr); -+ } -+ -+ if (!ir->bContinuation) -+ { -+ /* Constrain the starting coordinates */ -+ dvdl_constr = 0; -+ constrain(PAR(cr) ? NULL : fplog, TRUE, TRUE, constr, &(*top)->idef, -+ ir, NULL, cr, -1, 0, 1.0, mdatoms, -+ ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box, -+ ems->s.lambda[efptFEP], &dvdl_constr, -+ NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); -+ } -+ } -+ -+ if (PAR(cr)) -+ { -+ *gstat = global_stat_init(ir); -+ } -+ -+ *outf = init_mdoutf(fplog, nfile, fnm, 0, cr, ir, top_global, NULL, wcycle); -+ -+ snew(*enerd, 1); -+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, -+ *enerd); -+ -+ if (mdebin != NULL) -+ { -+ /* Init bin for energy stuff */ -+ *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, NULL); -+ } -+ -+ clear_rvec(mu_tot); -+ calc_shifts(ems->s.box, fr->shift_vec); -+} -+ -+static void finish_em(t_commrec *cr, gmx_mdoutf_t outf, -+ gmx_walltime_accounting_t walltime_accounting, -+ gmx_wallcycle_t wcycle) -+{ -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell the PME only node to finish */ -+ gmx_pme_send_finish(cr); -+ } -+ -+ done_mdoutf(outf); -+ -+ em_time_end(walltime_accounting, wcycle); -+} -+ -+static void swap_em_state(em_state_t *ems1, em_state_t *ems2) -+{ -+ em_state_t tmp; -+ -+ tmp = *ems1; -+ *ems1 = *ems2; -+ *ems2 = tmp; -+} -+ -+static void copy_em_coords(em_state_t *ems, t_state *state) -+{ -+ int i; -+ -+ for (i = 0; (i < state->natoms); i++) -+ { -+ copy_rvec(ems->s.x[i], state->x[i]); -+ } -+} -+ -+static void write_em_traj(FILE *fplog, t_commrec *cr, -+ gmx_mdoutf_t outf, -+ gmx_bool bX, gmx_bool bF, const char *confout, -+ gmx_mtop_t *top_global, -+ t_inputrec *ir, gmx_int64_t step, -+ em_state_t *state, -+ t_state *state_global, rvec *f_global) -+{ -+ int mdof_flags; -+ gmx_bool bIMDout = FALSE; -+ -+ -+ /* Shall we do IMD output? 
*/ -+ if (ir->bIMD) -+ { -+ bIMDout = do_per_step(step, IMD_get_step(ir->imd->setup)); -+ } -+ -+ if ((bX || bF || bIMDout || confout != NULL) && !DOMAINDECOMP(cr)) -+ { -+ copy_em_coords(state, state_global); -+ f_global = state->f; -+ } -+ -+ mdof_flags = 0; -+ if (bX) -+ { -+ mdof_flags |= MDOF_X; -+ } -+ if (bF) -+ { -+ mdof_flags |= MDOF_F; -+ } -+ -+ /* If we want IMD output, set appropriate MDOF flag */ -+ if (ir->bIMD) -+ { -+ mdof_flags |= MDOF_IMD; -+ } -+ -+ mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, -+ top_global, step, (double)step, -+ &state->s, state_global, state->f, f_global); -+ -+ if (confout != NULL && MASTER(cr)) -+ { -+ if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) -+ { -+ /* Make molecules whole only for confout writing */ -+ do_pbc_mtop(fplog, ir->ePBC, state_global->box, top_global, -+ state_global->x); -+ } -+ -+ write_sto_conf_mtop(confout, -+ *top_global->name, top_global, -+ state_global->x, NULL, ir->ePBC, state_global->box); -+ } -+} -+ -+static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, -+ gmx_bool bMolPBC, -+ em_state_t *ems1, real a, rvec *f, em_state_t *ems2, -+ gmx_constr_t constr, gmx_localtop_t *top, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_int64_t count) -+ -+{ -+ t_state *s1, *s2; -+ int i; -+ int start, end; -+ rvec *x1, *x2; -+ real dvdl_constr; -+ int nthreads gmx_unused; -+ -+ s1 = &ems1->s; -+ s2 = &ems2->s; -+ -+ if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) -+ { -+ gmx_incons("state mismatch in do_em_step"); -+ } -+ -+ s2->flags = s1->flags; -+ -+ if (s2->nalloc != s1->nalloc) -+ { -+ s2->nalloc = s1->nalloc; -+ srenew(s2->x, s1->nalloc); -+ srenew(ems2->f, s1->nalloc); -+ if (s2->flags & (1<<estCGP)) -+ { -+ srenew(s2->cg_p, s1->nalloc); -+ } -+ } -+ -+ s2->natoms = s1->natoms; -+ copy_mat(s1->box, s2->box); -+ /* Copy free energy state */ -+ for (i = 0; i < efptNR; i++) -+ { -+ s2->lambda[i] = s1->lambda[i]; -+ } -+ copy_mat(s1->box, s2->box); -+ -+ start = 0; -+ end = md->homenr; -+ -+ x1 = s1->x; -+ x2 = s2->x; -+ -+ nthreads = gmx_omp_nthreads_get(emntUpdate); -+#pragma omp parallel num_threads(nthreads) -+ { -+ int gf, i, m; -+ -+ gf = 0; -+#pragma omp for schedule(static) nowait -+ for (i = start; i < end; i++) -+ { -+ if (md->cFREEZE) -+ { -+ gf = md->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (ir->opts.nFreeze[gf][m]) -+ { -+ x2[i][m] = x1[i][m]; -+ } -+ else -+ { -+ x2[i][m] = x1[i][m] + a*f[i][m]; -+ } -+ } -+ } -+ -+ if (s2->flags & (1<<estCGP)) -+ { -+ x1 = s1->cg_p; -+ x2 = s2->cg_p; -+#pragma omp for schedule(static) nowait -+ for (i = start; i < end; i++) -+ { -+ copy_rvec(x1[i], x2[i]); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ s2->ddp_count = s1->ddp_count; -+ if (s2->cg_gl_nalloc < s1->cg_gl_nalloc) -+ { -+#pragma omp barrier -+ s2->cg_gl_nalloc = s1->cg_gl_nalloc; -+ srenew(s2->cg_gl, s2->cg_gl_nalloc); -+#pragma omp barrier -+ } -+ s2->ncg_gl = s1->ncg_gl; -+#pragma omp for schedule(static) nowait -+ for (i = 0; i < s2->ncg_gl; i++) -+ { -+ s2->cg_gl[i] = s1->cg_gl[i]; -+ } -+ s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; -+ } -+ } -+ -+ if (constr) -+ { -+ wallcycle_start(wcycle, ewcCONSTR); -+ dvdl_constr = 0; -+ constrain(NULL, TRUE, TRUE, constr, &top->idef, -+ ir, NULL, cr, count, 0, 1.0, md, -+ s1->x, s2->x, NULL, bMolPBC, s2->box, -+ s2->lambda[efptBONDED], &dvdl_constr, -+ NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); -+ wallcycle_stop(wcycle, ewcCONSTR); -+ } -+} -+ -+static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr, -+ gmx_mtop_t *top_global, t_inputrec
*ir, -+ em_state_t *ems, gmx_localtop_t *top, -+ t_mdatoms *mdatoms, t_forcerec *fr, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle) -+{ -+ /* Repartition the domain decomposition */ -+ wallcycle_start(wcycle, ewcDOMDEC); -+ dd_partition_system(fplog, step, cr, FALSE, 1, -+ NULL, top_global, ir, -+ &ems->s, &ems->f, -+ mdatoms, top, fr, vsite, NULL, constr, -+ nrnb, wcycle, FALSE); -+ dd_store_state(cr->dd, &ems->s); -+ wallcycle_stop(wcycle, ewcDOMDEC); -+} -+ -+static void evaluate_energy(FILE *fplog, t_commrec *cr, -+ gmx_mtop_t *top_global, -+ em_state_t *ems, gmx_localtop_t *top, -+ t_inputrec *inputrec, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_global_stat_t gstat, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ t_fcdata *fcd, -+ t_graph *graph, t_mdatoms *mdatoms, -+ t_forcerec *fr, rvec mu_tot, -+ gmx_enerdata_t *enerd, tensor vir, tensor pres, -+ gmx_int64_t count, gmx_bool bFirst) -+{ -+ real t; -+ gmx_bool bNS; -+ int nabnsb; -+ tensor force_vir, shake_vir, ekin; -+ real dvdl_constr, prescorr, enercorr, dvdlcorr; -+ real terminate = 0; -+ -+ /* Set the time to the initial time, the time does not change during EM */ -+ t = inputrec->init_t; -+ -+ if (bFirst || -+ (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) -+ { -+ /* This is the first state or an old state used before the last ns */ -+ bNS = TRUE; -+ } -+ else -+ { -+ bNS = FALSE; -+ if (inputrec->nstlist > 0) -+ { -+ bNS = TRUE; -+ } -+ else if (inputrec->nstlist == -1) -+ { -+ nabnsb = natoms_beyond_ns_buffer(inputrec, fr, &top->cgs, NULL, ems->s.x); -+ if (PAR(cr)) -+ { -+ gmx_sumi(1, &nabnsb, cr); -+ } -+ bNS = (nabnsb > 0); -+ } -+ } -+ -+ if (vsite) -+ { -+ construct_vsites(vsite, ems->s.x, 1, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, ems->s.box); -+ } -+ -+ if (DOMAINDECOMP(cr) && bNS) -+ { -+ /* Repartition the domain decomposition */ -+ em_dd_partition_system(fplog, count, cr, top_global, inputrec, -+ ems, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Calc force & energy on new trial position */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole in congrad.c -+ */ -+ do_force(fplog, cr, inputrec, -+ count, nrnb, wcycle, top, &top_global->groups, -+ ems->s.box, ems->s.x, &ems->s.hist, -+ ems->f, force_vir, mdatoms, enerd, fcd, -+ ems->s.lambda, graph, fr, vsite, mu_tot, t, NULL, NULL, TRUE, -+ GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | -+ GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | -+ (bNS ? 
GMX_FORCE_NS | GMX_FORCE_DO_LR : 0)); -+ -+ /* Clear the unused shake virial and pressure */ -+ clear_mat(shake_vir); -+ clear_mat(pres); -+ -+ /* Communicate stuff when parallel */ -+ if (PAR(cr) && inputrec->eI != eiNM) -+ { -+ wallcycle_start(wcycle, ewcMoveE); -+ -+ global_stat(fplog, gstat, cr, enerd, force_vir, shake_vir, mu_tot, -+ inputrec, NULL, NULL, NULL, 1, &terminate, -+ top_global, &ems->s, FALSE, -+ CGLO_ENERGY | -+ CGLO_PRESSURE | -+ CGLO_CONSTRAINT | -+ CGLO_FIRSTITERATE); -+ -+ wallcycle_stop(wcycle, ewcMoveE); -+ } -+ -+ /* Calculate long range corrections to pressure and energy */ -+ calc_dispcorr(fplog, inputrec, fr, count, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW], -+ pres, force_vir, &prescorr, &enercorr, &dvdlcorr); -+ enerd->term[F_DISPCORR] = enercorr; -+ enerd->term[F_EPOT] += enercorr; -+ enerd->term[F_PRES] += prescorr; -+ enerd->term[F_DVDL] += dvdlcorr; -+ -+ ems->epot = enerd->term[F_EPOT]; -+ -+ if (constr) -+ { -+ /* Project out the constraint components of the force */ -+ wallcycle_start(wcycle, ewcCONSTR); -+ dvdl_constr = 0; -+ constrain(NULL, FALSE, FALSE, constr, &top->idef, -+ inputrec, NULL, cr, count, 0, 1.0, mdatoms, -+ ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box, -+ ems->s.lambda[efptBONDED], &dvdl_constr, -+ NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0); -+ if (fr->bSepDVDL && fplog) -+ { -+ gmx_print_sepdvdl(fplog, "Constraints", t, dvdl_constr); -+ } -+ enerd->term[F_DVDL_CONSTR] += dvdl_constr; -+ m_add(force_vir, shake_vir, vir); -+ wallcycle_stop(wcycle, ewcCONSTR); -+ } -+ else -+ { -+ copy_mat(force_vir, vir); -+ } -+ -+ clear_mat(ekin); -+ enerd->term[F_PRES] = -+ calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); -+ -+ sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); -+ -+ if (EI_ENERGY_MINIMIZATION(inputrec->eI)) -+ { -+ get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, ems); -+ } -+} -+ -+static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, -+ gmx_mtop_t *mtop, -+ em_state_t *s_min, em_state_t *s_b) -+{ -+ rvec *fm, *fb, *fmg; -+ t_block *cgs_gl; -+ int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; -+ double partsum; -+ unsigned char *grpnrFREEZE; -+ -+ if (debug) -+ { -+ fprintf(debug, "Doing reorder_partsum\n"); -+ } -+ -+ fm = s_min->f; -+ fb = s_b->f; -+ -+ cgs_gl = dd_charge_groups_global(cr->dd); -+ index = cgs_gl->index; -+ -+ /* Collect fm in a global vector fmg. -+ * This conflicts with the spirit of domain decomposition, -+ * but to fully optimize this a much more complicated algorithm is required. 
-+ */ -+ snew(fmg, mtop->natoms); -+ -+ ncg = s_min->s.ncg_gl; -+ cg_gl = s_min->s.cg_gl; -+ i = 0; -+ for (c = 0; c < ncg; c++) -+ { -+ cg = cg_gl[c]; -+ a0 = index[cg]; -+ a1 = index[cg+1]; -+ for (a = a0; a < a1; a++) -+ { -+ copy_rvec(fm[i], fmg[a]); -+ i++; -+ } -+ } -+ gmx_sum(mtop->natoms*3, fmg[0], cr); -+ -+ /* Now we will determine the part of the sum for the cgs in state s_b */ -+ ncg = s_b->s.ncg_gl; -+ cg_gl = s_b->s.cg_gl; -+ partsum = 0; -+ i = 0; -+ gf = 0; -+ grpnrFREEZE = mtop->groups.grpnr[egcFREEZE]; -+ for (c = 0; c < ncg; c++) -+ { -+ cg = cg_gl[c]; -+ a0 = index[cg]; -+ a1 = index[cg+1]; -+ for (a = a0; a < a1; a++) -+ { -+ if (mdatoms->cFREEZE && grpnrFREEZE) -+ { -+ gf = grpnrFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; -+ } -+ } -+ i++; -+ } -+ } -+ -+ sfree(fmg); -+ -+ return partsum; -+} -+ -+static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, -+ gmx_mtop_t *mtop, -+ em_state_t *s_min, em_state_t *s_b) -+{ -+ rvec *fm, *fb; -+ double sum; -+ int gf, i, m; -+ -+ /* This is just the classical Polak-Ribiere calculation of beta; -+ * it looks a bit complicated since we take freeze groups into account, -+ * and might have to sum it in parallel runs. -+ */ -+ -+ if (!DOMAINDECOMP(cr) || -+ (s_min->s.ddp_count == cr->dd->ddp_count && -+ s_b->s.ddp_count == cr->dd->ddp_count)) -+ { -+ fm = s_min->f; -+ fb = s_b->f; -+ sum = 0; -+ gf = 0; -+ /* This part of code can be incorrect with DD, -+ * since the atom ordering in s_b and s_min might differ. -+ */ -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ sum += (fb[i][m] - fm[i][m])*fb[i][m]; -+ } -+ } -+ } -+ } -+ else -+ { -+ /* We need to reorder cgs while summing */ -+ sum = reorder_partsum(cr, opts, mdatoms, mtop, s_min, s_b); -+ } -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &sum, cr); -+ } -+ -+ return sum/sqr(s_min->fnorm); -+} -+ -+double do_cg(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ const char *CG = "Polak-Ribiere Conjugate Gradients"; -+ -+ em_state_t *s_min, *s_a, *s_b, *s_c; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ rvec *f_global, *p, *sf, *sfm; -+ double gpa, gpb, gpc, tmp, sum[2], minstep; -+ real fnormn; -+ real stepsize; -+ real a, b, c, beta = 0.0; -+ real epot_repl = 0; -+ real pnorm; -+ t_mdebin *mdebin; -+ gmx_bool converged, foundlower; -+ rvec mu_tot; -+ gmx_bool do_log = FALSE, do_ene = FALSE, do_x, do_f; -+ tensor vir, pres; -+ int number_steps, neval = 0, nstcg = inputrec->nstcgsteep; -+ gmx_mdoutf_t outf; -+ int i, m, gf, step, nminstep; -+ real terminate = 0; -+ 
-+ step = 0; -+ -+ s_min = init_em_state(); -+ s_a = init_em_state(); -+ s_b = init_em_state(); -+ s_c = init_em_state(); -+ -+ /* Init em and store the local state in s_min */ -+ init_em(fplog, CG, cr, inputrec, -+ state_global, top_global, s_min, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle); -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, CG); -+ -+ /* Max number of steps */ -+ number_steps = inputrec->nsteps; -+ -+ if (MASTER(cr)) -+ { -+ sp_header(stderr, CG, inputrec->em_tol, number_steps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, CG, inputrec->em_tol, number_steps); -+ } -+ -+ /* Call the force routine and some auxiliary (neighboursearching etc.) */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole in congrad.c -+ */ -+ evaluate_energy(fplog, cr, -+ top_global, s_min, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ where(); -+ -+ if (MASTER(cr)) -+ { -+ /* Copy stuff to the energy bin for easy printing etc. */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ where(); -+ -+ /* Estimate/guess the initial stepsize */ -+ stepsize = inputrec->em_stepsize/s_min->fnorm; -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, " F-max = %12.5e on atom %d\n", -+ s_min->fmax, s_min->a_fmax+1); -+ fprintf(stderr, " F-Norm = %12.5e\n", -+ s_min->fnorm/sqrt(state_global->natoms)); -+ fprintf(stderr, "\n"); -+ /* and copy to the log file too... */ -+ fprintf(fplog, " F-max = %12.5e on atom %d\n", -+ s_min->fmax, s_min->a_fmax+1); -+ fprintf(fplog, " F-Norm = %12.5e\n", -+ s_min->fnorm/sqrt(state_global->natoms)); -+ fprintf(fplog, "\n"); -+ } -+ /* Start the loop over CG steps. -+ * Each successful step is counted, and we continue until -+ * we either converge or reach the max number of steps. -+ */ -+ converged = FALSE; -+ for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) -+ { -+ -+ /* start taking steps in a new direction -+ * First time we enter the routine, beta=0, and the direction is -+ * simply the negative gradient. -+ */ -+ -+ /* Calculate the new direction in p, and the gradient in this direction, gpa */ -+ p = s_min->s.cg_p; -+ sf = s_min->f; -+ gpa = 0; -+ gf = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!inputrec->opts.nFreeze[gf][m]) -+ { -+ p[i][m] = sf[i][m] + beta*p[i][m]; -+ gpa -= p[i][m]*sf[i][m]; -+ /* f is negative gradient, thus the sign */ -+ } -+ else -+ { -+ p[i][m] = 0; -+ } -+ } -+ } -+ -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpa, cr); -+ } -+ -+ /* Calculate the norm of the search vector */ -+ get_f_norm_max(cr, &(inputrec->opts), mdatoms, p, &pnorm, NULL, NULL); -+ -+ /* Just in case stepsize reaches zero due to numerical precision... 
*/ -+ if (stepsize <= 0) -+ { -+ stepsize = inputrec->em_stepsize/pnorm; -+ } -+ -+ /* -+ * Double check the value of the derivative in the search direction. -+ * If it is positive it must be due to the old information in the -+ * CG formula, so just remove that and start over with beta=0. -+ * This corresponds to a steepest descent step. -+ */ -+ if (gpa > 0) -+ { -+ beta = 0; -+ step--; /* Don't count this step since we are restarting */ -+ continue; /* Go back to the beginning of the big for-loop */ -+ } -+ -+ /* Calculate minimum allowed stepsize, before the average (norm) -+ * relative change in coordinate is smaller than precision -+ */ -+ minstep = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ tmp = fabs(s_min->s.x[i][m]); -+ if (tmp < 1.0) -+ { -+ tmp = 1.0; -+ } -+ tmp = p[i][m]/tmp; -+ minstep += tmp*tmp; -+ } -+ } -+ /* Add up from all CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &minstep, cr); -+ } -+ -+ minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms)); -+ -+ if (stepsize < minstep) -+ { -+ converged = TRUE; -+ break; -+ } -+ -+ /* Write coordinates if necessary */ -+ do_x = do_per_step(step, inputrec->nstxout); -+ do_f = do_per_step(step, inputrec->nstfout); -+ -+ write_em_traj(fplog, cr, outf, do_x, do_f, NULL, -+ top_global, inputrec, step, -+ s_min, state_global, f_global); -+ -+ /* Take a step downhill. -+ * In theory, we should minimize the function along this direction. -+ * That is quite possible, but it turns out to take 5-10 function evaluations -+ * for each line. However, we dont really need to find the exact minimum - -+ * it is much better to start a new CG step in a modified direction as soon -+ * as we are close to it. This will save a lot of energy evaluations. -+ * -+ * In practice, we just try to take a single step. -+ * If it worked (i.e. lowered the energy), we increase the stepsize but -+ * the continue straight to the next CG step without trying to find any minimum. -+ * If it didn't work (higher energy), there must be a minimum somewhere between -+ * the old position and the new one. -+ * -+ * Due to the finite numerical accuracy, it turns out that it is a good idea -+ * to even accept a SMALL increase in energy, if the derivative is still downhill. -+ * This leads to lower final energies in the tests I've done. 
/ Erik -+ */ -+ s_a->epot = s_min->epot; -+ a = 0.0; -+ c = a + stepsize; /* reference position along line is zero */ -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) -+ { -+ em_dd_partition_system(fplog, step, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Take a trial step (new coords in s_c) */ -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, s_min->s.cg_p, s_c, -+ constr, top, nrnb, wcycle, -1); -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ evaluate_energy(fplog, cr, -+ top_global, s_c, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, FALSE); -+ -+ /* Calc derivative along line */ -+ p = s_c->s.cg_p; -+ sf = s_c->f; -+ gpc = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ gpc -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ -+ } -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpc, cr); -+ } -+ -+ /* This is the max amount of increase in energy we tolerate */ -+ tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot); -+ -+ /* Accept the step if the energy is lower, or if it is not significantly higher -+ * and the line derivative is still negative. -+ */ -+ if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) -+ { -+ foundlower = TRUE; -+ /* Great, we found a better energy. Increase step for next iteration -+ * if we are still going down, decrease it otherwise -+ */ -+ if (gpc < 0) -+ { -+ stepsize *= 1.618034; /* The golden section */ -+ } -+ else -+ { -+ stepsize *= 0.618034; /* 1/golden section */ -+ } -+ } -+ else -+ { -+ /* New energy is the same or higher. We will have to do some work -+ * to find a smaller value in the interval. Take smaller step next time! -+ */ -+ foundlower = FALSE; -+ stepsize *= 0.618034; -+ } -+ -+ -+ -+ -+ /* OK, if we didn't find a lower value we will have to locate one now - there must -+ * be one in the interval [a=0,c]. -+ * The same thing is valid here, though: Don't spend dozens of iterations to find -+ * the line minimum. We try to interpolate based on the derivative at the endpoints, -+ * and only continue until we find a lower value. In most cases this means 1-2 iterations. -+ * -+ * I also have a safeguard for potentially really patological functions so we never -+ * take more than 20 steps before we give up ... -+ * -+ * If we already found a lower value we just skip this step and continue to the update. -+ */ -+ if (!foundlower) -+ { -+ nminstep = 0; -+ -+ do -+ { -+ /* Select a new trial point. -+ * If the derivatives at points a & c have different sign we interpolate to zero, -+ * otherwise just do a bisection. 
-+ */ -+ if (gpa < 0 && gpc > 0) -+ { -+ b = a + gpa*(a-c)/(gpc-gpa); -+ } -+ else -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* safeguard if interpolation close to machine accuracy causes errors: -+ * never go outside the interval -+ */ -+ if (b <= a || b >= c) -+ { -+ b = 0.5*(a+c); -+ } -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) -+ { -+ /* Reload the old state */ -+ em_dd_partition_system(fplog, -1, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Take a trial step to this new point - new coords in s_b */ -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, s_min->s.cg_p, s_b, -+ constr, top, nrnb, wcycle, -1); -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ evaluate_energy(fplog, cr, -+ top_global, s_b, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, FALSE); -+ -+ /* p does not change within a step, but since the domain decomposition -+ * might change, we have to use cg_p of s_b here. -+ */ -+ p = s_b->s.cg_p; -+ sf = s_b->f; -+ gpb = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ gpb -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ -+ } -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpb, cr); -+ } -+ -+ if (debug) -+ { -+ fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", -+ s_a->epot, s_b->epot, s_c->epot, gpb); -+ } -+ -+ epot_repl = s_b->epot; -+ -+ /* Keep one of the intervals based on the value of the derivative at the new point */ -+ if (gpb > 0) -+ { -+ /* Replace c endpoint with b */ -+ swap_em_state(s_b, s_c); -+ c = b; -+ gpc = gpb; -+ } -+ else -+ { -+ /* Replace a endpoint with b */ -+ swap_em_state(s_b, s_a); -+ a = b; -+ gpa = gpb; -+ } -+ -+ /* -+ * Stop search as soon as we find a value smaller than the endpoints. -+ * Never run more than 20 steps, no matter what. -+ */ -+ nminstep++; -+ } -+ while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && -+ (nminstep < 20)); -+ -+ if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || -+ nminstep >= 20) -+ { -+ /* OK. We couldn't find a significantly lower energy. -+ * If beta==0 this was steepest descent, and then we give up. -+ * If not, set beta=0 and restart with steepest descent before quitting. -+ */ -+ if (beta == 0.0) -+ { -+ /* Converged */ -+ converged = TRUE; -+ break; -+ } -+ else -+ { -+ /* Reset memory before giving up */ -+ beta = 0.0; -+ continue; -+ } -+ } -+ -+ /* Select min energy state of A & C, put the best in B. -+ */ -+ if (s_c->epot < s_a->epot) -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", -+ s_c->epot, s_a->epot); -+ } -+ swap_em_state(s_b, s_c); -+ gpb = gpc; -+ b = c; -+ } -+ else -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", -+ s_a->epot, s_c->epot); -+ } -+ swap_em_state(s_b, s_a); -+ gpb = gpa; -+ b = a; -+ } -+ -+ } -+ else -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", -+ s_c->epot); -+ } -+ swap_em_state(s_b, s_c); -+ gpb = gpc; -+ b = c; -+ } -+ -+ /* new search direction */ -+ /* beta = 0 means forget all memory and restart with steepest descents. */ -+ if (nstcg && ((step % nstcg) == 0)) -+ { -+ beta = 0.0; -+ } -+ else -+ { -+ /* s_min->fnorm cannot be zero, because then we would have converged -+ * and broken out. -+ */ -+ -+ /* Polak-Ribiere update. 
-+ * Change to fnorm2/fnorm2_old for Fletcher-Reeves -+ */ -+ beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); -+ } -+ /* Limit beta to prevent oscillations */ -+ if (fabs(beta) > 5.0) -+ { -+ beta = 0.0; -+ } -+ -+ -+ /* update positions */ -+ swap_em_state(s_min, s_b); -+ gpa = gpb; -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", -+ step, s_min->epot, s_min->fnorm/sqrt(state_global->natoms), -+ s_min->fmax, s_min->a_fmax+1); -+ } -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ do_log = do_per_step(step, inputrec->nstlog); -+ do_ene = do_per_step(step, inputrec->nstenergy); -+ -+ /* Prepare IMD energy record, if bIMD is TRUE. */ -+ IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE); -+ -+ if (do_log) -+ { -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ } -+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, -+ do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Send energies and positions to the IMD client if bIMD is TRUE. */ -+ if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr)) -+ { -+ IMD_send_positions(inputrec->imd); -+ } -+ -+ /* Stop when the maximum force lies below tolerance. -+ * If we have reached machine precision, converged is already set to true. -+ */ -+ converged = converged || (s_min->fmax < inputrec->em_tol); -+ -+ } /* End of the loop */ -+ -+ /* IMD cleanup, if bIMD is TRUE. */ -+ IMD_finalize(inputrec->bIMD, inputrec->imd); -+ -+ if (converged) -+ { -+ step--; /* we never took that last step in this case */ -+ -+ } -+ if (s_min->fmax > inputrec->em_tol) -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); -+ warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); -+ } -+ converged = FALSE; -+ } -+ -+ if (MASTER(cr)) -+ { -+ /* If we printed energy and/or logfile last step (which was the last step) -+ * we don't have to do it again, but otherwise print the final values. -+ */ -+ if (!do_log) -+ { -+ /* Write final value to log since we didn't do anything the last step */ -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ } -+ if (!do_ene || !do_log) -+ { -+ /* Write final energy file entries */ -+ print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, -+ !do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ } -+ -+ /* Print some stuff... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ -+ /* IMPORTANT! -+ * For accurate normal mode calculation it is imperative that we -+ * store the last conformation into the full precision binary trajectory. -+ * -+ * However, we should only do it if we did NOT already write this step -+ * above (which we did if do_x or do_f was true). 
-+ */ -+ do_x = !do_per_step(step, inputrec->nstxout); -+ do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); -+ -+ write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, step, -+ s_min, state_global, f_global); -+ -+ fnormn = s_min->fnorm/sqrt(state_global->natoms); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ -+ fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ walltime_accounting_set_nsteps_done(walltime_accounting, step); -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_lbfgs(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ static const char *LBFGS = "Low-Memory BFGS Minimizer"; -+ em_state_t ems; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ rvec *f_global; -+ int ncorr, nmaxcorr, point, cp, neval, nminstep; -+ double stepsize, gpa, gpb, gpc, tmp, minstep; -+ real *rho, *alpha, *ff, *xx, *p, *s, *lastx, *lastf, **dx, **dg; -+ real *xa, *xb, *xc, *fa, *fb, *fc, *xtmp, *ftmp; -+ real a, b, c, maxdelta, delta; -+ real diag, Epot0, Epot, EpotA, EpotB, EpotC; -+ real dgdx, dgdg, sq, yr, beta; -+ t_mdebin *mdebin; -+ gmx_bool converged, first; -+ rvec mu_tot; -+ real fnorm, fmax; -+ gmx_bool do_log, do_ene, do_x, do_f, foundlower, *frozen; -+ tensor vir, pres; -+ int start, end, number_steps; -+ gmx_mdoutf_t outf; -+ int i, k, m, n, nfmax, gf, step; -+ int mdof_flags; -+ /* not used */ -+ real terminate; -+ -+ if (PAR(cr)) -+ { -+ gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n"); -+ } -+ -+ if (NULL != constr) -+ { -+ gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent)."); -+ } -+ -+ n = 3*state->natoms; -+ nmaxcorr = inputrec->nbfgscorr; -+ -+ /* Allocate memory */ -+ /* Use pointers to real so we dont have to loop over both atoms and -+ * dimensions all the time... -+ * x/f are allocated as rvec *, so make new x0/f0 pointers-to-real -+ * that point to the same memory. 
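do_lbfgs works on flat arrays of reals rather than on rvec triplets: because rvec is a plain real[3] and arrays of rvec are contiguous, a cast such as xx = (real *)state->x lets the same memory be addressed either per atom and dimension or as one vector of length 3*natoms. A self-contained illustration of that aliasing (types are redeclared here only for the sketch; a single-precision build is assumed):

    #include <stdio.h>

    typedef float real;          /* double in a GMX_DOUBLE build; float assumed here */
    typedef real  rvec[3];

    int main(void)
    {
        rvec  x[2] = { {1, 2, 3}, {4, 5, 6} };
        real *xx   = (real *)x;                  /* flat view of the same memory */
        int   i;

        for (i = 0; i < 2*3; i++)
        {
            /* xx[i] and x[i/3][i%3] refer to the same object */
            printf("xx[%d] = %g = x[%d][%d]\n", i, xx[i], i/3, i%3);
        }
        return 0;
    }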
-+ */ -+ snew(xa, n); -+ snew(xb, n); -+ snew(xc, n); -+ snew(fa, n); -+ snew(fb, n); -+ snew(fc, n); -+ snew(frozen, n); -+ -+ snew(p, n); -+ snew(lastx, n); -+ snew(lastf, n); -+ snew(rho, nmaxcorr); -+ snew(alpha, nmaxcorr); -+ -+ snew(dx, nmaxcorr); -+ for (i = 0; i < nmaxcorr; i++) -+ { -+ snew(dx[i], n); -+ } -+ -+ snew(dg, nmaxcorr); -+ for (i = 0; i < nmaxcorr; i++) -+ { -+ snew(dg[i], n); -+ } -+ -+ step = 0; -+ neval = 0; -+ -+ /* Init em */ -+ init_em(fplog, LBFGS, cr, inputrec, -+ state, top_global, &ems, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle); -+ /* Do_lbfgs is not completely updated like do_steep and do_cg, -+ * so we free some memory again. -+ */ -+ sfree(ems.s.x); -+ sfree(ems.f); -+ -+ xx = (real *)state->x; -+ ff = (real *)f; -+ -+ start = 0; -+ end = mdatoms->homenr; -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS); -+ -+ do_log = do_ene = do_x = do_f = TRUE; -+ -+ /* Max number of steps */ -+ number_steps = inputrec->nsteps; -+ -+ /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ -+ gf = 0; -+ for (i = start; i < end; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ frozen[3*i+m] = inputrec->opts.nFreeze[gf][m]; -+ } -+ } -+ if (MASTER(cr)) -+ { -+ sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); -+ } -+ -+ if (vsite) -+ { -+ construct_vsites(vsite, state->x, 1, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ } -+ -+ /* Call the force routine and some auxiliary (neighboursearching etc.) */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole -+ */ -+ neval++; -+ ems.s.x = state->x; -+ ems.f = f; -+ evaluate_energy(fplog, cr, -+ top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ where(); -+ -+ if (MASTER(cr)) -+ { -+ /* Copy stuff to the energy bin for easy printing etc. */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ where(); -+ -+ /* This is the starting energy */ -+ Epot = enerd->term[F_EPOT]; -+ -+ fnorm = ems.fnorm; -+ fmax = ems.fmax; -+ nfmax = ems.a_fmax; -+ -+ /* Set the initial step. -+ * since it will be multiplied by the non-normalized search direction -+ * vector (force vector the first time), we scale it by the -+ * norm of the force. -+ */ -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); -+ fprintf(stderr, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); -+ fprintf(stderr, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); -+ fprintf(stderr, "\n"); -+ /* and copy to the log file too... 
*/ -+ fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr); -+ fprintf(fplog, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); -+ fprintf(fplog, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); -+ fprintf(fplog, "\n"); -+ } -+ -+ point = 0; -+ for (i = 0; i < n; i++) -+ { -+ if (!frozen[i]) -+ { -+ dx[point][i] = ff[i]; /* Initial search direction */ -+ } -+ else -+ { -+ dx[point][i] = 0; -+ } -+ } -+ -+ stepsize = 1.0/fnorm; -+ converged = FALSE; -+ -+ /* Start the loop over BFGS steps. -+ * Each successful step is counted, and we continue until -+ * we either converge or reach the max number of steps. -+ */ -+ -+ ncorr = 0; -+ -+ /* Set the gradient from the force */ -+ converged = FALSE; -+ for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) -+ { -+ -+ /* Write coordinates if necessary */ -+ do_x = do_per_step(step, inputrec->nstxout); -+ do_f = do_per_step(step, inputrec->nstfout); -+ -+ mdof_flags = 0; -+ if (do_x) -+ { -+ mdof_flags |= MDOF_X; -+ } -+ -+ if (do_f) -+ { -+ mdof_flags |= MDOF_F; -+ } -+ -+ if (inputrec->bIMD) -+ { -+ mdof_flags |= MDOF_IMD; -+ } -+ -+ mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, -+ top_global, step, (real)step, state, state, f, f); -+ -+ /* Do the linesearching in the direction dx[point][0..(n-1)] */ -+ -+ /* pointer to current direction - point=0 first time here */ -+ s = dx[point]; -+ -+ /* calculate line gradient */ -+ for (gpa = 0, i = 0; i < n; i++) -+ { -+ gpa -= s[i]*ff[i]; -+ } -+ -+ /* Calculate minimum allowed stepsize, before the average (norm) -+ * relative change in coordinate is smaller than precision -+ */ -+ for (minstep = 0, i = 0; i < n; i++) -+ { -+ tmp = fabs(xx[i]); -+ if (tmp < 1.0) -+ { -+ tmp = 1.0; -+ } -+ tmp = s[i]/tmp; -+ minstep += tmp*tmp; -+ } -+ minstep = GMX_REAL_EPS/sqrt(minstep/n); -+ -+ if (stepsize < minstep) -+ { -+ converged = TRUE; -+ break; -+ } -+ -+ /* Store old forces and coordinates */ -+ for (i = 0; i < n; i++) -+ { -+ lastx[i] = xx[i]; -+ lastf[i] = ff[i]; -+ } -+ Epot0 = Epot; -+ -+ first = TRUE; -+ -+ for (i = 0; i < n; i++) -+ { -+ xa[i] = xx[i]; -+ } -+ -+ /* Take a step downhill. -+ * In theory, we should minimize the function along this direction. -+ * That is quite possible, but it turns out to take 5-10 function evaluations -+ * for each line. However, we dont really need to find the exact minimum - -+ * it is much better to start a new BFGS step in a modified direction as soon -+ * as we are close to it. This will save a lot of energy evaluations. -+ * -+ * In practice, we just try to take a single step. -+ * If it worked (i.e. lowered the energy), we increase the stepsize but -+ * the continue straight to the next BFGS step without trying to find any minimum. -+ * If it didn't work (higher energy), there must be a minimum somewhere between -+ * the old position and the new one. -+ * -+ * Due to the finite numerical accuracy, it turns out that it is a good idea -+ * to even accept a SMALL increase in energy, if the derivative is still downhill. -+ * This leads to lower final energies in the tests I've done. / Erik -+ */ -+ foundlower = FALSE; -+ EpotA = Epot0; -+ a = 0.0; -+ c = a + stepsize; /* reference position along line is zero */ -+ -+ /* Check stepsize first. We do not allow displacements -+ * larger than emstep. 
-+ */ -+ do -+ { -+ c = a + stepsize; -+ maxdelta = 0; -+ for (i = 0; i < n; i++) -+ { -+ delta = c*s[i]; -+ if (delta > maxdelta) -+ { -+ maxdelta = delta; -+ } -+ } -+ if (maxdelta > inputrec->em_stepsize) -+ { -+ stepsize *= 0.1; -+ } -+ } -+ while (maxdelta > inputrec->em_stepsize); -+ -+ /* Take a trial step */ -+ for (i = 0; i < n; i++) -+ { -+ xc[i] = lastx[i] + c*s[i]; -+ } -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ ems.s.x = (rvec *)xc; -+ ems.f = (rvec *)fc; -+ evaluate_energy(fplog, cr, -+ top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, step, FALSE); -+ EpotC = ems.epot; -+ -+ /* Calc derivative along line */ -+ for (gpc = 0, i = 0; i < n; i++) -+ { -+ gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */ -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpc, cr); -+ } -+ -+ /* This is the max amount of increase in energy we tolerate */ -+ tmp = sqrt(GMX_REAL_EPS)*fabs(EpotA); -+ -+ /* Accept the step if the energy is lower, or if it is not significantly higher -+ * and the line derivative is still negative. -+ */ -+ if (EpotC < EpotA || (gpc < 0 && EpotC < (EpotA+tmp))) -+ { -+ foundlower = TRUE; -+ /* Great, we found a better energy. Increase step for next iteration -+ * if we are still going down, decrease it otherwise -+ */ -+ if (gpc < 0) -+ { -+ stepsize *= 1.618034; /* The golden section */ -+ } -+ else -+ { -+ stepsize *= 0.618034; /* 1/golden section */ -+ } -+ } -+ else -+ { -+ /* New energy is the same or higher. We will have to do some work -+ * to find a smaller value in the interval. Take smaller step next time! -+ */ -+ foundlower = FALSE; -+ stepsize *= 0.618034; -+ } -+ -+ /* OK, if we didn't find a lower value we will have to locate one now - there must -+ * be one in the interval [a=0,c]. -+ * The same thing is valid here, though: Don't spend dozens of iterations to find -+ * the line minimum. We try to interpolate based on the derivative at the endpoints, -+ * and only continue until we find a lower value. In most cases this means 1-2 iterations. -+ * -+ * I also have a safeguard for potentially really patological functions so we never -+ * take more than 20 steps before we give up ... -+ * -+ * If we already found a lower value we just skip this step and continue to the update. -+ */ -+ -+ if (!foundlower) -+ { -+ -+ nminstep = 0; -+ do -+ { -+ /* Select a new trial point. -+ * If the derivatives at points a & c have different sign we interpolate to zero, -+ * otherwise just do a bisection. 
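The bracketing loop that follows picks its trial point from the directional derivatives at the two ends of the interval: when they have opposite signs, the zero of a linear model of the derivative is interpolated, otherwise the interval is bisected, and a guard keeps the point strictly inside (a, c). The same rule appears in do_cg above. Condensed into a stand-alone helper (the name is illustrative):

    /* Pick a trial point b in (a, c) from the directional derivatives gpa, gpc
     * at the interval ends; forces are negative gradients upstream, so a
     * negative value means "still downhill".
     */
    static double trial_point(double a, double c, double gpa, double gpc)
    {
        double b;

        if (gpa < 0 && gpc > 0)
        {
            b = a + gpa*(a - c)/(gpc - gpa);   /* secant step toward the derivative's zero */
        }
        else
        {
            b = 0.5*(a + c);                   /* no sign change: plain bisection */
        }
        if (b <= a || b >= c)                  /* round-off guard: stay inside the bracket */
        {
            b = 0.5*(a + c);
        }
        return b;
    }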
-+ */ -+ -+ if (gpa < 0 && gpc > 0) -+ { -+ b = a + gpa*(a-c)/(gpc-gpa); -+ } -+ else -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* safeguard if interpolation close to machine accuracy causes errors: -+ * never go outside the interval -+ */ -+ if (b <= a || b >= c) -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* Take a trial step */ -+ for (i = 0; i < n; i++) -+ { -+ xb[i] = lastx[i] + b*s[i]; -+ } -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ ems.s.x = (rvec *)xb; -+ ems.f = (rvec *)fb; -+ evaluate_energy(fplog, cr, -+ top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, step, FALSE); -+ EpotB = ems.epot; -+ -+ fnorm = ems.fnorm; -+ -+ for (gpb = 0, i = 0; i < n; i++) -+ { -+ gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ -+ -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpb, cr); -+ } -+ -+ /* Keep one of the intervals based on the value of the derivative at the new point */ -+ if (gpb > 0) -+ { -+ /* Replace c endpoint with b */ -+ EpotC = EpotB; -+ c = b; -+ gpc = gpb; -+ /* swap coord pointers b/c */ -+ xtmp = xb; -+ ftmp = fb; -+ xb = xc; -+ fb = fc; -+ xc = xtmp; -+ fc = ftmp; -+ } -+ else -+ { -+ /* Replace a endpoint with b */ -+ EpotA = EpotB; -+ a = b; -+ gpa = gpb; -+ /* swap coord pointers a/b */ -+ xtmp = xb; -+ ftmp = fb; -+ xb = xa; -+ fb = fa; -+ xa = xtmp; -+ fa = ftmp; -+ } -+ -+ /* -+ * Stop search as soon as we find a value smaller than the endpoints, -+ * or if the tolerance is below machine precision. -+ * Never run more than 20 steps, no matter what. -+ */ -+ nminstep++; -+ } -+ while ((EpotB > EpotA || EpotB > EpotC) && (nminstep < 20)); -+ -+ if (fabs(EpotB-Epot0) < GMX_REAL_EPS || nminstep >= 20) -+ { -+ /* OK. We couldn't find a significantly lower energy. -+ * If ncorr==0 this was steepest descent, and then we give up. -+ * If not, reset memory to restart as steepest descent before quitting. -+ */ -+ if (ncorr == 0) -+ { -+ /* Converged */ -+ converged = TRUE; -+ break; -+ } -+ else -+ { -+ /* Reset memory */ -+ ncorr = 0; -+ /* Search in gradient direction */ -+ for (i = 0; i < n; i++) -+ { -+ dx[point][i] = ff[i]; -+ } -+ /* Reset stepsize */ -+ stepsize = 1.0/fnorm; -+ continue; -+ } -+ } -+ -+ /* Select min energy state of A & C, put the best in xx/ff/Epot -+ */ -+ if (EpotC < EpotA) -+ { -+ Epot = EpotC; -+ /* Use state C */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xc[i]; -+ ff[i] = fc[i]; -+ } -+ stepsize = c; -+ } -+ else -+ { -+ Epot = EpotA; -+ /* Use state A */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xa[i]; -+ ff[i] = fa[i]; -+ } -+ stepsize = a; -+ } -+ -+ } -+ else -+ { -+ /* found lower */ -+ Epot = EpotC; -+ /* Use state C */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xc[i]; -+ ff[i] = fc[i]; -+ } -+ stepsize = c; -+ } -+ -+ /* Update the memory information, and calculate a new -+ * approximation of the inverse hessian -+ */ -+ -+ /* Have new data in Epot, xx, ff */ -+ if (ncorr < nmaxcorr) -+ { -+ ncorr++; -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ dg[point][i] = lastf[i]-ff[i]; -+ dx[point][i] *= stepsize; -+ } -+ -+ dgdg = 0; -+ dgdx = 0; -+ for (i = 0; i < n; i++) -+ { -+ dgdg += dg[point][i]*dg[point][i]; -+ dgdx += dg[point][i]*dx[point][i]; -+ } -+ -+ diag = dgdx/dgdg; -+ -+ rho[point] = 1.0/dgdx; -+ point++; -+ -+ if (point >= nmaxcorr) -+ { -+ point = 0; -+ } -+ -+ /* Update */ -+ for (i = 0; i < n; i++) -+ { -+ p[i] = ff[i]; -+ } -+ -+ cp = point; -+ -+ /* Recursive update. 
First go back over the memory points */ -+ for (k = 0; k < ncorr; k++) -+ { -+ cp--; -+ if (cp < 0) -+ { -+ cp = ncorr-1; -+ } -+ -+ sq = 0; -+ for (i = 0; i < n; i++) -+ { -+ sq += dx[cp][i]*p[i]; -+ } -+ -+ alpha[cp] = rho[cp]*sq; -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] -= alpha[cp]*dg[cp][i]; -+ } -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] *= diag; -+ } -+ -+ /* And then go forward again */ -+ for (k = 0; k < ncorr; k++) -+ { -+ yr = 0; -+ for (i = 0; i < n; i++) -+ { -+ yr += p[i]*dg[cp][i]; -+ } -+ -+ beta = rho[cp]*yr; -+ beta = alpha[cp]-beta; -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] += beta*dx[cp][i]; -+ } -+ -+ cp++; -+ if (cp >= ncorr) -+ { -+ cp = 0; -+ } -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ if (!frozen[i]) -+ { -+ dx[point][i] = p[i]; -+ } -+ else -+ { -+ dx[point][i] = 0; -+ } -+ } -+ -+ stepsize = 1.0; -+ -+ /* Test whether the convergence criterion is met */ -+ get_f_norm_max(cr, &(inputrec->opts), mdatoms, f, &fnorm, &fmax, &nfmax); -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", -+ step, Epot, fnorm/sqrt(state->natoms), fmax, nfmax+1); -+ } -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ do_log = do_per_step(step, inputrec->nstlog); -+ do_ene = do_per_step(step, inputrec->nstenergy); -+ if (do_log) -+ { -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ } -+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, -+ do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Send x and E to IMD client, if bIMD is TRUE. */ -+ if (do_IMD(inputrec->bIMD, step, cr, TRUE, state->box, state->x, inputrec, 0, wcycle) && MASTER(cr)) -+ { -+ IMD_send_positions(inputrec->imd); -+ } -+ -+ /* Stop when the maximum force lies below tolerance. -+ * If we have reached machine precision, converged is already set to true. -+ */ -+ -+ converged = converged || (fmax < inputrec->em_tol); -+ -+ } /* End of the loop */ -+ -+ /* IMD cleanup, if bIMD is TRUE. */ -+ IMD_finalize(inputrec->bIMD, inputrec->imd); -+ -+ if (converged) -+ { -+ step--; /* we never took that last step in this case */ -+ -+ } -+ if (fmax > inputrec->em_tol) -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); -+ warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); -+ } -+ converged = FALSE; -+ } -+ -+ /* If we printed energy and/or logfile last step (which was the last step) -+ * we don't have to do it again, but otherwise print the final values. -+ */ -+ if (!do_log) /* Write final value to log since we didn't do anythin last step */ -+ { -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ } -+ if (!do_ene || !do_log) /* Write final energy file entries */ -+ { -+ print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, -+ !do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Print some stuff... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ -+ /* IMPORTANT! -+ * For accurate normal mode calculation it is imperative that we -+ * store the last conformation into the full precision binary trajectory. 
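The backward and forward passes above over the stored (dx, dg) pairs are the standard L-BFGS two-loop recursion; the only differences from the textbook form are that the code starts from the force (the negative gradient), so no final sign flip is needed, and that the pairs live in a circular buffer indexed by cp. A textbook-style sketch in gradient form, with the memory simply ordered oldest to newest (names are illustrative; h0 corresponds to the diag = (s.y)/(y.y) computed above):

    /* Standard L-BFGS two-loop recursion: returns d = -H*g.
     * s[k] are stored position differences, y[k] gradient differences,
     * rho[k] = 1/(y[k].s[k]), h0 the initial diagonal Hessian guess,
     * alpha scratch of length ncorr.
     */
    static void lbfgs_direction(int ncorr, int n, const double *g,
                                double **s, double **y, const double *rho,
                                double *alpha, double h0, double *d)
    {
        int i, k;

        for (i = 0; i < n; i++)
        {
            d[i] = g[i];                    /* working vector q of the textbook recursion */
        }
        for (k = ncorr-1; k >= 0; k--)      /* backward pass, newest pair first */
        {
            double sq = 0;

            for (i = 0; i < n; i++)
            {
                sq += s[k][i]*d[i];
            }
            alpha[k] = rho[k]*sq;
            for (i = 0; i < n; i++)
            {
                d[i] -= alpha[k]*y[k][i];
            }
        }
        for (i = 0; i < n; i++)
        {
            d[i] *= h0;                     /* apply the initial Hessian estimate */
        }
        for (k = 0; k < ncorr; k++)         /* forward pass, oldest pair first */
        {
            double yr = 0;

            for (i = 0; i < n; i++)
            {
                yr += y[k][i]*d[i];
            }
            for (i = 0; i < n; i++)
            {
                d[i] += (alpha[k] - rho[k]*yr)*s[k][i];
            }
        }
        for (i = 0; i < n; i++)
        {
            d[i] = -d[i];                   /* quasi-Newton descent direction */
        }
    }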
-+ * -+ * However, we should only do it if we did NOT already write this step -+ * above (which we did if do_x or do_f was true). -+ */ -+ do_x = !do_per_step(step, inputrec->nstxout); -+ do_f = !do_per_step(step, inputrec->nstfout); -+ write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, step, -+ &ems, state, f); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, LBFGS, inputrec->em_tol, step, converged, -+ number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); -+ print_converged(fplog, LBFGS, inputrec->em_tol, step, converged, -+ number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); -+ -+ fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ walltime_accounting_set_nsteps_done(walltime_accounting, step); -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_steep(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ const char *SD = "Steepest Descents"; -+ em_state_t *s_min, *s_try; -+ rvec *f_global; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ real stepsize, constepsize; -+ real ustep, fnormn; -+ gmx_mdoutf_t outf; -+ t_mdebin *mdebin; -+ gmx_bool bDone, bAbort, do_x, do_f; -+ tensor vir, pres; -+ rvec mu_tot; -+ int nsteps; -+ int count = 0; -+ int steps_accepted = 0; -+ /* not used */ -+ real terminate = 0; -+ -+ s_min = init_em_state(); -+ s_try = init_em_state(); -+ -+ /* Init em and store the local state in s_try */ -+ init_em(fplog, SD, cr, inputrec, -+ state_global, top_global, s_try, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle); -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, SD); -+ -+ /* Set variables for stepsize (in nm). This is the largest -+ * step that we are going to make in any direction. 
-+ */ -+ ustep = inputrec->em_stepsize; -+ stepsize = 0; -+ -+ /* Max number of steps */ -+ nsteps = inputrec->nsteps; -+ -+ if (MASTER(cr)) -+ { -+ /* Print to the screen */ -+ sp_header(stderr, SD, inputrec->em_tol, nsteps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, SD, inputrec->em_tol, nsteps); -+ } -+ -+ /**** HERE STARTS THE LOOP **** -+ * count is the counter for the number of steps -+ * bDone will be TRUE when the minimization has converged -+ * bAbort will be TRUE when nsteps steps have been performed or when -+ * the stepsize becomes smaller than is reasonable for machine precision -+ */ -+ count = 0; -+ bDone = FALSE; -+ bAbort = FALSE; -+ while (!bDone && !bAbort) -+ { -+ bAbort = (nsteps >= 0) && (count == nsteps); -+ -+ /* set new coordinates, except for first step */ -+ if (count > 0) -+ { -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, -+ s_min, stepsize, s_min->f, s_try, -+ constr, top, nrnb, wcycle, count); -+ } -+ -+ evaluate_energy(fplog, cr, -+ top_global, s_try, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, count, count == 0); -+ -+ if (MASTER(cr)) -+ { -+ print_ebin_header(fplog, count, count, s_try->s.lambda[efptFEP]); -+ } -+ -+ if (count == 0) -+ { -+ s_min->epot = s_try->epot + 1; -+ } -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", -+ count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, -+ (s_try->epot < s_min->epot) ? '\n' : '\r'); -+ } -+ -+ if (s_try->epot < s_min->epot) -+ { -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)count, -+ mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals, -+ s_try->s.box, NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ /* Prepare IMD energy record, if bIMD is TRUE. */ -+ IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE); -+ -+ print_ebin(mdoutf_get_fp_ene(outf), TRUE, -+ do_per_step(steps_accepted, inputrec->nstdisreout), -+ do_per_step(steps_accepted, inputrec->nstorireout), -+ fplog, count, count, eprNORMAL, TRUE, -+ mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ fflush(fplog); -+ } -+ } -+ -+ /* Now if the new energy is smaller than the previous... -+ * or if this is the first step! -+ * or if we did random steps! -+ */ -+ -+ if ( (count == 0) || (s_try->epot < s_min->epot) ) -+ { -+ steps_accepted++; -+ -+ /* Test whether the convergence criterion is met... */ -+ bDone = (s_try->fmax < inputrec->em_tol); -+ -+ /* Copy the arrays for force, positions and energy */ -+ /* The 'Min' array always holds the coords and forces of the minimal -+ sampled energy */ -+ swap_em_state(s_min, s_try); -+ if (count > 0) -+ { -+ ustep *= 1.2; -+ } -+ -+ /* Write to trn, if necessary */ -+ do_x = do_per_step(steps_accepted, inputrec->nstxout); -+ do_f = do_per_step(steps_accepted, inputrec->nstfout); -+ write_em_traj(fplog, cr, outf, do_x, do_f, NULL, -+ top_global, inputrec, count, -+ s_min, state_global, f_global); -+ } -+ else -+ { -+ /* If energy is not smaller make the step smaller... 
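The branch below is the step-size policy of the steepest-descent minimizer: the trial displacement is at most ustep along the force, an energy decrease keeps the state and grows ustep by 1.2, an increase discards the trial and halves ustep, and the run stops once the maximum force drops below the tolerance. The same policy on a toy one-dimensional energy, purely as an illustration:

    #include <math.h>
    #include <stdio.h>

    static double energy(double x) { return (x - 1.0)*(x - 1.0); }
    static double force(double x)  { return -2.0*(x - 1.0); }      /* -dE/dx */

    int main(void)
    {
        double x = 5.0, ustep = 0.01, ftol = 1e-6;
        double e_min = energy(x);
        int    count;

        for (count = 0; count < 1000 && fabs(force(x)) > ftol; count++)
        {
            double f     = force(x);
            double x_try = x + ustep*(f/fabs(f));    /* step of size ustep downhill */
            double e_try = energy(x_try);

            if (e_try < e_min)           /* accept: keep state, be bolder next time */
            {
                x      = x_try;
                e_min  = e_try;
                ustep *= 1.2;
            }
            else                         /* reject: stay put, shorten the step */
            {
                ustep *= 0.5;
            }
        }
        printf("x = %g after %d steps\n", x, count);
        return 0;
    }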
*/ -+ ustep *= 0.5; -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) -+ { -+ /* Reload the old state */ -+ em_dd_partition_system(fplog, count, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ } -+ -+ /* Determine new step */ -+ stepsize = ustep/s_min->fmax; -+ -+ /* Check if stepsize is too small, with 1 nm as a characteristic length */ -+#ifdef GMX_DOUBLE -+ if (count == nsteps || ustep < 1e-12) -+#else -+ if (count == nsteps || ustep < 1e-6) -+#endif -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, count == nsteps, constr != NULL); -+ warn_step(fplog, inputrec->em_tol, count == nsteps, constr != NULL); -+ } -+ bAbort = TRUE; -+ } -+ -+ /* Send IMD energies and positions, if bIMD is TRUE. */ -+ if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr)) -+ { -+ IMD_send_positions(inputrec->imd); -+ } -+ -+ count++; -+ } /* End of the loop */ -+ -+ /* IMD cleanup, if bIMD is TRUE. */ -+ IMD_finalize(inputrec->bIMD, inputrec->imd); -+ -+ /* Print some data... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, count, -+ s_min, state_global, f_global); -+ -+ fnormn = s_min->fnorm/sqrt(state_global->natoms); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ inputrec->nsteps = count; -+ -+ walltime_accounting_set_nsteps_done(walltime_accounting, count); -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_nm(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ const char *NM = "Normal Mode Analysis"; -+ gmx_mdoutf_t outf; -+ int natoms, atom, d; -+ int nnodes, node; -+ rvec *f_global; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ real t, t0, lambda, lam0; -+ gmx_bool bNS; -+ tensor vir, pres; -+ rvec mu_tot; -+ rvec *fneg, *dfdx; -+ gmx_bool bSparse; /* use sparse matrix storage format */ -+ size_t sz = 0; -+ gmx_sparsematrix_t * sparse_matrix = NULL; -+ real * full_matrix = NULL; -+ em_state_t * state_work; -+ -+ /* added with respect to mdrun */ -+ int i, j, k, row, col; -+ real der_range = 10.0*sqrt(GMX_REAL_EPS); -+ real x_min; -+ real fnorm, fmax; -+ -+ if (constr != NULL) -+ { -+ gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this 
combination is not supported"); -+ } -+ -+ state_work = init_em_state(); -+ -+ /* Init em and store the local state in state_minimum */ -+ init_em(fplog, NM, cr, inputrec, -+ state_global, top_global, state_work, &top, -+ &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, NULL, imdport, Flags, wcycle); -+ -+ natoms = top_global->natoms; -+ snew(fneg, natoms); -+ snew(dfdx, natoms); -+ -+#ifndef GMX_DOUBLE -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, -+ "NOTE: This version of Gromacs has been compiled in single precision,\n" -+ " which MIGHT not be accurate enough for normal mode analysis.\n" -+ " Gromacs now uses sparse matrix storage, so the memory requirements\n" -+ " are fairly modest even if you recompile in double precision.\n\n"); -+ } -+#endif -+ -+ /* Check if we can/should use sparse storage format. -+ * -+ * Sparse format is only useful when the Hessian itself is sparse, which it -+ * will be when we use a cutoff. -+ * For small systems (n<1000) it is easier to always use full matrix format, though. -+ */ -+ if (EEL_FULL(fr->eeltype) || fr->rlist == 0.0) -+ { -+ md_print_info(cr, fplog, "Non-cutoff electrostatics used, forcing full Hessian format.\n"); -+ bSparse = FALSE; -+ } -+ else if (top_global->natoms < 1000) -+ { -+ md_print_info(cr, fplog, "Small system size (N=%d), using full Hessian format.\n", top_global->natoms); -+ bSparse = FALSE; -+ } -+ else -+ { -+ md_print_info(cr, fplog, "Using compressed symmetric sparse Hessian format.\n"); -+ bSparse = TRUE; -+ } -+ -+ if (MASTER(cr)) -+ { -+ sz = DIM*top_global->natoms; -+ -+ fprintf(stderr, "Allocating Hessian memory...\n\n"); -+ -+ if (bSparse) -+ { -+ sparse_matrix = gmx_sparsematrix_init(sz); -+ sparse_matrix->compressed_symmetric = TRUE; -+ } -+ else -+ { -+ snew(full_matrix, sz*sz); -+ } -+ } -+ -+ /* Initial values */ -+ t0 = inputrec->init_t; -+ lam0 = inputrec->fepvals->init_lambda; -+ t = t0; -+ lambda = lam0; -+ -+ init_nrnb(nrnb); -+ -+ where(); -+ -+ /* Write start time and temperature */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, NM); -+ -+ /* fudge nr of steps to nr of atoms */ -+ inputrec->nsteps = natoms*2; -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n", -+ *(top_global->name), (int)inputrec->nsteps); -+ } -+ -+ nnodes = cr->nnodes; -+ -+ /* Make evaluate_energy do a single node force calculation */ -+ cr->nnodes = 1; -+ evaluate_energy(fplog, cr, -+ top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ cr->nnodes = nnodes; -+ -+ /* if forces are not small, warn user */ -+ get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, state_work); -+ -+ md_print_info(cr, fplog, "Maximum force:%12.5e\n", state_work->fmax); -+ if (state_work->fmax > 1.0e-3) -+ { -+ md_print_info(cr, fplog, -+ "The force is probably not small enough to " -+ "ensure that you are at a minimum.\n" -+ "Be aware that negative eigenvalues may occur\n" -+ "when the resulting matrix is diagonalized.\n\n"); -+ } -+ -+ /*********************************************************** -+ * -+ * Loop over all pairs in matrix -+ * -+ * do_force called twice. 
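The double loop that follows builds the Hessian one row at a time: each Cartesian degree of freedom is displaced by plus and minus der_range, the forces from the two evaluations are differenced, and because the force is the negative gradient the matrix element is -(f_plus - f_minus)/(2h). The per-row arithmetic, extracted into a stand-alone helper (names are placeholders, not GROMACS API):

    /* Fill one Hessian row from forces evaluated at x+h and x-h along a single
     * degree of freedom. ndof = 3*natoms; f is the negative gradient, hence
     * the extra minus sign in the central difference.
     */
    static void hessian_row(int ndof, const double *f_plus, const double *f_minus,
                            double h, double *row)
    {
        int col;

        for (col = 0; col < ndof; col++)
        {
            row[col] = -(f_plus[col] - f_minus[col])/(2.0*h);
        }
    }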
Once with positive and -+ * once with negative displacement -+ * -+ ************************************************************/ -+ -+ /* Steps are divided one by one over the nodes */ -+ for (atom = cr->nodeid; atom < natoms; atom += nnodes) -+ { -+ -+ for (d = 0; d < DIM; d++) -+ { -+ x_min = state_work->s.x[atom][d]; -+ -+ state_work->s.x[atom][d] = x_min - der_range; -+ -+ /* Make evaluate_energy do a single node force calculation */ -+ cr->nnodes = 1; -+ evaluate_energy(fplog, cr, -+ top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, atom*2, FALSE); -+ -+ for (i = 0; i < natoms; i++) -+ { -+ copy_rvec(state_work->f[i], fneg[i]); -+ } -+ -+ state_work->s.x[atom][d] = x_min + der_range; -+ -+ evaluate_energy(fplog, cr, -+ top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, atom*2+1, FALSE); -+ cr->nnodes = nnodes; -+ -+ /* x is restored to original */ -+ state_work->s.x[atom][d] = x_min; -+ -+ for (j = 0; j < natoms; j++) -+ { -+ for (k = 0; (k < DIM); k++) -+ { -+ dfdx[j][k] = -+ -(state_work->f[j][k] - fneg[j][k])/(2*der_range); -+ } -+ } -+ -+ if (!MASTER(cr)) -+ { -+#ifdef GMX_MPI -+#ifdef GMX_DOUBLE -+#define mpi_type MPI_DOUBLE -+#else -+#define mpi_type MPI_FLOAT -+#endif -+ MPI_Send(dfdx[0], natoms*DIM, mpi_type, MASTERNODE(cr), cr->nodeid, -+ cr->mpi_comm_mygroup); -+#endif -+ } -+ else -+ { -+ for (node = 0; (node < nnodes && atom+node < natoms); node++) -+ { -+ if (node > 0) -+ { -+#ifdef GMX_MPI -+ MPI_Status stat; -+ MPI_Recv(dfdx[0], natoms*DIM, mpi_type, node, node, -+ cr->mpi_comm_mygroup, &stat); -+#undef mpi_type -+#endif -+ } -+ -+ row = (atom + node)*DIM + d; -+ -+ for (j = 0; j < natoms; j++) -+ { -+ for (k = 0; k < DIM; k++) -+ { -+ col = j*DIM + k; -+ -+ if (bSparse) -+ { -+ if (col >= row && dfdx[j][k] != 0.0) -+ { -+ gmx_sparsematrix_increment_value(sparse_matrix, -+ row, col, dfdx[j][k]); -+ } -+ } -+ else -+ { -+ full_matrix[row*sz+col] = dfdx[j][k]; -+ } -+ } -+ } -+ } -+ } -+ -+ if (bVerbose && fplog) -+ { -+ fflush(fplog); -+ } -+ } -+ /* write progress */ -+ if (MASTER(cr) && bVerbose) -+ { -+ fprintf(stderr, "\rFinished step %d out of %d", -+ min(atom+nnodes, natoms), natoms); -+ fflush(stderr); -+ } -+ } -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\n\nWriting Hessian...\n"); -+ gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ walltime_accounting_set_nsteps_done(walltime_accounting, natoms*2); -+ -+ return 0; -+} -diff --git a/src/programs/mdrun/md.c b/src/programs/mdrun/md.c -index 3d98d59..b34d23c 100644 ---- a/src/programs/mdrun/md.c -+++ b/src/programs/mdrun/md.c -@@ -96,6 +96,12 @@ - #include "gromacs/swap/swapcoords.h" - #include "gromacs/imd/imd.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+/* END PLUMED */ -+ - #ifdef GMX_FAHCORE - #include "corewrap.h" - #endif -@@ -224,6 +230,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - /* Interactive MD */ - gmx_bool bIMDstep = FALSE; - -+ /* PLUMED */ -+ int plumedNeedsEnergy=0; -+ int plumedWantsToStop=0; -+ matrix plumed_vir; -+ /* END PLUMED */ -+ - #ifdef GMX_FAHCORE - /* Temporary addition for FAHCORE checkpointing */ - int chkpt_ret; -@@ -651,6 +663,48 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - 
fprintf(fplog, "\n"); - } - -+ /* PLUMED */ -+ if(plumedswitch){ -+ /* detect plumed API version */ -+ int pversion=0; -+ plumed_cmd(plumedmain,"getApiVersion",&pversion); -+ /* setting kbT is only implemented with api>1) */ -+ real kbT=ir->opts.ref_t[0]*BOLTZ; -+ if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT); -+ -+ if(cr->ms && cr->ms->nsim>1) { -+ if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all); -+ }else{ -+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); -+ } -+ } -+ plumed_cmd(plumedmain,"GREX init",NULL); -+ } -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); -+ } -+ } -+ plumed_cmd(plumedmain,"setNatoms",&top_global->natoms); -+ plumed_cmd(plumedmain,"setMDEngine","gromacs"); -+ plumed_cmd(plumedmain,"setLog",fplog); -+ real real_delta_t; -+ real_delta_t=ir->delta_t; -+ plumed_cmd(plumedmain,"setTimestep",&real_delta_t); -+ plumed_cmd(plumedmain,"init",NULL); -+ -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ } -+ } -+ /* END PLUMED */ -+ - walltime_accounting_start(walltime_accounting); - wallcycle_start(wcycle, ewcRUN); - print_start(fplog, cr, walltime_accounting, "mdrun"); -@@ -955,6 +1009,13 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - do_verbose && !bPMETuneRunning); - wallcycle_stop(wcycle, ewcDOMDEC); - /* If using an iterative integrator, reallocate space to match the decomposition */ -+ -+ /* PLUMED */ -+ if(plumedswitch){ -+ plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ /* END PLUMED */ - } - } - -@@ -1078,12 +1139,45 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - * This is parallellized as well, and does communication too. - * Check comments in sim_util.c - */ -+ -+ /* PLUMED */ -+ plumedNeedsEnergy=0; -+ if(plumedswitch){ -+ long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep); -+ plumed_cmd(plumedmain,"setPositions",&state->x[0][0]); -+ plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]); -+ plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]); -+ plumed_cmd(plumedmain,"setBox",&state->box[0][0]); -+ plumed_cmd(plumedmain,"prepareCalc",NULL); -+ plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop); -+ plumed_cmd(plumedmain,"setForces",&f[0][0]); -+ plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ clear_mat(plumed_vir); -+ plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]); -+ } -+ /* END PLUMED */ - do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, - state->box, state->x, &state->hist, - f, force_vir, mdatoms, enerd, fcd, - state->lambda, graph, - fr, vsite, mu_tot, t, mdoutf_get_fp_field(outf), ed, bBornRadii, - (bNS ? 
GMX_FORCE_NS : 0) | force_flags); -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(plumedNeedsEnergy){ -+ msmul(force_vir,2.0,plumed_vir); -+ plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]); -+ plumed_cmd(plumedmain,"performCalc",NULL); -+ msmul(plumed_vir,0.5,force_vir); -+ } else { -+ msmul(plumed_vir,0.5,plumed_vir); -+ m_add(force_vir,plumed_vir,force_vir); -+ } -+ if ((repl_ex_nst > 0) && (step > 0) && !bLastStep && -+ do_per_step(step,repl_ex_nst)) plumed_cmd(plumedmain,"GREX savePositions",NULL); -+ if(plumedWantsToStop) ir->nsteps=step_rel+1; -+ } -+ /* END PLUMED */ - } - - if (bVV && !bStartingFromCpt && !bRerunMD) -diff --git a/src/programs/mdrun/md.c.preplumed b/src/programs/mdrun/md.c.preplumed -new file mode 100644 -index 0000000..3d98d59 ---- /dev/null -+++ b/src/programs/mdrun/md.c.preplumed -@@ -0,0 +1,2058 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. -+ * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. 
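All of the PLUMED coupling added to do_md above goes through plumed_cmd() key/value calls from the PLUMED C interface: a one-time setup (setNatoms, setMDEngine, setTimestep, init) followed, each step, by handing over positions, masses, charges, box, forces and virial around the force calculation so the plugin can add its bias forces in place. A stripped-down serial sketch of that call sequence; the data, file names and step loop are placeholders, and the MPI, replica-exchange and domain-decomposition commands used in the patch are left out:

    #include <stdio.h>
    #include "Plumed.h"                     /* PLUMED C interface, as used by the patch */

    #define NATOMS 3

    int main(void)
    {
        double pos[3*NATOMS]    = {0.0, 0.0, 0.0,  0.1, 0.0, 0.0,  0.0, 0.1, 0.0};
        double forces[3*NATOMS] = {0.0};
        double masses[NATOMS]   = {12.0, 1.0, 1.0};
        double charges[NATOMS]  = {-0.4, 0.2, 0.2};
        double box[9]           = {0.0};    /* all zeros: no periodic box in this sketch */
        double virial[9]        = {0.0};
        double dt               = 0.002;    /* ps, illustrative */
        int    natoms           = NATOMS;
        int    step;

        plumed p = plumed_create();
        plumed_cmd(p, "setNatoms",    &natoms);
        plumed_cmd(p, "setMDEngine",  "sketch");        /* the patch passes "gromacs" */
        plumed_cmd(p, "setTimestep",  &dt);
        plumed_cmd(p, "setPlumedDat", "plumed.dat");    /* assumed input file name */
        plumed_cmd(p, "setLogFile",   "plumed.log");
        plumed_cmd(p, "init",         NULL);

        for (step = 0; step < 10; step++)
        {
            /* hand PLUMED the current state; it adds its bias to forces/virial */
            plumed_cmd(p, "setStep",      &step);
            plumed_cmd(p, "setPositions", pos);
            plumed_cmd(p, "setMasses",    masses);
            plumed_cmd(p, "setCharges",   charges);
            plumed_cmd(p, "setBox",       box);
            plumed_cmd(p, "setForces",    forces);
            plumed_cmd(p, "setVirial",    virial);
            plumed_cmd(p, "calc",         NULL);        /* prepareCalc + performCalc in one go */
        }

        plumed_finalize(p);
        return 0;
    }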
-+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include "typedefs.h" -+#include "gromacs/utility/smalloc.h" -+#include "sysstuff.h" -+#include "vec.h" -+#include "vcm.h" -+#include "mdebin.h" -+#include "nrnb.h" -+#include "calcmu.h" -+#include "index.h" -+#include "vsite.h" -+#include "update.h" -+#include "ns.h" -+#include "mdrun.h" -+#include "md_support.h" -+#include "md_logging.h" -+#include "network.h" -+#include "xvgr.h" -+#include "physics.h" -+#include "names.h" -+#include "force.h" -+#include "disre.h" -+#include "orires.h" -+#include "pme.h" -+#include "mdatoms.h" -+#include "repl_ex.h" -+#include "deform.h" -+#include "qmmm.h" -+#include "domdec.h" -+#include "domdec_network.h" -+#include "gromacs/gmxlib/topsort.h" -+#include "coulomb.h" -+#include "constr.h" -+#include "shellfc.h" -+#include "gromacs/gmxpreprocess/compute_io.h" -+#include "checkpoint.h" -+#include "mtop_util.h" -+#include "sighandler.h" -+#include "txtdump.h" -+#include "gromacs/utility/cstringutil.h" -+#include "pme_loadbal.h" -+#include "bondf.h" -+#include "membed.h" -+#include "types/nlistheuristics.h" -+#include "types/iteratedconstraints.h" -+#include "nbnxn_cuda_data_mgmt.h" -+ -+#include "gromacs/utility/gmxmpi.h" -+#include "gromacs/fileio/confio.h" -+#include "gromacs/fileio/trajectory_writing.h" -+#include "gromacs/fileio/trnio.h" -+#include "gromacs/fileio/trxio.h" -+#include "gromacs/fileio/xtcio.h" -+#include "gromacs/timing/wallcycle.h" -+#include "gromacs/timing/walltime_accounting.h" -+#include "gromacs/pulling/pull.h" -+#include "gromacs/swap/swapcoords.h" -+#include "gromacs/imd/imd.h" -+ -+#ifdef GMX_FAHCORE -+#include "corewrap.h" -+#endif -+ -+static void reset_all_counters(FILE *fplog, t_commrec *cr, -+ gmx_int64_t step, -+ gmx_int64_t *step_rel, t_inputrec *ir, -+ gmx_wallcycle_t wcycle, t_nrnb *nrnb, -+ gmx_walltime_accounting_t walltime_accounting, -+ nbnxn_cuda_ptr_t cu_nbv) -+{ -+ char sbuf[STEPSTRSIZE]; -+ -+ /* Reset all the counters related to performance over the run */ -+ md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n", -+ gmx_step_str(step, sbuf)); -+ -+ if (cu_nbv) -+ { -+ nbnxn_cuda_reset_timings(cu_nbv); -+ } -+ -+ wallcycle_stop(wcycle, ewcRUN); -+ wallcycle_reset_all(wcycle); -+ if (DOMAINDECOMP(cr)) -+ { -+ reset_dd_statistics_counters(cr->dd); -+ } -+ init_nrnb(nrnb); -+ ir->init_step += *step_rel; -+ ir->nsteps -= *step_rel; -+ *step_rel = 0; -+ wallcycle_start(wcycle, ewcRUN); -+ walltime_accounting_start(walltime_accounting); -+ print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime()); -+} -+ -+double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], -+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, -+ int nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int stepout, t_inputrec *ir, -+ gmx_mtop_t *top_global, -+ t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t ed, t_forcerec *fr, -+ int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed, -+ real cpt_period, real max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ gmx_mdoutf_t outf = NULL; -+ gmx_int64_t step, step_rel; -+ double elapsed_time; -+ double t, t0, lam0[efptNR]; -+ gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEner; -+ gmx_bool bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE, -+ bFirstStep, bStateFromCP, 
bStateFromTPX, bInitStep, bLastStep, -+ bBornRadii, bStartingFromCpt; -+ gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; -+ gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE, -+ bForceUpdate = FALSE, bCPT; -+ gmx_bool bMasterState; -+ int force_flags, cglo_flags; -+ tensor force_vir, shake_vir, total_vir, tmp_vir, pres; -+ int i, m; -+ t_trxstatus *status; -+ rvec mu_tot; -+ t_vcm *vcm; -+ t_state *bufstate = NULL; -+ matrix *scale_tot, pcoupl_mu, M, ebox; -+ gmx_nlheur_t nlh; -+ t_trxframe rerun_fr; -+ gmx_repl_ex_t repl_ex = NULL; -+ int nchkpt = 1; -+ gmx_localtop_t *top; -+ t_mdebin *mdebin = NULL; -+ t_state *state = NULL; -+ rvec *f_global = NULL; -+ gmx_enerdata_t *enerd; -+ rvec *f = NULL; -+ gmx_global_stat_t gstat; -+ gmx_update_t upd = NULL; -+ t_graph *graph = NULL; -+ globsig_t gs; -+ gmx_groups_t *groups; -+ gmx_ekindata_t *ekind, *ekind_save; -+ gmx_shellfc_t shellfc; -+ int count, nconverged = 0; -+ real timestep = 0; -+ double tcount = 0; -+ gmx_bool bConverged = TRUE, bOK, bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition; -+ gmx_bool bAppend; -+ gmx_bool bResetCountersHalfMaxH = FALSE; -+ gmx_bool bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter; -+ gmx_bool bUpdateDoLR; -+ real dvdl_constr; -+ rvec *cbuf = NULL; -+ matrix lastbox; -+ real veta_save, scalevir, tracevir; -+ real vetanew = 0; -+ int lamnew = 0; -+ /* for FEP */ -+ int nstfep; -+ double cycles; -+ real saved_conserved_quantity = 0; -+ real last_ekin = 0; -+ int iter_i; -+ t_extmass MassQ; -+ int **trotter_seq; -+ char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; -+ int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/ -+ gmx_iterate_t iterate; -+ gmx_int64_t multisim_nsteps = -1; /* number of steps to do before first multisim -+ simulation stops. If equal to zero, don't -+ communicate any more between multisims.*/ -+ /* PME load balancing data for GPU kernels */ -+ pme_load_balancing_t pme_loadbal = NULL; -+ double cycles_pmes; -+ gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE; -+ -+ /* Interactive MD */ -+ gmx_bool bIMDstep = FALSE; -+ -+#ifdef GMX_FAHCORE -+ /* Temporary addition for FAHCORE checkpointing */ -+ int chkpt_ret; -+#endif -+ -+ /* Check for special mdrun options */ -+ bRerunMD = (Flags & MD_RERUN); -+ bAppend = (Flags & MD_APPENDFILES); -+ if (Flags & MD_RESETCOUNTERSHALFWAY) -+ { -+ if (ir->nsteps > 0) -+ { -+ /* Signal to reset the counters half the simulation steps. */ -+ wcycle_set_reset_counters(wcycle, ir->nsteps/2); -+ } -+ /* Signal to reset the counters halfway the simulation time. */ -+ bResetCountersHalfMaxH = (max_hours > 0); -+ } -+ -+ /* md-vv uses averaged full step velocities for T-control -+ md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control) -+ md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */ -+ bVV = EI_VV(ir->eI); -+ if (bVV) /* to store the initial velocities while computing virial */ -+ { -+ snew(cbuf, top_global->natoms); -+ } -+ /* all the iteratative cases - only if there are constraints */ -+ bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD)); -+ gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to -+ false in this step. 
The correct value, true or false, -+ is set at each step, as it depends on the frequency of temperature -+ and pressure control.*/ -+ bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir))); -+ -+ if (bRerunMD) -+ { -+ /* Since we don't know if the frames read are related in any way, -+ * rebuild the neighborlist at every step. -+ */ -+ ir->nstlist = 1; -+ ir->nstcalcenergy = 1; -+ nstglobalcomm = 1; -+ } -+ -+ check_ir_old_tpx_versions(cr, fplog, ir, top_global); -+ -+ nstglobalcomm = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir); -+ bGStatEveryStep = (nstglobalcomm == 1); -+ -+ if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL) -+ { -+ fprintf(fplog, -+ "To reduce the energy communication with nstlist = -1\n" -+ "the neighbor list validity should not be checked at every step,\n" -+ "this means that exact integration is not guaranteed.\n" -+ "The neighbor list validity is checked after:\n" -+ " - 2*std.dev.(n.list life time) steps.\n" -+ "In most cases this will result in exact integration.\n" -+ "This reduces the energy communication by a factor of 2 to 3.\n" -+ "If you want less energy communication, set nstlist > 3.\n\n"); -+ } -+ -+ if (bRerunMD) -+ { -+ ir->nstxout_compressed = 0; -+ } -+ groups = &top_global->groups; -+ -+ /* Initial values */ -+ init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda, -+ &(state_global->fep_state), lam0, -+ nrnb, top_global, &upd, -+ nfile, fnm, &outf, &mdebin, -+ force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags, wcycle); -+ -+ clear_mat(total_vir); -+ clear_mat(pres); -+ /* Energy terms and groups */ -+ snew(enerd, 1); -+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, -+ enerd); -+ if (DOMAINDECOMP(cr)) -+ { -+ f = NULL; -+ } -+ else -+ { -+ snew(f, top_global->natoms); -+ } -+ -+ /* Kinetic energy data */ -+ snew(ekind, 1); -+ init_ekindata(fplog, top_global, &(ir->opts), ekind); -+ /* needed for iteration of constraints */ -+ snew(ekind_save, 1); -+ init_ekindata(fplog, top_global, &(ir->opts), ekind_save); -+ /* Copy the cos acceleration to the groups struct */ -+ ekind->cosacc.cos_accel = ir->cos_accel; -+ -+ gstat = global_stat_init(ir); -+ debug_gmx(); -+ -+ /* Check for polarizable models and flexible constraints */ -+ shellfc = init_shell_flexcon(fplog, -+ top_global, n_flexible_constraints(constr), -+ (ir->bContinuation || -+ (DOMAINDECOMP(cr) && !MASTER(cr))) ? 
-+ NULL : state_global->x); -+ if (shellfc && ir->nstcalcenergy != 1) -+ { -+ gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy); -+ } -+ if (shellfc && DOMAINDECOMP(cr)) -+ { -+ gmx_fatal(FARGS, "Shell particles are not implemented with domain decomposition, use a single rank"); -+ } -+ if (shellfc && ir->eI == eiNM) -+ { -+ /* Currently shells don't work with Normal Modes */ -+ gmx_fatal(FARGS, "Normal Mode analysis is not supported with shells.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n"); -+ } -+ -+ if (vsite && ir->eI == eiNM) -+ { -+ /* Currently virtual sites don't work with Normal Modes */ -+ gmx_fatal(FARGS, "Normal Mode analysis is not supported with virtual sites.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n"); -+ } -+ -+ if (DEFORM(*ir)) -+ { -+ tMPI_Thread_mutex_lock(&deform_init_box_mutex); -+ set_deform_reference_box(upd, -+ deform_init_init_step_tpx, -+ deform_init_box_tpx); -+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex); -+ } -+ -+ { -+ double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1); -+ if ((io > 2000) && MASTER(cr)) -+ { -+ fprintf(stderr, -+ "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", -+ io); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ top = dd_init_local_top(top_global); -+ -+ snew(state, 1); -+ dd_init_local_state(cr->dd, state_global, state); -+ -+ if (DDMASTER(cr->dd) && ir->nstfout) -+ { -+ snew(f_global, state_global->natoms); -+ } -+ } -+ else -+ { -+ top = gmx_mtop_generate_local_top(top_global, ir); -+ -+ forcerec_set_excl_load(fr, top); -+ -+ state = serial_init_local_state(state_global); -+ f_global = f; -+ -+ atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms); -+ -+ if (vsite) -+ { -+ set_vsite_top(vsite, top, mdatoms, cr); -+ } -+ -+ if (ir->ePBC != epbcNONE && !fr->bMolPBC) -+ { -+ graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE); -+ } -+ -+ if (shellfc) -+ { -+ make_local_shells(cr, mdatoms, shellfc); -+ } -+ -+ setup_bonded_threading(fr, &top->idef); -+ } -+ -+ /* Set up interactive MD (IMD) */ -+ init_IMD(ir, cr, top_global, fplog, ir->nstcalcenergy, state_global->x, -+ nfile, fnm, oenv, imdport, Flags); -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ /* Distribute the charge groups over the nodes from the master node */ -+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, FALSE); -+ -+ } -+ -+ update_mdatoms(mdatoms, state->lambda[efptMASS]); -+ -+ if (opt2bSet("-cpi", nfile, fnm)) -+ { -+ bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr); -+ } -+ else -+ { -+ bStateFromCP = FALSE; -+ } -+ -+ if (ir->bExpanded) -+ { -+ init_expanded_ensemble(bStateFromCP, ir, &state->dfhist); -+ } -+ -+ if (MASTER(cr)) -+ { -+ if (bStateFromCP) -+ { -+ /* Update mdebin with energy history if appending to output files */ -+ if (Flags & MD_APPENDFILES) -+ { -+ restore_energyhistory_from_state(mdebin, &state_global->enerhist); -+ } -+ else -+ { -+ /* We might have read an energy history from checkpoint, -+ * free the allocated memory and reset the counts. 
-+ */ -+ done_energyhistory(&state_global->enerhist); -+ init_energyhistory(&state_global->enerhist); -+ } -+ } -+ /* Set the initial energy history in state by updating once */ -+ update_energyhistory(&state_global->enerhist, mdebin); -+ } -+ -+ /* Initialize constraints */ -+ if (constr && !DOMAINDECOMP(cr)) -+ { -+ set_constraints(constr, top, ir, mdatoms, cr); -+ } -+ -+ if (repl_ex_nst > 0 && MASTER(cr)) -+ { -+ repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir, -+ repl_ex_nst, repl_ex_nex, repl_ex_seed); -+ } -+ -+ /* PME tuning is only supported with GPUs or PME nodes and not with rerun. -+ * PME tuning is not supported with PME only for LJ and not for Coulomb. -+ */ -+ if ((Flags & MD_TUNEPME) && -+ EEL_PME(fr->eeltype) && -+ ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) && -+ !bRerunMD) -+ { -+ pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata); -+ cycles_pmes = 0; -+ if (cr->duty & DUTY_PME) -+ { -+ /* Start tuning right away, as we can't measure the load */ -+ bPMETuneRunning = TRUE; -+ } -+ else -+ { -+ /* Separate PME nodes, we can measure the PP/PME load balance */ -+ bPMETuneTry = TRUE; -+ } -+ } -+ -+ if (!ir->bContinuation && !bRerunMD) -+ { -+ if (mdatoms->cFREEZE && (state->flags & (1<homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m]) -+ { -+ state->v[i][m] = 0; -+ } -+ } -+ } -+ } -+ -+ if (constr) -+ { -+ /* Constrain the initial coordinates and velocities */ -+ do_constrain_first(fplog, constr, ir, mdatoms, state, -+ cr, nrnb, fr, top); -+ } -+ if (vsite) -+ { -+ /* Construct the virtual sites for the initial configuration */ -+ construct_vsites(vsite, state->x, ir->delta_t, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ } -+ } -+ -+ debug_gmx(); -+ -+ /* set free energy calculation frequency as the minimum -+ greatest common denominator of nstdhdl, nstexpanded, and repl_ex_nst*/ -+ nstfep = ir->fepvals->nstdhdl; -+ if (ir->bExpanded) -+ { -+ nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl, nstfep); -+ } -+ if (repl_ex_nst > 0) -+ { -+ nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep); -+ } -+ -+ /* I'm assuming we need global communication the first time! MRS */ -+ cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT -+ | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0) -+ | (bVV ? CGLO_PRESSURE : 0) -+ | (bVV ? CGLO_CONSTRAINT : 0) -+ | (bRerunMD ? CGLO_RERUNMD : 0) -+ | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0)); -+ -+ bSumEkinhOld = FALSE; -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, cglo_flags); -+ if (ir->eI == eiVVAK) -+ { -+ /* a second call to get the half step temperature initialized as well */ -+ /* we do the same call as above, but turn the pressure off -- internally to -+ compute_globals, this is recognized as a velocity verlet half-step -+ kinetic energy calculation. 
This minimized excess variables, but -+ perhaps loses some logic?*/ -+ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE)); -+ } -+ -+ /* Calculate the initial half step temperature, and save the ekinh_old */ -+ if (!(Flags & MD_STARTFROMCPT)) -+ { -+ for (i = 0; (i < ir->opts.ngtc); i++) -+ { -+ copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old); -+ } -+ } -+ if (ir->eI != eiVV) -+ { -+ enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step, -+ and there is no previous step */ -+ } -+ -+ /* if using an iterative algorithm, we need to create a working directory for the state. */ -+ if (bIterativeCase) -+ { -+ bufstate = init_bufstate(state); -+ } -+ -+ /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter -+ temperature control */ -+ trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); -+ -+ if (MASTER(cr)) -+ { -+ if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS) -+ { -+ fprintf(fplog, -+ "RMS relative constraint deviation after constraining: %.2e\n", -+ constr_rmsd(constr, FALSE)); -+ } -+ if (EI_STATE_VELOCITY(ir->eI)) -+ { -+ fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]); -+ } -+ if (bRerunMD) -+ { -+ fprintf(stderr, "starting md rerun '%s', reading coordinates from" -+ " input trajectory '%s'\n\n", -+ *(top_global->name), opt2fn("-rerun", nfile, fnm)); -+ if (bVerbose) -+ { -+ fprintf(stderr, "Calculated time to finish depends on nsteps from " -+ "run input file,\nwhich may not correspond to the time " -+ "needed to process input trajectory.\n\n"); -+ } -+ } -+ else -+ { -+ char tbuf[20]; -+ fprintf(stderr, "starting mdrun '%s'\n", -+ *(top_global->name)); -+ if (ir->nsteps >= 0) -+ { -+ sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); -+ } -+ else -+ { -+ sprintf(tbuf, "%s", "infinite"); -+ } -+ if (ir->init_step > 0) -+ { -+ fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", -+ gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, -+ gmx_step_str(ir->init_step, sbuf2), -+ ir->init_step*ir->delta_t); -+ } -+ else -+ { -+ fprintf(stderr, "%s steps, %s ps.\n", -+ gmx_step_str(ir->nsteps, sbuf), tbuf); -+ } -+ } -+ fprintf(fplog, "\n"); -+ } -+ -+ walltime_accounting_start(walltime_accounting); -+ wallcycle_start(wcycle, ewcRUN); -+ print_start(fplog, cr, walltime_accounting, "mdrun"); -+ -+ /* safest point to do file checkpointing is here. 
More general point would be immediately before integrator call */ -+#ifdef GMX_FAHCORE -+ chkpt_ret = fcCheckPointParallel( cr->nodeid, -+ NULL, 0); -+ if (chkpt_ret == 0) -+ { -+ gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); -+ } -+#endif -+ -+ debug_gmx(); -+ /*********************************************************** -+ * -+ * Loop over MD steps -+ * -+ ************************************************************/ -+ -+ /* if rerunMD then read coordinates and velocities from input trajectory */ -+ if (bRerunMD) -+ { -+ if (getenv("GMX_FORCE_UPDATE")) -+ { -+ bForceUpdate = TRUE; -+ } -+ -+ rerun_fr.natoms = 0; -+ if (MASTER(cr)) -+ { -+ bNotLastFrame = read_first_frame(oenv, &status, -+ opt2fn("-rerun", nfile, fnm), -+ &rerun_fr, TRX_NEED_X | TRX_READ_V); -+ if (rerun_fr.natoms != top_global->natoms) -+ { -+ gmx_fatal(FARGS, -+ "Number of atoms in trajectory (%d) does not match the " -+ "run input file (%d)\n", -+ rerun_fr.natoms, top_global->natoms); -+ } -+ if (ir->ePBC != epbcNONE) -+ { -+ if (!rerun_fr.bBox) -+ { -+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time); -+ } -+ if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong)) -+ { -+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time); -+ } -+ } -+ } -+ -+ if (PAR(cr)) -+ { -+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); -+ } -+ -+ if (ir->ePBC != epbcNONE) -+ { -+ /* Set the shift vectors. -+ * Necessary here when have a static box different from the tpr box. -+ */ -+ calc_shifts(rerun_fr.box, fr->shift_vec); -+ } -+ } -+ -+ /* loop over MD steps or if rerunMD to end of input trajectory */ -+ bFirstStep = TRUE; -+ /* Skip the first Nose-Hoover integration when we get the state from tpx */ -+ bStateFromTPX = !bStateFromCP; -+ bInitStep = bFirstStep && (bStateFromTPX || bVV); -+ bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep; -+ bLastStep = FALSE; -+ bSumEkinhOld = FALSE; -+ bDoReplEx = FALSE; -+ bExchanged = FALSE; -+ bNeedRepartition = FALSE; -+ -+ init_global_signals(&gs, cr, ir, repl_ex_nst); -+ -+ step = ir->init_step; -+ step_rel = 0; -+ -+ if (ir->nstlist == -1) -+ { -+ init_nlistheuristics(&nlh, bGStatEveryStep, step); -+ } -+ -+ if (MULTISIM(cr) && (repl_ex_nst <= 0 )) -+ { -+ /* check how many steps are left in other sims */ -+ multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps); -+ } -+ -+ -+ /* and stop now if we should */ -+ bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) || -+ ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps ))); -+ while (!bLastStep || (bRerunMD && bNotLastFrame)) -+ { -+ -+ wallcycle_start(wcycle, ewcSTEP); -+ -+ if (bRerunMD) -+ { -+ if (rerun_fr.bStep) -+ { -+ step = rerun_fr.step; -+ step_rel = step - ir->init_step; -+ } -+ if (rerun_fr.bTime) -+ { -+ t = rerun_fr.time; -+ } -+ else -+ { -+ t = step; -+ } -+ } -+ else -+ { -+ bLastStep = (step_rel == ir->nsteps); -+ t = t0 + step*ir->delta_t; -+ } -+ -+ if (ir->efep != efepNO || ir->bSimTemp) -+ { -+ /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value, -+ requiring different logic. 
*/ -+ -+ set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0); -+ bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl); -+ bDoFEP = (do_per_step(step, nstfep) && (ir->efep != efepNO)); -+ bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) -+ && (ir->bExpanded) && (step > 0) && (!bStartingFromCpt)); -+ } -+ -+ bDoReplEx = ((repl_ex_nst > 0) && (step > 0) && !bLastStep && -+ do_per_step(step, repl_ex_nst)); -+ -+ if (bSimAnn) -+ { -+ update_annealing_target_temp(&(ir->opts), t); -+ } -+ -+ if (bRerunMD) -+ { -+ if (!DOMAINDECOMP(cr) || MASTER(cr)) -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(rerun_fr.x[i], state_global->x[i]); -+ } -+ if (rerun_fr.bV) -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(rerun_fr.v[i], state_global->v[i]); -+ } -+ } -+ else -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ clear_rvec(state_global->v[i]); -+ } -+ if (bRerunWarnNoV) -+ { -+ fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n" -+ " Ekin, temperature and pressure are incorrect,\n" -+ " the virial will be incorrect when constraints are present.\n" -+ "\n"); -+ bRerunWarnNoV = FALSE; -+ } -+ } -+ } -+ copy_mat(rerun_fr.box, state_global->box); -+ copy_mat(state_global->box, state->box); -+ -+ if (vsite && (Flags & MD_RERUN_VSITE)) -+ { -+ if (DOMAINDECOMP(cr)) -+ { -+ gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented with domain decomposition, use a single rank"); -+ } -+ if (graph) -+ { -+ /* Following is necessary because the graph may get out of sync -+ * with the coordinates if we only have every N'th coordinate set -+ */ -+ mk_mshift(fplog, graph, fr->ePBC, state->box, state->x); -+ shift_self(graph, state->box, state->x); -+ } -+ construct_vsites(vsite, state->x, ir->delta_t, state->v, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ if (graph) -+ { -+ unshift_self(graph, state->box, state->x); -+ } -+ } -+ } -+ -+ /* Stop Center of Mass motion */ -+ bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm)); -+ -+ if (bRerunMD) -+ { -+ /* for rerun MD always do Neighbour Searching */ -+ bNS = (bFirstStep || ir->nstlist != 0); -+ bNStList = bNS; -+ } -+ else -+ { -+ /* Determine whether or not to do Neighbour Searching and LR */ -+ bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); -+ -+ bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP || -+ (ir->nstlist == -1 && nlh.nabnsb > 0)); -+ -+ if (bNS && ir->nstlist == -1) -+ { -+ set_nlistheuristics(&nlh, bFirstStep || bExchanged || bNeedRepartition || bDoFEP, step); -+ } -+ } -+ -+ /* check whether we should stop because another simulation has -+ stopped. 
*/ -+ if (MULTISIM(cr)) -+ { -+ if ( (multisim_nsteps >= 0) && (step_rel >= multisim_nsteps) && -+ (multisim_nsteps != ir->nsteps) ) -+ { -+ if (bNS) -+ { -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, -+ "Stopping simulation %d because another one has finished\n", -+ cr->ms->sim); -+ } -+ bLastStep = TRUE; -+ gs.sig[eglsCHKPT] = 1; -+ } -+ } -+ } -+ -+ /* < 0 means stop at next step, > 0 means stop at next NS step */ -+ if ( (gs.set[eglsSTOPCOND] < 0) || -+ ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) ) -+ { -+ bLastStep = TRUE; -+ } -+ -+ /* Determine whether or not to update the Born radii if doing GB */ -+ bBornRadii = bFirstStep; -+ if (ir->implicit_solvent && (step % ir->nstgbradii == 0)) -+ { -+ bBornRadii = TRUE; -+ } -+ -+ do_log = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep; -+ do_verbose = bVerbose && -+ (step % stepout == 0 || bFirstStep || bLastStep); -+ -+ if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD)) -+ { -+ if (bRerunMD) -+ { -+ bMasterState = TRUE; -+ } -+ else -+ { -+ bMasterState = FALSE; -+ /* Correct the new box if it is too skewed */ -+ if (DYNAMIC_BOX(*ir)) -+ { -+ if (correct_box(fplog, step, state->box, graph)) -+ { -+ bMasterState = TRUE; -+ } -+ } -+ if (DOMAINDECOMP(cr) && bMasterState) -+ { -+ dd_collect_state(cr->dd, state, state_global); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ /* Repartition the domain decomposition */ -+ wallcycle_start(wcycle, ewcDOMDEC); -+ dd_partition_system(fplog, step, cr, -+ bMasterState, nstglobalcomm, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, -+ do_verbose && !bPMETuneRunning); -+ wallcycle_stop(wcycle, ewcDOMDEC); -+ /* If using an iterative integrator, reallocate space to match the decomposition */ -+ } -+ } -+ -+ if (MASTER(cr) && do_log) -+ { -+ print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */ -+ } -+ -+ if (ir->efep != efepNO) -+ { -+ update_mdatoms(mdatoms, state->lambda[efptMASS]); -+ } -+ -+ if ((bRerunMD && rerun_fr.bV) || bExchanged) -+ { -+ -+ /* We need the kinetic energy at minus the half step for determining -+ * the full step kinetic energy and possibly for T-coupling.*/ -+ /* This may not be quite working correctly yet . . . . */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); -+ } -+ clear_mat(force_vir); -+ -+ /* We write a checkpoint at this MD step when: -+ * either at an NS step when we signalled through gs, -+ * or at the last step (but not when we do not want confout), -+ * but never at the first step or with rerun. -+ */ -+ bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) || -+ (bLastStep && (Flags & MD_CONFOUT))) && -+ step > ir->init_step && !bRerunMD); -+ if (bCPT) -+ { -+ gs.set[eglsCHKPT] = 0; -+ } -+ -+ /* Determine the energy and pressure: -+ * at nstcalcenergy steps and at energy output steps (set below). -+ */ -+ if (EI_VV(ir->eI) && (!bInitStep)) -+ { -+ /* for vv, the first half of the integration actually corresponds -+ to the previous step. bCalcEner is only required to be evaluated on the 'next' step, -+ but the virial needs to be calculated on both the current step and the 'next' step. Future -+ reorganization may be able to get rid of one of the bCalcVir=TRUE steps. 
*/ -+ -+ bCalcEner = do_per_step(step-1, ir->nstcalcenergy); -+ bCalcVir = bCalcEner || -+ (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple))); -+ } -+ else -+ { -+ bCalcEner = do_per_step(step, ir->nstcalcenergy); -+ bCalcVir = bCalcEner || -+ (ir->epc != epcNO && do_per_step(step, ir->nstpcouple)); -+ } -+ -+ /* Do we need global communication ? */ -+ bGStat = (bCalcVir || bCalcEner || bStopCM || -+ do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) || -+ (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck)); -+ -+ do_ene = (do_per_step(step, ir->nstenergy) || bLastStep); -+ -+ if (do_ene || do_log || bDoReplEx) -+ { -+ bCalcVir = TRUE; -+ bCalcEner = TRUE; -+ bGStat = TRUE; -+ } -+ -+ /* these CGLO_ options remain the same throughout the iteration */ -+ cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) | -+ (bGStat ? CGLO_GSTAT : 0) -+ ); -+ -+ force_flags = (GMX_FORCE_STATECHANGED | -+ ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) | -+ GMX_FORCE_ALLFORCES | -+ GMX_FORCE_SEPLRF | -+ (bCalcVir ? GMX_FORCE_VIRIAL : 0) | -+ (bCalcEner ? GMX_FORCE_ENERGY : 0) | -+ (bDoFEP ? GMX_FORCE_DHDL : 0) -+ ); -+ -+ if (fr->bTwinRange) -+ { -+ if (do_per_step(step, ir->nstcalclr)) -+ { -+ force_flags |= GMX_FORCE_DO_LR; -+ } -+ } -+ -+ if (shellfc) -+ { -+ /* Now is the time to relax the shells */ -+ count = relax_shell_flexcon(fplog, cr, bVerbose, step, -+ ir, bNS, force_flags, -+ top, -+ constr, enerd, fcd, -+ state, f, force_vir, mdatoms, -+ nrnb, wcycle, graph, groups, -+ shellfc, fr, bBornRadii, t, mu_tot, -+ &bConverged, vsite, -+ mdoutf_get_fp_field(outf)); -+ tcount += count; -+ -+ if (bConverged) -+ { -+ nconverged++; -+ } -+ } -+ else -+ { -+ /* The coordinates (x) are shifted (to get whole molecules) -+ * in do_force. -+ * This is parallellized as well, and does communication too. -+ * Check comments in sim_util.c -+ */ -+ do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, -+ state->box, state->x, &state->hist, -+ f, force_vir, mdatoms, enerd, fcd, -+ state->lambda, graph, -+ fr, vsite, mu_tot, t, mdoutf_get_fp_field(outf), ed, bBornRadii, -+ (bNS ? GMX_FORCE_NS : 0) | force_flags); -+ } -+ -+ if (bVV && !bStartingFromCpt && !bRerunMD) -+ /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ -+ { -+ wallcycle_start(wcycle, ewcUPDATE); -+ if (ir->eI == eiVV && bInitStep) -+ { -+ /* if using velocity verlet with full time step Ekin, -+ * take the first half step only to compute the -+ * virial for the first step. From there, -+ * revert back to the initial coordinates -+ * so that the input is actually the initial step. -+ */ -+ copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */ -+ } -+ else -+ { -+ /* this is for NHC in the Ekin(t+dt/2) version of vv */ -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); -+ } -+ -+ /* If we are using twin-range interactions where the long-range component -+ * is only evaluated every nstcalclr>1 steps, we should do a special update -+ * step to combine the long-range forces on these steps. -+ * For nstcalclr=1 this is not done, since the forces would have been added -+ * directly to the short-range forces already. -+ * -+ * TODO Remove various aspects of VV+twin-range in master -+ * branch, because VV integrators did not ever support -+ * twin-range multiple time stepping with constraints. 
-+ */ -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, -+ f, bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, bInitStep, etrtVELOCITY1, -+ cr, nrnb, constr, &top->idef); -+ -+ if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep) -+ { -+ gmx_iterate_init(&iterate, TRUE); -+ } -+ /* for iterations, we save these vectors, as we will be self-consistently iterating -+ the calculations */ -+ -+ /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */ -+ -+ /* save the state */ -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); -+ } -+ -+ bFirstIterate = TRUE; -+ while (bFirstIterate || iterate.bIterationActive) -+ { -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); -+ if (bFirstIterate && bTrotter) -+ { -+ /* The first time through, we need a decent first estimate -+ of veta(t+dt) to compute the constraints. Do -+ this by computing the box volume part of the -+ trotter integration at this time. Nothing else -+ should be changed by this routine here. If -+ !(first time), we start with the previous value -+ of veta. */ -+ -+ veta_save = state->veta; -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0); -+ vetanew = state->veta; -+ state->veta = veta_save; -+ } -+ } -+ -+ bOK = TRUE; -+ if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. */ -+ { -+ wallcycle_stop(wcycle, ewcUPDATE); -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, shake_vir, -+ cr, nrnb, wcycle, upd, constr, -+ TRUE, bCalcVir, vetanew); -+ wallcycle_start(wcycle, ewcUPDATE); -+ -+ if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1) -+ { -+ /* Correct the virial for multiple time stepping */ -+ m_sub(shake_vir, fr->vir_twin_constr, shake_vir); -+ } -+ -+ if (!bOK) -+ { -+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); -+ } -+ -+ } -+ else if (graph) -+ { -+ /* Need to unshift here if a do_force has been -+ called in the previous step */ -+ unshift_self(graph, state->box, state->x); -+ } -+ -+ /* if VV, compute the pressure and constraints */ -+ /* For VV2, we strictly only need this if using pressure -+ * control, but we really would like to have accurate pressures -+ * printed out. -+ * Think about ways around this in the future? -+ * For now, keep this choice in comments. -+ */ -+ /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */ -+ /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/ -+ bPres = TRUE; -+ bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK)); -+ if (bCalcEner && ir->eI == eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/ -+ { -+ bSumEkinhOld = TRUE; -+ } -+ /* for vv, the first half of the integration actually corresponds to the previous step. -+ So we need information from the last step in the first half of the integration */ -+ if (bGStat || do_per_step(step-1, nstglobalcomm)) -+ { -+ wallcycle_stop(wcycle, ewcUPDATE); -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ cglo_flags -+ | CGLO_ENERGY -+ | (bTemp ? CGLO_TEMPERATURE : 0) -+ | (bPres ? 
CGLO_PRESSURE : 0) -+ | (bPres ? CGLO_CONSTRAINT : 0) -+ | ((iterate.bIterationActive) ? CGLO_ITERATE : 0) -+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0) -+ | CGLO_SCALEEKIN -+ ); -+ /* explanation of above: -+ a) We compute Ekin at the full time step -+ if 1) we are using the AveVel Ekin, and it's not the -+ initial step, or 2) if we are using AveEkin, but need the full -+ time step kinetic energy for the pressure (always true now, since we want accurate statistics). -+ b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in -+ EkinAveVel because it's needed for the pressure */ -+ wallcycle_start(wcycle, ewcUPDATE); -+ } -+ /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ -+ if (!bInitStep) -+ { -+ if (bTrotter) -+ { -+ m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */ -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2); -+ } -+ else -+ { -+ if (bExchanged) -+ { -+ wallcycle_stop(wcycle, ewcUPDATE); -+ /* We need the kinetic energy at minus the half step for determining -+ * the full step kinetic energy and possibly for T-coupling.*/ -+ /* This may not be quite working correctly yet . . . . */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); -+ wallcycle_start(wcycle, ewcUPDATE); -+ } -+ } -+ } -+ -+ if (iterate.bIterationActive && -+ done_iterating(cr, fplog, step, &iterate, bFirstIterate, -+ state->veta, &vetanew)) -+ { -+ break; -+ } -+ bFirstIterate = FALSE; -+ } -+ -+ if (bTrotter && !bInitStep) -+ { -+ copy_mat(shake_vir, state->svir_prev); -+ copy_mat(force_vir, state->fvir_prev); -+ if (IR_NVT_TROTTER(ir) && ir->eI == eiVV) -+ { -+ /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */ -+ enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE); -+ enerd->term[F_EKIN] = trace(ekind->ekin); -+ } -+ } -+ /* if it's the initial step, we performed this first step just to get the constraint virial */ -+ if (bInitStep && ir->eI == eiVV) -+ { -+ copy_rvecn(cbuf, state->v, 0, state->natoms); -+ } -+ wallcycle_stop(wcycle, ewcUPDATE); -+ } -+ -+ /* MRS -- now done iterating -- compute the conserved quantity */ -+ if (bVV) -+ { -+ saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ); -+ if (ir->eI == eiVV) -+ { -+ last_ekin = enerd->term[F_EKIN]; -+ } -+ if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) -+ { -+ saved_conserved_quantity -= enerd->term[F_DISPCORR]; -+ } -+ /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */ -+ if (!bRerunMD) -+ { -+ sum_dhdl(enerd, state->lambda, ir->fepvals); -+ } -+ } -+ -+ /* ######## END FIRST UPDATE STEP ############## */ -+ /* ######## If doing VV, we now have v(dt) ###### */ -+ if (bDoExpanded) -+ { -+ /* perform extended ensemble sampling in lambda - we don't -+ actually move to the new state before outputting -+ statistics, but if performing simulated tempering, we -+ do update the velocities and the tau_t. 
*/ -+ -+ lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, state->v, mdatoms); -+ /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */ -+ copy_df_history(&state_global->dfhist, &state->dfhist); -+ } -+ -+ /* Now we have the energies and forces corresponding to the -+ * coordinates at time t. We must output all of this before -+ * the update. -+ */ -+ do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, -+ ir, state, state_global, top_global, fr, -+ outf, mdebin, ekind, f, f_global, -+ &nchkpt, -+ bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT), -+ bSumEkinhOld); -+ /* Check if IMD step and do IMD communication, if bIMD is TRUE. */ -+ bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, state->x, ir, t, wcycle); -+ -+ /* kludge -- virial is lost with restart for NPT control. Must restart */ -+ if (bStartingFromCpt && bVV) -+ { -+ copy_mat(state->svir_prev, shake_vir); -+ copy_mat(state->fvir_prev, force_vir); -+ } -+ -+ elapsed_time = walltime_accounting_get_current_elapsed_time(walltime_accounting); -+ -+ /* Check whether everything is still allright */ -+ if (((int)gmx_get_stop_condition() > handled_stop_condition) -+#ifdef GMX_THREAD_MPI -+ && MASTER(cr) -+#endif -+ ) -+ { -+ /* this is just make gs.sig compatible with the hack -+ of sending signals around by MPI_Reduce with together with -+ other floats */ -+ if (gmx_get_stop_condition() == gmx_stop_cond_next_ns) -+ { -+ gs.sig[eglsSTOPCOND] = 1; -+ } -+ if (gmx_get_stop_condition() == gmx_stop_cond_next) -+ { -+ gs.sig[eglsSTOPCOND] = -1; -+ } -+ /* < 0 means stop at next step, > 0 means stop at next NS step */ -+ if (fplog) -+ { -+ fprintf(fplog, -+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n", -+ gmx_get_signal_name(), -+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : ""); -+ fflush(fplog); -+ } -+ fprintf(stderr, -+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n", -+ gmx_get_signal_name(), -+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : ""); -+ fflush(stderr); -+ handled_stop_condition = (int)gmx_get_stop_condition(); -+ } -+ else if (MASTER(cr) && (bNS || ir->nstlist <= 0) && -+ (max_hours > 0 && elapsed_time > max_hours*60.0*60.0*0.99) && -+ gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0) -+ { -+ /* Signal to terminate the run */ -+ gs.sig[eglsSTOPCOND] = 1; -+ if (fplog) -+ { -+ fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); -+ } -+ fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); -+ } -+ -+ if (bResetCountersHalfMaxH && MASTER(cr) && -+ elapsed_time > max_hours*60.0*60.0*0.495) -+ { -+ gs.sig[eglsRESETCOUNTERS] = 1; -+ } -+ -+ if (ir->nstlist == -1 && !bRerunMD) -+ { -+ /* When bGStatEveryStep=FALSE, global_stat is only called -+ * when we check the atom displacements, not at NS steps. -+ * This means that also the bonded interaction count check is not -+ * performed immediately after NS. Therefore a few MD steps could -+ * be performed with missing interactions. -+ * But wrong energies are never written to file, -+ * since energies are only written after global_stat -+ * has been called. -+ */ -+ if (step >= nlh.step_nscheck) -+ { -+ nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs, -+ nlh.scale_tot, state->x); -+ } -+ else -+ { -+ /* This is not necessarily true, -+ * but step_nscheck is determined quite conservatively. 
-+ */ -+ nlh.nabnsb = 0; -+ } -+ } -+ -+ /* In parallel we only have to check for checkpointing in steps -+ * where we do global communication, -+ * otherwise the other nodes don't know. -+ */ -+ if (MASTER(cr) && ((bGStat || !PAR(cr)) && -+ cpt_period >= 0 && -+ (cpt_period == 0 || -+ elapsed_time >= nchkpt*cpt_period*60.0)) && -+ gs.set[eglsCHKPT] == 0) -+ { -+ gs.sig[eglsCHKPT] = 1; -+ } -+ -+ /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */ -+ if (EI_VV(ir->eI)) -+ { -+ if (!bInitStep) -+ { -+ update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); -+ } -+ if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */ -+ { -+ gmx_bool bIfRandomize; -+ bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr); -+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ -+ if (constr && bIfRandomize) -+ { -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, tmp_vir, -+ cr, nrnb, wcycle, upd, constr, -+ TRUE, bCalcVir, vetanew); -+ } -+ } -+ } -+ -+ if (bIterativeCase && do_per_step(step, ir->nstpcouple)) -+ { -+ gmx_iterate_init(&iterate, TRUE); -+ /* for iterations, we save these vectors, as we will be redoing the calculations */ -+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); -+ } -+ -+ bFirstIterate = TRUE; -+ while (bFirstIterate || iterate.bIterationActive) -+ { -+ /* We now restore these vectors to redo the calculation with improved extended variables */ -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); -+ } -+ -+ /* We make the decision to break or not -after- the calculation of Ekin and Pressure, -+ so scroll down for that logic */ -+ -+ /* ######### START SECOND UPDATE STEP ################# */ -+ /* Box is changed in update() when we do pressure coupling, -+ * but we should still use the old box for energy corrections and when -+ * writing it to the energy file, so it matches the trajectory files for -+ * the same timestep above. Make a copy in a separate array. -+ */ -+ copy_mat(state->box, lastbox); -+ -+ bOK = TRUE; -+ dvdl_constr = 0; -+ -+ if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate)) -+ { -+ wallcycle_start(wcycle, ewcUPDATE); -+ /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ -+ if (bTrotter) -+ { -+ if (iterate.bIterationActive) -+ { -+ if (bFirstIterate) -+ { -+ scalevir = 1; -+ } -+ else -+ { -+ /* we use a new value of scalevir to converge the iterations faster */ -+ scalevir = tracevir/trace(shake_vir); -+ } -+ msmul(shake_vir, scalevir, shake_vir); -+ m_add(force_vir, shake_vir, total_vir); -+ clear_mat(shake_vir); -+ } -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3); -+ /* We can only do Berendsen coupling after we have summed -+ * the kinetic energy or virial. Since the happens -+ * in global_state after update, we should only do it at -+ * step % nstlist = 1 with bGStatEveryStep=FALSE. -+ */ -+ } -+ else -+ { -+ update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); -+ update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep); -+ } -+ -+ if (bVV) -+ { -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ /* velocity half-step update */ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? 
&fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, FALSE, etrtVELOCITY2, -+ cr, nrnb, constr, &top->idef); -+ } -+ -+ /* Above, initialize just copies ekinh into ekin, -+ * it doesn't copy position (for VV), -+ * and entire integrator for MD. -+ */ -+ -+ if (ir->eI == eiVVAK) -+ { -+ copy_rvecn(state->x, cbuf, 0, state->natoms); -+ } -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); -+ wallcycle_stop(wcycle, ewcUPDATE); -+ -+ update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state, -+ fr->bMolPBC, graph, f, -+ &top->idef, shake_vir, -+ cr, nrnb, wcycle, upd, constr, -+ FALSE, bCalcVir, state->veta); -+ -+ if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1) -+ { -+ /* Correct the virial for multiple time stepping */ -+ m_sub(shake_vir, fr->vir_twin_constr, shake_vir); -+ } -+ -+ if (ir->eI == eiVVAK) -+ { -+ /* erase F_EKIN and F_TEMP here? */ -+ /* just compute the kinetic energy at the half step to perform a trotter step */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, lastbox, -+ top_global, &bSumEkinhOld, -+ cglo_flags | CGLO_TEMPERATURE -+ ); -+ wallcycle_start(wcycle, ewcUPDATE); -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4); -+ /* now we know the scaling, we can compute the positions again again */ -+ copy_rvecn(cbuf, state->x, 0, state->natoms); -+ -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); -+ wallcycle_stop(wcycle, ewcUPDATE); -+ -+ /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */ -+ /* are the small terms in the shake_vir here due -+ * to numerical errors, or are they important -+ * physically? I'm thinking they are just errors, but not completely sure. -+ * For now, will call without actually constraining, constr=NULL*/ -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, tmp_vir, -+ cr, nrnb, wcycle, upd, NULL, -+ FALSE, bCalcVir, -+ state->veta); -+ } -+ if (!bOK) -+ { -+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); -+ } -+ -+ if (fr->bSepDVDL && fplog && do_log) -+ { -+ gmx_print_sepdvdl(fplog, "Constraint dV/dl", 0.0, dvdl_constr); -+ } -+ if (bVV) -+ { -+ /* this factor or 2 correction is necessary -+ because half of the constraint force is removed -+ in the vv step, so we have to double it. See -+ the Redmine issue #1255. It is not yet clear -+ if the factor of 2 is exact, or just a very -+ good approximation, and this will be -+ investigated. The next step is to see if this -+ can be done adding a dhdl contribution from the -+ rattle step, but this is somewhat more -+ complicated with the current code. Will be -+ investigated, hopefully for 4.6.3. However, -+ this current solution is much better than -+ having it completely wrong. 
-+ */ -+ enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr; -+ } -+ else -+ { -+ enerd->term[F_DVDL_CONSTR] += dvdl_constr; -+ } -+ } -+ else if (graph) -+ { -+ /* Need to unshift here */ -+ unshift_self(graph, state->box, state->x); -+ } -+ -+ if (vsite != NULL) -+ { -+ wallcycle_start(wcycle, ewcVSITECONSTR); -+ if (graph != NULL) -+ { -+ shift_self(graph, state->box, state->x); -+ } -+ construct_vsites(vsite, state->x, ir->delta_t, state->v, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ -+ if (graph != NULL) -+ { -+ unshift_self(graph, state->box, state->x); -+ } -+ wallcycle_stop(wcycle, ewcVSITECONSTR); -+ } -+ -+ /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */ -+ /* With Leap-Frog we can skip compute_globals at -+ * non-communication steps, but we need to calculate -+ * the kinetic energy one step before communication. -+ */ -+ if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm))) -+ { -+ if (ir->nstlist == -1 && bFirstIterate) -+ { -+ gs.sig[eglsNABNSB] = nlh.nabnsb; -+ } -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, -+ bFirstIterate ? &gs : NULL, -+ (step_rel % gs.nstms == 0) && -+ (multisim_nsteps < 0 || (step_rel < multisim_nsteps)), -+ lastbox, -+ top_global, &bSumEkinhOld, -+ cglo_flags -+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0) -+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) -+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) -+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) -+ | (iterate.bIterationActive ? CGLO_ITERATE : 0) -+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0) -+ | CGLO_CONSTRAINT -+ ); -+ if (ir->nstlist == -1 && bFirstIterate) -+ { -+ nlh.nabnsb = gs.set[eglsNABNSB]; -+ gs.set[eglsNABNSB] = 0; -+ } -+ } -+ /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */ -+ /* ############# END CALC EKIN AND PRESSURE ################# */ -+ -+ /* Note: this is OK, but there are some numerical precision issues with using the convergence of -+ the virial that should probably be addressed eventually. state->veta has better properies, -+ but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could -+ generate the new shake_vir, but test the veta value for convergence. This will take some thought. */ -+ -+ if (iterate.bIterationActive && -+ done_iterating(cr, fplog, step, &iterate, bFirstIterate, -+ trace(shake_vir), &tracevir)) -+ { -+ break; -+ } -+ bFirstIterate = FALSE; -+ } -+ -+ if (!bVV || bRerunMD) -+ { -+ /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */ -+ sum_dhdl(enerd, state->lambda, ir->fepvals); -+ } -+ update_box(fplog, step, ir, mdatoms, state, f, -+ ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, upd); -+ -+ /* ################# END UPDATE STEP 2 ################# */ -+ /* #### We now have r(t+dt) and v(t+dt/2) ############# */ -+ -+ /* The coordinates (x) were unshifted in update */ -+ if (!bGStat) -+ { -+ /* We will not sum ekinh_old, -+ * so signal that we still have to do it. 
-+ */ -+ bSumEkinhOld = TRUE; -+ } -+ -+ /* ######### BEGIN PREPARING EDR OUTPUT ########### */ -+ -+ /* use the directly determined last velocity, not actually the averaged half steps */ -+ if (bTrotter && ir->eI == eiVV) -+ { -+ enerd->term[F_EKIN] = last_ekin; -+ } -+ enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN]; -+ -+ if (bVV) -+ { -+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity; -+ } -+ else -+ { -+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ); -+ } -+ /* ######### END PREPARING EDR OUTPUT ########### */ -+ -+ /* Output stuff */ -+ if (MASTER(cr)) -+ { -+ gmx_bool do_dr, do_or; -+ -+ if (fplog && do_log && bDoExpanded) -+ { -+ /* only needed if doing expanded ensemble */ -+ PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL, -+ &state_global->dfhist, state->fep_state, ir->nstlog, step); -+ } -+ if (!(bStartingFromCpt && (EI_VV(ir->eI)))) -+ { -+ if (bCalcEner) -+ { -+ upd_mdebin(mdebin, bDoDHDL, TRUE, -+ t, mdatoms->tmass, enerd, state, -+ ir->fepvals, ir->expandedvals, lastbox, -+ shake_vir, force_vir, total_vir, pres, -+ ekind, mu_tot, constr); -+ } -+ else -+ { -+ upd_mdebin_step(mdebin); -+ } -+ -+ do_dr = do_per_step(step, ir->nstdisreout); -+ do_or = do_per_step(step, ir->nstorireout); -+ -+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : NULL, -+ step, t, -+ eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts)); -+ } -+ if (ir->ePull != epullNO) -+ { -+ pull_print_output(ir->pull, step, t); -+ } -+ -+ if (do_per_step(step, ir->nstlog)) -+ { -+ if (fflush(fplog) != 0) -+ { -+ gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?"); -+ } -+ } -+ } -+ if (bDoExpanded) -+ { -+ /* Have to do this part _after_ outputting the logfile and the edr file */ -+ /* Gets written into the state at the beginning of next loop*/ -+ state->fep_state = lamnew; -+ } -+ /* Print the remaining wall clock time for the run */ -+ if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning) -+ { -+ if (shellfc) -+ { -+ fprintf(stderr, "\n"); -+ } -+ print_time(stderr, walltime_accounting, step, ir, cr); -+ } -+ -+ /* Ion/water position swapping. -+ * Not done in last step since trajectory writing happens before this call -+ * in the MD loop and exchanges would be lost anyway. */ -+ bNeedRepartition = FALSE; -+ if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && -+ do_per_step(step, ir->swap->nstswap)) -+ { -+ bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle, -+ bRerunMD ? rerun_fr.x : state->x, -+ bRerunMD ? 
rerun_fr.box : state->box, -+ top_global, MASTER(cr) && bVerbose, bRerunMD); -+ -+ if (bNeedRepartition && DOMAINDECOMP(cr)) -+ { -+ dd_collect_state(cr->dd, state, state_global); -+ } -+ } -+ -+ /* Replica exchange */ -+ bExchanged = FALSE; -+ if (bDoReplEx) -+ { -+ bExchanged = replica_exchange(fplog, cr, repl_ex, -+ state_global, enerd, -+ state, step, t); -+ } -+ -+ if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) ) -+ { -+ dd_partition_system(fplog, step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, FALSE); -+ } -+ -+ bFirstStep = FALSE; -+ bInitStep = FALSE; -+ bStartingFromCpt = FALSE; -+ -+ /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */ -+ /* With all integrators, except VV, we need to retain the pressure -+ * at the current step for coupling at the next step. -+ */ -+ if ((state->flags & (1<nstpcouple > 0 && step % ir->nstpcouple == 0))) -+ { -+ /* Store the pressure in t_state for pressure coupling -+ * at the next MD step. -+ */ -+ copy_mat(pres, state->pres_prev); -+ } -+ -+ /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */ -+ -+ if ( (membed != NULL) && (!bLastStep) ) -+ { -+ rescale_membed(step_rel, membed, state_global->x); -+ } -+ -+ if (bRerunMD) -+ { -+ if (MASTER(cr)) -+ { -+ /* read next frame from input trajectory */ -+ bNotLastFrame = read_next_frame(oenv, status, &rerun_fr); -+ } -+ -+ if (PAR(cr)) -+ { -+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); -+ } -+ } -+ -+ if (!bRerunMD || !rerun_fr.bStep) -+ { -+ /* increase the MD step number */ -+ step++; -+ step_rel++; -+ } -+ -+ cycles = wallcycle_stop(wcycle, ewcSTEP); -+ if (DOMAINDECOMP(cr) && wcycle) -+ { -+ dd_cycles_add(cr->dd, cycles, ddCyclStep); -+ } -+ -+ if (bPMETuneRunning || bPMETuneTry) -+ { -+ /* PME grid + cut-off optimization with GPUs or PME nodes */ -+ -+ /* Count the total cycles over the last steps */ -+ cycles_pmes += cycles; -+ -+ /* We can only switch cut-off at NS steps */ -+ if (step % ir->nstlist == 0) -+ { -+ /* PME grid + cut-off optimization with GPUs or PME nodes */ -+ if (bPMETuneTry) -+ { -+ if (DDMASTER(cr->dd)) -+ { -+ /* PME node load is too high, start tuning */ -+ bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05); -+ } -+ dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning); -+ -+ if (bPMETuneRunning && -+ fr->nbv->bUseGPU && DOMAINDECOMP(cr) && -+ !(cr->duty & DUTY_PME)) -+ { -+ /* Lock DLB=auto to off (does nothing when DLB=yes/no). -+ * With GPUs + separate PME ranks, we don't want DLB. -+ * This could happen when we scan coarse grids and -+ * it would then never be turned off again. -+ * This would hurt performance at the final, optimal -+ * grid spacing, where DLB almost never helps. -+ * Also, DLB can limit the cut-off for PME tuning. -+ */ -+ dd_dlb_set_lock(cr->dd, TRUE); -+ } -+ -+ if (bPMETuneRunning || step_rel > ir->nstlist*50) -+ { -+ bPMETuneTry = FALSE; -+ } -+ } -+ if (bPMETuneRunning) -+ { -+ /* init_step might not be a multiple of nstlist, -+ * but the first cycle is always skipped anyhow. -+ */ -+ bPMETuneRunning = -+ pme_load_balance(pme_loadbal, cr, -+ (bVerbose && MASTER(cr)) ? 
stderr : NULL, -+ fplog, -+ ir, state, cycles_pmes, -+ fr->ic, fr->nbv, &fr->pmedata, -+ step); -+ -+ /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ -+ fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q; -+ fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; -+ fr->rlist = fr->ic->rlist; -+ fr->rlistlong = fr->ic->rlistlong; -+ fr->rcoulomb = fr->ic->rcoulomb; -+ fr->rvdw = fr->ic->rvdw; -+ -+ if (ir->eDispCorr != edispcNO) -+ { -+ calc_enervirdiff(NULL, ir->eDispCorr, fr); -+ } -+ -+ if (!bPMETuneRunning && -+ DOMAINDECOMP(cr) && -+ dd_dlb_is_locked(cr->dd)) -+ { -+ /* Unlock the DLB=auto, DLB is allowed to activate -+ * (but we don't expect it to activate in most cases). -+ */ -+ dd_dlb_set_lock(cr->dd, FALSE); -+ } -+ } -+ cycles_pmes = 0; -+ } -+ } -+ -+ if (step_rel == wcycle_get_reset_counters(wcycle) || -+ gs.set[eglsRESETCOUNTERS] != 0) -+ { -+ /* Reset all the counters related to performance over the run */ -+ reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting, -+ fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL); -+ wcycle_set_reset_counters(wcycle, -1); -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell our PME node to reset its counters */ -+ gmx_pme_send_resetcounters(cr, step); -+ } -+ /* Correct max_hours for the elapsed time */ -+ max_hours -= elapsed_time/(60.0*60.0); -+ bResetCountersHalfMaxH = FALSE; -+ gs.set[eglsRESETCOUNTERS] = 0; -+ } -+ -+ /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */ -+ IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle); -+ -+ } -+ /* End of main MD loop */ -+ debug_gmx(); -+ -+ /* Closing TNG files can include compressing data. Therefore it is good to do that -+ * before stopping the time measurements. */ -+ mdoutf_tng_close(outf); -+ -+ /* Stop measuring walltime */ -+ walltime_accounting_end(walltime_accounting); -+ -+ if (bRerunMD && MASTER(cr)) -+ { -+ close_trj(status); -+ } -+ -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell the PME only node to finish */ -+ gmx_pme_send_finish(cr); -+ } -+ -+ if (MASTER(cr)) -+ { -+ if (ir->nstcalcenergy > 0 && !bRerunMD) -+ { -+ print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t, -+ eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts)); -+ } -+ } -+ -+ done_mdoutf(outf); -+ debug_gmx(); -+ -+ if (ir->nstlist == -1 && nlh.nns > 0 && fplog) -+ { -+ fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns))); -+ fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns); -+ } -+ -+ if (pme_loadbal != NULL) -+ { -+ pme_loadbal_done(pme_loadbal, cr, fplog, -+ fr->nbv != NULL && fr->nbv->bUseGPU); -+ } -+ -+ if (shellfc && fplog) -+ { -+ fprintf(fplog, "Fraction of iterations that converged: %.2f %%\n", -+ (nconverged*100.0)/step_rel); -+ fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n", -+ tcount/step_rel); -+ } -+ -+ if (repl_ex_nst > 0 && MASTER(cr)) -+ { -+ print_replica_exchange_statistics(fplog, repl_ex); -+ } -+ -+ /* IMD cleanup, if bIMD is TRUE. 
*/ -+ IMD_finalize(ir->bIMD, ir->imd); -+ -+ walltime_accounting_set_nsteps_done(walltime_accounting, step_rel); -+ -+ return 0; -+} -diff --git a/src/programs/mdrun/mdrun.cpp b/src/programs/mdrun/mdrun.cpp -index 6bac3f0..e9fbf48 100644 ---- a/src/programs/mdrun/mdrun.cpp -+++ b/src/programs/mdrun/mdrun.cpp -@@ -55,6 +55,12 @@ - - #include "gromacs/commandline/pargs.h" - #include "gromacs/fileio/filenm.h" -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+extern void(*plumedcmd)(plumed,const char*,const void*); -+/* END PLUMED */ - - int gmx_mdrun(int argc, char *argv[]) - { -@@ -428,6 +434,7 @@ int gmx_mdrun(int argc, char *argv[]) - { efMTX, "-mtx", "nm", ffOPTWR }, - { efNDX, "-dn", "dipole", ffOPTWR }, - { efRND, "-multidir", NULL, ffOPTRDMULT}, -+ { efDAT, "-plumed", "plumed", ffOPTRD }, /* PLUMED */ - { efDAT, "-membed", "membed", ffOPTRD }, - { efTOP, "-mp", "membed", ffOPTRD }, - { efNDX, "-mn", "membed", ffOPTRD }, -@@ -780,6 +787,32 @@ int gmx_mdrun(int argc, char *argv[]) - ddxyz[YY] = (int)(realddxyz[YY] + 0.5); - ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); - -+ /* PLUMED */ -+ plumedswitch=0; -+ if (opt2bSet("-plumed",NFILE,fnm)) plumedswitch=1; -+ if(plumedswitch){ -+ plumedcmd=plumed_cmd; -+ int plumed_is_there=0; -+ int real_precision=sizeof(real); -+ real energyUnits=1.0; -+ real lengthUnits=1.0; -+ real timeUnits=1.0; -+ -+ if(!plumed_installed()){ -+ gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable."); -+ } -+ plumedmain=plumed_create(); -+ plumed_cmd(plumedmain,"setRealPrecision",&real_precision); -+ // this is not necessary for gromacs units: -+ plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits); -+ plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits); -+ plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits); -+ // -+ plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,NFILE,fnm)); -+ plumedswitch=1; -+ } -+ /* END PLUMED */ -+ - rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact, - nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr, - dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz, -@@ -788,6 +821,12 @@ int gmx_mdrun(int argc, char *argv[]) - nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed, - pforce, cpt_period, max_hours, deviceOptions, imdport, Flags); - -+ /* PLUMED */ -+ if(plumedswitch){ -+ plumed_finalize(plumedmain); -+ } -+ /* END PLUMED */ -+ - /* Log file has to be closed in mdrunner if we are appending to it - (fplog not set here) */ - if (MASTER(cr) && !bAppendFiles) -diff --git a/src/programs/mdrun/mdrun.cpp.preplumed b/src/programs/mdrun/mdrun.cpp.preplumed -new file mode 100644 -index 0000000..6bac3f0 ---- /dev/null -+++ b/src/programs/mdrun/mdrun.cpp.preplumed -@@ -0,0 +1,799 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. -+ * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. 
-+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#include "mdrun_main.h" -+ -+#ifdef HAVE_CONFIG_H -+#include "config.h" -+#endif -+ -+#include -+ -+#include "gromacs/legacyheaders/checkpoint.h" -+#include "gromacs/legacyheaders/copyrite.h" -+#include "gromacs/legacyheaders/gmx_fatal.h" -+#include "gromacs/legacyheaders/macros.h" -+#include "gromacs/legacyheaders/main.h" -+#include "gromacs/legacyheaders/mdrun.h" -+#include "gromacs/legacyheaders/network.h" -+#include "gromacs/legacyheaders/readinp.h" -+#include "gromacs/legacyheaders/typedefs.h" -+#include "gromacs/legacyheaders/types/commrec.h" -+ -+#include "gromacs/commandline/pargs.h" -+#include "gromacs/fileio/filenm.h" -+ -+int gmx_mdrun(int argc, char *argv[]) -+{ -+ const char *desc[] = { -+ "[THISMODULE] is the main computational chemistry engine", -+ "within GROMACS. Obviously, it performs Molecular Dynamics simulations,", -+ "but it can also perform Stochastic Dynamics, Energy Minimization,", -+ "test particle insertion or (re)calculation of energies.", -+ "Normal mode analysis is another option. 
In this case [TT]mdrun[tt]", -+ "builds a Hessian matrix from single conformation.", -+ "For usual Normal Modes-like calculations, make sure that", -+ "the structure provided is properly energy-minimized.", -+ "The generated matrix can be diagonalized by [gmx-nmeig].[PAR]", -+ "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])", -+ "and distributes the topology over ranks if needed.", -+ "[TT]mdrun[tt] produces at least four output files.", -+ "A single log file ([TT]-g[tt]) is written, unless the option", -+ "[TT]-seppot[tt] is used, in which case each rank writes a log file.", -+ "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and", -+ "optionally forces.", -+ "The structure file ([TT]-c[tt]) contains the coordinates and", -+ "velocities of the last step.", -+ "The energy file ([TT]-e[tt]) contains energies, the temperature,", -+ "pressure, etc, a lot of these things are also printed in the log file.", -+ "Optionally coordinates can be written to a compressed trajectory file", -+ "([TT]-x[tt]).[PAR]", -+ "The option [TT]-dhdl[tt] is only used when free energy calculation is", -+ "turned on.[PAR]", -+ "A simulation can be run in parallel using two different parallelization", -+ "schemes: MPI parallelization and/or OpenMP thread parallelization.", -+ "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is", -+ "compiled with a normal MPI library or threads when [TT]mdrun[tt] is", -+ "compiled with the GROMACS built-in thread-MPI library. OpenMP threads", -+ "are supported when [TT]mdrun[tt] is compiled with OpenMP. Full OpenMP support", -+ "is only available with the Verlet cut-off scheme, with the (older)", -+ "group scheme only PME-only ranks can use OpenMP parallelization.", -+ "In all cases [TT]mdrun[tt] will by default try to use all the available", -+ "hardware resources. With a normal MPI library only the options", -+ "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],", -+ "for PME-only ranks, can be used to control the number of threads.", -+ "With thread-MPI there are additional options [TT]-nt[tt], which sets", -+ "the total number of threads, and [TT]-ntmpi[tt], which sets the number", -+ "of thread-MPI threads.", -+ "The number of OpenMP threads used by [TT]mdrun[tt] can also be set with", -+ "the standard environment variable, [TT]OMP_NUM_THREADS[tt].", -+ "The [TT]GMX_PME_NUM_THREADS[tt] environment variable can be used to specify", -+ "the number of threads used by the PME-only ranks.[PAR]", -+ "Note that combined MPI+OpenMP parallelization is in many cases", -+ "slower than either on its own. However, at high parallelization, using the", -+ "combination is often beneficial as it reduces the number of domains and/or", -+ "the number of MPI ranks. (Less and larger domains can improve scaling,", -+ "with separate PME ranks, using fewer MPI ranks reduces communication costs.)", -+ "OpenMP-only parallelization is typically faster than MPI-only parallelization", -+ "on a single CPU(-die). Since we currently don't have proper hardware", -+ "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only", -+ "automatically use OpenMP-only parallelization when you use up to 4", -+ "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16", -+ "threads with Intel Sandy Bridge or newer CPUs. 
Otherwise MPI-only", -+ "parallelization is used (except with GPUs, see below).", -+ "[PAR]", -+ "To quickly test the performance of the new Verlet cut-off scheme", -+ "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use", -+ "the [TT]-testverlet[tt] option. This should not be used for production,", -+ "since it can slightly modify potentials and it will remove charge groups", -+ "making analysis difficult, as the [TT].tpr[tt] file will still contain", -+ "charge groups. For production simulations it is highly recommended", -+ "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.", -+ "[PAR]", -+ "With GPUs (only supported with the Verlet cut-off scheme), the number", -+ "of GPUs should match the number of particle-particle ranks, i.e.", -+ "excluding PME-only ranks. With thread-MPI, unless set on the command line, the number", -+ "of MPI threads will automatically be set to the number of GPUs detected.", -+ "To use a subset of the available GPUs, or to manually provide a mapping of", -+ "GPUs to PP ranks, you can use the [TT]-gpu_id[tt] option. The argument of [TT]-gpu_id[tt] is", -+ "a string of digits (without delimiter) representing device id-s of the GPUs to be used.", -+ "For example, \"[TT]02[tt]\" specifies using GPUs 0 and 2 in the first and second PP ranks per compute node", -+ "respectively. To select different sets of GPU-s", -+ "on different nodes of a compute cluster, use the [TT]GMX_GPU_ID[tt] environment", -+ "variable instead. The format for [TT]GMX_GPU_ID[tt] is identical to ", -+ "[TT]-gpu_id[tt], with the difference that an environment variable can have", -+ "different values on different compute nodes. Multiple MPI ranks on each node", -+ "can share GPUs. This is accomplished by specifying the id(s) of the GPU(s)", -+ "multiple times, e.g. \"[TT]0011[tt]\" for four ranks sharing two GPUs in this node.", -+ "This works within a single simulation, or a multi-simulation, with any form of MPI.", -+ "[PAR]", -+ "With the Verlet cut-off scheme and verlet-buffer-tolerance set,", -+ "the pair-list update interval nstlist can be chosen freely with", -+ "the option [TT]-nstlist[tt]. [TT]mdrun[tt] will then adjust", -+ "the pair-list cut-off to maintain accuracy, and not adjust nstlist.", -+ "Otherwise, by default, [TT]mdrun[tt] will try to increase the", -+ "value of nstlist set in the [TT].mdp[tt] file to improve the", -+ "performance. For CPU-only runs, nstlist might increase to 20, for", -+ "GPU runs up to 40. For medium to high parallelization or with", -+ "fast GPUs, a (user-supplied) larger nstlist value can give much", -+ "better performance.", -+ "[PAR]", -+ "When using PME with separate PME ranks or with a GPU, the two major", -+ "compute tasks, the non-bonded force calculation and the PME calculation", -+ "run on different compute resources. If this load is not balanced,", -+ "some of the resources will be idle part of time. With the Verlet", -+ "cut-off scheme this load is automatically balanced when the PME load", -+ "is too high (but not when it is too low). This is done by scaling", -+ "the Coulomb cut-off and PME grid spacing by the same amount. In the first", -+ "few hundred steps different settings are tried and the fastest is chosen", -+ "for the rest of the simulation. This does not affect the accuracy of", -+ "the results, but it does affect the decomposition of the Coulomb energy", -+ "into particle and mesh contributions. 
The auto-tuning can be turned off", -+ "with the option [TT]-notunepme[tt].", -+ "[PAR]", -+ "[TT]mdrun[tt] pins (sets affinity of) threads to specific cores,", -+ "when all (logical) cores on a compute node are used by [TT]mdrun[tt],", -+ "even when no multi-threading is used,", -+ "as this usually results in significantly better performance.", -+ "If the queuing systems or the OpenMP library pinned threads, we honor", -+ "this and don't pin again, even though the layout may be sub-optimal.", -+ "If you want to have [TT]mdrun[tt] override an already set thread affinity", -+ "or pin threads when using less cores, use [TT]-pin on[tt].", -+ "With SMT (simultaneous multithreading), e.g. Intel Hyper-Threading,", -+ "there are multiple logical cores per physical core.", -+ "The option [TT]-pinstride[tt] sets the stride in logical cores for", -+ "pinning consecutive threads. Without SMT, 1 is usually the best choice.", -+ "With Intel Hyper-Threading 2 is best when using half or less of the", -+ "logical cores, 1 otherwise. The default value of 0 do exactly that:", -+ "it minimizes the threads per logical core, to optimize performance.", -+ "If you want to run multiple [TT]mdrun[tt] jobs on the same physical node," -+ "you should set [TT]-pinstride[tt] to 1 when using all logical cores.", -+ "When running multiple [TT]mdrun[tt] (or other) simulations on the same physical", -+ "node, some simulations need to start pinning from a non-zero core", -+ "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify", -+ "the offset in logical cores for pinning.", -+ "[PAR]", -+ "When [TT]mdrun[tt] is started with more than 1 rank,", -+ "parallelization with domain decomposition is used.", -+ "[PAR]", -+ "With domain decomposition, the spatial decomposition can be set", -+ "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.", -+ "The user only needs to change this when the system is very inhomogeneous.", -+ "Dynamic load balancing is set with the option [TT]-dlb[tt],", -+ "which can give a significant performance improvement,", -+ "especially for inhomogeneous systems. The only disadvantage of", -+ "dynamic load balancing is that runs are no longer binary reproducible,", -+ "but in most cases this is not important.", -+ "By default the dynamic load balancing is automatically turned on", -+ "when the measured performance loss due to load imbalance is 5% or more.", -+ "At low parallelization these are the only important options", -+ "for domain decomposition.", -+ "At high parallelization the options in the next two sections", -+ "could be important for increasing the performace.", -+ "[PAR]", -+ "When PME is used with domain decomposition, separate ranks can", -+ "be assigned to do only the PME mesh calculation;", -+ "this is computationally more efficient starting at about 12 ranks,", -+ "or even fewer when OpenMP parallelization is used.", -+ "The number of PME ranks is set with option [TT]-npme[tt],", -+ "but this cannot be more than half of the ranks.", -+ "By default [TT]mdrun[tt] makes a guess for the number of PME", -+ "ranks when the number of ranks is larger than 16. With GPUs,", -+ "using separate PME ranks is not selected automatically,", -+ "since the optimal setup depends very much on the details", -+ "of the hardware. In all cases, you might gain performance", -+ "by optimizing [TT]-npme[tt]. 
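The [TT]-pinoffset[tt]/[TT]-pinstride[tt] behaviour described above amounts to pinning thread i to logical core offset + i*stride. The following is a simplified, Linux-only sketch of that idea using pthread affinity calls; it is not GROMACS's affinity code and the helper name is invented for the example.

    /* Simplified Linux-only sketch (not GROMACS's affinity code): pin the calling
     * thread to logical core  offset + index*stride , mirroring -pinoffset/-pinstride. */
    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    static int pin_to_core(int thread_index, int offset, int stride)
    {
        cpu_set_t set;
        int       core = offset + thread_index * stride;

        CPU_ZERO(&set);
        CPU_SET(core, &set);
        return pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
    }

    int main(void)
    {
        /* e.g. a second mdrun-like job on the node: -pinoffset 8 -pinstride 1 */
        if (pin_to_core(0, 8, 1) != 0)
        {
            fprintf(stderr, "could not set affinity\n");
            return 1;
        }
        printf("pinned to logical core 8\n");
        return 0;
    }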
Performance statistics on this issue", -+ "are written at the end of the log file.", -+ "For good load balancing at high parallelization, the PME grid x and y", -+ "dimensions should be divisible by the number of PME ranks", -+ "(the simulation will run correctly also when this is not the case).", -+ "[PAR]", -+ "This section lists all options that affect the domain decomposition.", -+ "[PAR]", -+ "Option [TT]-rdd[tt] can be used to set the required maximum distance", -+ "for inter charge-group bonded interactions.", -+ "Communication for two-body bonded interactions below the non-bonded", -+ "cut-off distance always comes for free with the non-bonded communication.", -+ "Atoms beyond the non-bonded cut-off are only communicated when they have", -+ "missing bonded interactions; this means that the extra cost is minor", -+ "and nearly independent of the value of [TT]-rdd[tt].", -+ "With dynamic load balancing option [TT]-rdd[tt] also sets", -+ "the lower limit for the domain decomposition cell sizes.", -+ "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on", -+ "the initial coordinates. The chosen value will be a balance", -+ "between interaction range and communication cost.", -+ "[PAR]", -+ "When inter charge-group bonded interactions are beyond", -+ "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.", -+ "For pair interactions and tabulated bonds", -+ "that do not generate exclusions, this check can be turned off", -+ "with the option [TT]-noddcheck[tt].", -+ "[PAR]", -+ "When constraints are present, option [TT]-rcon[tt] influences", -+ "the cell size limit as well.", -+ "Atoms connected by NC constraints, where NC is the LINCS order plus 1,", -+ "should not be beyond the smallest cell size. An error message is", -+ "generated when this happens and the user should change the decomposition", -+ "or decrease the LINCS order and increase the number of LINCS iterations.", -+ "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS", -+ "in a conservative fashion. For high parallelization it can be useful", -+ "to set the distance required for P-LINCS with the option [TT]-rcon[tt].", -+ "[PAR]", -+ "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling", -+ "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that", -+ "the cells can scale down by at least this factor. This option is used", -+ "for the automated spatial decomposition (when not using [TT]-dd[tt])", -+ "as well as for determining the number of grid pulses, which in turn", -+ "sets the minimum allowed cell size. Under certain circumstances", -+ "the value of [TT]-dds[tt] might need to be adjusted to account for", -+ "high or low spatial inhomogeneity of the system.", -+ "[PAR]", -+ "The option [TT]-gcom[tt] can be used to only do global communication", -+ "every n steps.", -+ "This can improve performance for highly parallel simulations", -+ "where this global communication step becomes the bottleneck.", -+ "For a global thermostat and/or barostat the temperature", -+ "and/or pressure will also only be updated every [TT]-gcom[tt] steps.", -+ "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]", -+ "With [TT]-rerun[tt] an input trajectory can be given for which ", -+ "forces and energies will be (re)calculated.
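The [TT]-gcom[tt] default described just above (fall back to the minimum of nstcalcenergy and nstlist when no interval is given) can be written out as a tiny helper. This is only an illustration of the stated rule, not GROMACS code; the function name is invented for the example.

    /* Minimal sketch of the -gcom default: when no interval is given (<= 0),
     * fall back to min(nstcalcenergy, nstlist); a step then does global
     * communication only when it is a multiple of that interval. */
    #include <stdio.h>

    static int effective_gcom(int nstglobalcomm, int nstcalcenergy, int nstlist)
    {
        if (nstglobalcomm <= 0)
        {
            nstglobalcomm = (nstcalcenergy < nstlist) ? nstcalcenergy : nstlist;
        }
        return nstglobalcomm;
    }

    int main(void)
    {
        int  gcom = effective_gcom(-1, 100, 20);   /* -gcom not set on the command line */
        long step;

        for (step = 0; step <= 60; step++)
        {
            if (step % gcom == 0)
            {
                printf("step %ld: global communication\n", step);
            }
        }
        return 0;
    }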
Neighbor searching will be", -+ "performed for every frame, unless [TT]nstlist[tt] is zero", -+ "(see the [TT].mdp[tt] file).[PAR]", -+ "ED (essential dynamics) sampling and/or additional flooding potentials", -+ "are switched on by using the [TT]-ei[tt] flag followed by an [TT].edi[tt]", -+ "file. The [TT].edi[tt] file can be produced with the [TT]make_edi[tt] tool", -+ "or by using options in the essdyn menu of the WHAT IF program.", -+ "[TT]mdrun[tt] produces a [TT].xvg[tt] output file that", -+ "contains projections of positions, velocities and forces onto selected", -+ "eigenvectors.[PAR]", -+ "When user-defined potential functions have been selected in the", -+ "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]", -+ "a formatted table with potential functions. The file is read from", -+ "either the current directory or from the [TT]GMXLIB[tt] directory.", -+ "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,", -+ "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with", -+ "normal Coulomb.", -+ "When pair interactions are present, a separate table for pair interaction", -+ "functions is read using the [TT]-tablep[tt] option.[PAR]", -+ "When tabulated bonded functions are present in the topology,", -+ "interaction functions are read using the [TT]-tableb[tt] option.", -+ "For each different tabulated interaction type the table file name is", -+ "modified in a different way: before the file extension an underscore is", -+ "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals", -+ "and finally the table number of the interaction type.[PAR]", -+ "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM", -+ "coordinates and forces when pulling is selected", -+ "in the [TT].mdp[tt] file.[PAR]", -+ "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ", -+ "simulated in parallel.", -+ "As many input files/directories are required as the number of systems. ", -+ "The [TT]-multidir[tt] option takes a list of directories (one for each ", -+ "system) and runs in each of them, using the input/output file names, ", -+ "such as specified by e.g. the [TT]-s[tt] option, relative to these ", -+ "directories.", -+ "With [TT]-multi[tt], the system number is appended to the run input ", -+ "and each output filename, for instance [TT]topol.tpr[tt] becomes", -+ "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.", -+ "The number of ranks per system is the total number of ranks", -+ "divided by the number of systems.", -+ "One use of this option is for NMR refinement: when distance", -+ "or orientation restraints are present these can be ensemble averaged", -+ "over all the systems.[PAR]", -+ "With [TT]-replex[tt] replica exchange is attempted every given number", -+ "of steps. The number of replicas is set with the [TT]-multi[tt] or ", -+ "[TT]-multidir[tt] option, described above.", -+ "All run input files should use a different coupling temperature,", -+ "the order of the files is not important. The random seed is set with", -+ "[TT]-reseed[tt]. The velocities are scaled and neighbor searching", -+ "is performed after every exchange.[PAR]", -+ "Finally some experimental algorithms can be tested when the", -+ "appropriate options have been given. Currently under", -+ "investigation are: polarizability.", -+ "[PAR]", -+ "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed", -+ "a protein into a membrane. The data file should contain the options", -+ "that where passed to g_membed before. 
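The [TT]-multi[tt] file-naming rule mentioned above (the system number is appended to each input/output name, so [TT]topol.tpr[tt] becomes [TT]topol0.tpr[tt], [TT]topol1.tpr[tt], ...) can be sketched as a short filename helper. This is an illustration only, not GROMACS's own name-suffixing routine; the helper name is made up.

    /* Illustrative sketch: insert a -multi style system number before the file
     * extension, so "topol.tpr" becomes "topol0.tpr", "topol1.tpr", ... */
    #include <stdio.h>
    #include <string.h>

    static void multi_name(const char *fn, int sim, char *out, size_t outlen)
    {
        const char *dot = strrchr(fn, '.');

        if (dot == NULL)
        {
            snprintf(out, outlen, "%s%d", fn, sim);
        }
        else
        {
            snprintf(out, outlen, "%.*s%d%s", (int)(dot - fn), fn, sim, dot);
        }
    }

    int main(void)
    {
        char buf[256];
        int  sim;

        for (sim = 0; sim < 3; sim++)
        {
            multi_name("topol.tpr", sim, buf, sizeof(buf));
            printf("%s\n", buf);
        }
        return 0;
    }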
The [TT]-mn[tt] and [TT]-mp[tt]", -+ "both apply to this as well.", -+ "[PAR]", -+ "The option [TT]-pforce[tt] is useful when you suspect a simulation", -+ "crashes due to too large forces. With this option coordinates and", -+ "forces of atoms with a force larger than a certain value will", -+ "be printed to stderr.", -+ "[PAR]", -+ "Checkpoints containing the complete state of the system are written", -+ "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],", -+ "unless option [TT]-cpt[tt] is set to -1.", -+ "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to", -+ "make sure that a recent state of the system is always available,", -+ "even when the simulation is terminated while writing a checkpoint.", -+ "With [TT]-cpnum[tt] all checkpoint files are kept and appended", -+ "with the step number.", -+ "A simulation can be continued by reading the full state from file", -+ "with option [TT]-cpi[tt]. This option is intelligent in the way that", -+ "if no checkpoint file is found, Gromacs just assumes a normal run and", -+ "starts from the first step of the [TT].tpr[tt] file. By default the output", -+ "will be appending to the existing output files. The checkpoint file", -+ "contains checksums of all output files, such that you will never", -+ "loose data when some output files are modified, corrupt or removed.", -+ "There are three scenarios with [TT]-cpi[tt]:[PAR]", -+ "[TT]*[tt] no files with matching names are present: new output files are written[PAR]", -+ "[TT]*[tt] all files are present with names and checksums matching those stored", -+ "in the checkpoint file: files are appended[PAR]", -+ "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]", -+ "With [TT]-noappend[tt] new output files are opened and the simulation", -+ "part number is added to all output file names.", -+ "Note that in all cases the checkpoint file itself is not renamed", -+ "and will be overwritten, unless its name does not match", -+ "the [TT]-cpo[tt] option.", -+ "[PAR]", -+ "With checkpointing the output is appended to previously written", -+ "output files, unless [TT]-noappend[tt] is used or none of the previous", -+ "output files are present (except for the checkpoint file).", -+ "The integrity of the files to be appended is verified using checksums", -+ "which are stored in the checkpoint file. This ensures that output can", -+ "not be mixed up or corrupted due to file appending. When only some", -+ "of the previous output files are present, a fatal error is generated", -+ "and no old output files are modified and no new output files are opened.", -+ "The result with appending will be the same as from a single run.", -+ "The contents will be binary identical, unless you use a different number", -+ "of ranks or dynamic load balancing or the FFT library uses optimizations", -+ "through timing.", -+ "[PAR]", -+ "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint", -+ "file is written at the first neighbor search step where the run time", -+ "exceeds [TT]-maxh[tt]*0.99 hours.", -+ "[PAR]", -+ "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current", -+ "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. 
when ctrl+C is", -+ "pressed), it will stop after the next neighbor search step ", -+ "(with nstlist=0 at the next step).", -+ "In both cases all the usual output will be written to file.", -+ "When running with MPI, a signal to one of the [TT]mdrun[tt] ranks", -+ "is sufficient, this signal should not be sent to mpirun or", -+ "the [TT]mdrun[tt] process that is the parent of the others.", -+ "[PAR]", -+ "Interactive molecular dynamics (IMD) can be activated by using at least one", -+ "of the three IMD switches: The [TT]-imdterm[tt] switch allows to terminate the", -+ "simulation from the molecular viewer (e.g. VMD). With [TT]-imdwait[tt],", -+ "[TT]mdrun[tt] pauses whenever no IMD client is connected. Pulling from the", -+ "IMD remote can be turned on by [TT]-imdpull[tt].", -+ "The port [TT]mdrun[tt] listens to can be altered by [TT]-imdport[tt].The", -+ "file pointed to by [TT]-if[tt] contains atom indices and forces if IMD", -+ "pulling is used." -+ "[PAR]", -+ "When [TT]mdrun[tt] is started with MPI, it does not run niced by default." -+ }; -+ t_commrec *cr; -+ t_filenm fnm[] = { -+ { efTPX, NULL, NULL, ffREAD }, -+ { efTRN, "-o", NULL, ffWRITE }, -+ { efCOMPRESSED, "-x", NULL, ffOPTWR }, -+ { efCPT, "-cpi", NULL, ffOPTRD }, -+ { efCPT, "-cpo", NULL, ffOPTWR }, -+ { efSTO, "-c", "confout", ffWRITE }, -+ { efEDR, "-e", "ener", ffWRITE }, -+ { efLOG, "-g", "md", ffWRITE }, -+ { efXVG, "-dhdl", "dhdl", ffOPTWR }, -+ { efXVG, "-field", "field", ffOPTWR }, -+ { efXVG, "-table", "table", ffOPTRD }, -+ { efXVG, "-tabletf", "tabletf", ffOPTRD }, -+ { efXVG, "-tablep", "tablep", ffOPTRD }, -+ { efXVG, "-tableb", "table", ffOPTRD }, -+ { efTRX, "-rerun", "rerun", ffOPTRD }, -+ { efXVG, "-tpi", "tpi", ffOPTWR }, -+ { efXVG, "-tpid", "tpidist", ffOPTWR }, -+ { efEDI, "-ei", "sam", ffOPTRD }, -+ { efXVG, "-eo", "edsam", ffOPTWR }, -+ { efXVG, "-devout", "deviatie", ffOPTWR }, -+ { efXVG, "-runav", "runaver", ffOPTWR }, -+ { efXVG, "-px", "pullx", ffOPTWR }, -+ { efXVG, "-pf", "pullf", ffOPTWR }, -+ { efXVG, "-ro", "rotation", ffOPTWR }, -+ { efLOG, "-ra", "rotangles", ffOPTWR }, -+ { efLOG, "-rs", "rotslabs", ffOPTWR }, -+ { efLOG, "-rt", "rottorque", ffOPTWR }, -+ { efMTX, "-mtx", "nm", ffOPTWR }, -+ { efNDX, "-dn", "dipole", ffOPTWR }, -+ { efRND, "-multidir", NULL, ffOPTRDMULT}, -+ { efDAT, "-membed", "membed", ffOPTRD }, -+ { efTOP, "-mp", "membed", ffOPTRD }, -+ { efNDX, "-mn", "membed", ffOPTRD }, -+ { efXVG, "-if", "imdforces", ffOPTWR }, -+ { efXVG, "-swap", "swapions", ffOPTWR } -+ }; -+#define NFILE asize(fnm) -+ -+ /* Command line options ! 
*/ -+ gmx_bool bDDBondCheck = TRUE; -+ gmx_bool bDDBondComm = TRUE; -+ gmx_bool bTunePME = TRUE; -+ gmx_bool bTestVerlet = FALSE; -+ gmx_bool bVerbose = FALSE; -+ gmx_bool bCompact = TRUE; -+ gmx_bool bSepPot = FALSE; -+ gmx_bool bRerunVSite = FALSE; -+ gmx_bool bConfout = TRUE; -+ gmx_bool bReproducible = FALSE; -+ gmx_bool bIMDwait = FALSE; -+ gmx_bool bIMDterm = FALSE; -+ gmx_bool bIMDpull = FALSE; -+ -+ int npme = -1; -+ int nstlist = 0; -+ int nmultisim = 0; -+ int nstglobalcomm = -1; -+ int repl_ex_nst = 0; -+ int repl_ex_seed = -1; -+ int repl_ex_nex = 0; -+ int nstepout = 100; -+ int resetstep = -1; -+ gmx_int64_t nsteps = -2; /* the value -2 means that the mdp option will be used */ -+ int imdport = 8888; /* can be almost anything, 8888 is easy to remember */ -+ -+ rvec realddxyz = {0, 0, 0}; -+ const char *ddno_opt[ddnoNR+1] = -+ { NULL, "interleave", "pp_pme", "cartesian", NULL }; -+ const char *dddlb_opt[] = -+ { NULL, "auto", "no", "yes", NULL }; -+ const char *thread_aff_opt[threadaffNR+1] = -+ { NULL, "auto", "on", "off", NULL }; -+ const char *nbpu_opt[] = -+ { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL }; -+ real rdd = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1; -+ char *ddcsx = NULL, *ddcsy = NULL, *ddcsz = NULL; -+ real cpt_period = 15.0, max_hours = -1; -+ gmx_bool bAppendFiles = TRUE; -+ gmx_bool bKeepAndNumCPT = FALSE; -+ gmx_bool bResetCountersHalfWay = FALSE; -+ output_env_t oenv = NULL; -+ const char *deviceOptions = ""; -+ -+ /* Non transparent initialization of a complex gmx_hw_opt_t struct. -+ * But unfortunately we are not allowed to call a function here, -+ * since declarations follow below. -+ */ -+ gmx_hw_opt_t hw_opt = { -+ 0, 0, 0, 0, threadaffSEL, 0, 0, -+ { NULL, FALSE, 0, NULL } -+ }; -+ -+ t_pargs pa[] = { -+ -+ { "-dd", FALSE, etRVEC, {&realddxyz}, -+ "Domain decomposition grid, 0 is optimize" }, -+ { "-ddorder", FALSE, etENUM, {ddno_opt}, -+ "DD rank order" }, -+ { "-npme", FALSE, etINT, {&npme}, -+ "Number of separate ranks to be used for PME, -1 is guess" }, -+ { "-nt", FALSE, etINT, {&hw_opt.nthreads_tot}, -+ "Total number of threads to start (0 is guess)" }, -+ { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, -+ "Number of thread-MPI threads to start (0 is guess)" }, -+ { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, -+ "Number of OpenMP threads per MPI rank to start (0 is guess)" }, -+ { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, -+ "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" }, -+ { "-pin", FALSE, etENUM, {thread_aff_opt}, -+ "Set thread affinities" }, -+ { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset}, -+ "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" }, -+ { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, -+ "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, -+ { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id}, -+ "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" }, -+ { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck}, -+ "Check for all bonded interactions with DD" }, -+ { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm}, -+ "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" }, -+ { "-rdd", FALSE, etREAL, {&rdd}, -+ "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" }, -+ { "-rcon", FALSE, etREAL, {&rconstr}, -+ "Maximum distance for 
P-LINCS (nm), 0 is estimate" }, -+ { "-dlb", FALSE, etENUM, {dddlb_opt}, -+ "Dynamic load balancing (with DD)" }, -+ { "-dds", FALSE, etREAL, {&dlb_scale}, -+ "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to " -+ "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." }, -+ { "-ddcsx", FALSE, etSTR, {&ddcsx}, -+ "HIDDENA string containing a vector of the relative sizes in the x " -+ "direction of the corresponding DD cells. Only effective with static " -+ "load balancing." }, -+ { "-ddcsy", FALSE, etSTR, {&ddcsy}, -+ "HIDDENA string containing a vector of the relative sizes in the y " -+ "direction of the corresponding DD cells. Only effective with static " -+ "load balancing." }, -+ { "-ddcsz", FALSE, etSTR, {&ddcsz}, -+ "HIDDENA string containing a vector of the relative sizes in the z " -+ "direction of the corresponding DD cells. Only effective with static " -+ "load balancing." }, -+ { "-gcom", FALSE, etINT, {&nstglobalcomm}, -+ "Global communication frequency" }, -+ { "-nb", FALSE, etENUM, {&nbpu_opt}, -+ "Calculate non-bonded interactions on" }, -+ { "-nstlist", FALSE, etINT, {&nstlist}, -+ "Set nstlist when using a Verlet buffer tolerance (0 is guess)" }, -+ { "-tunepme", FALSE, etBOOL, {&bTunePME}, -+ "Optimize PME load between PP/PME ranks or GPU/CPU" }, -+ { "-testverlet", FALSE, etBOOL, {&bTestVerlet}, -+ "Test the Verlet non-bonded scheme" }, -+ { "-v", FALSE, etBOOL, {&bVerbose}, -+ "Be loud and noisy" }, -+ { "-compact", FALSE, etBOOL, {&bCompact}, -+ "Write a compact log file" }, -+ { "-seppot", FALSE, etBOOL, {&bSepPot}, -+ "Write separate V and dVdl terms for each interaction type and rank to the log file(s)" }, -+ { "-pforce", FALSE, etREAL, {&pforce}, -+ "Print all forces larger than this (kJ/mol nm)" }, -+ { "-reprod", FALSE, etBOOL, {&bReproducible}, -+ "Try to avoid optimizations that affect binary reproducibility" }, -+ { "-cpt", FALSE, etREAL, {&cpt_period}, -+ "Checkpoint interval (minutes)" }, -+ { "-cpnum", FALSE, etBOOL, {&bKeepAndNumCPT}, -+ "Keep and number checkpoint files" }, -+ { "-append", FALSE, etBOOL, {&bAppendFiles}, -+ "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" }, -+ { "-nsteps", FALSE, etINT64, {&nsteps}, -+ "Run this number of steps, overrides .mdp file option" }, -+ { "-maxh", FALSE, etREAL, {&max_hours}, -+ "Terminate after 0.99 times this time (hours)" }, -+ { "-multi", FALSE, etINT, {&nmultisim}, -+ "Do multiple simulations in parallel" }, -+ { "-replex", FALSE, etINT, {&repl_ex_nst}, -+ "Attempt replica exchange periodically with this period (steps)" }, -+ { "-nex", FALSE, etINT, {&repl_ex_nex}, -+ "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). -nex zero or not specified gives neighbor replica exchange." 
}, -+ { "-reseed", FALSE, etINT, {&repl_ex_seed}, -+ "Seed for replica exchange, -1 is generate a seed" }, -+ { "-imdport", FALSE, etINT, {&imdport}, -+ "HIDDENIMD listening port" }, -+ { "-imdwait", FALSE, etBOOL, {&bIMDwait}, -+ "HIDDENPause the simulation while no IMD client is connected" }, -+ { "-imdterm", FALSE, etBOOL, {&bIMDterm}, -+ "HIDDENAllow termination of the simulation from IMD client" }, -+ { "-imdpull", FALSE, etBOOL, {&bIMDpull}, -+ "HIDDENAllow pulling in the simulation from IMD client" }, -+ { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite}, -+ "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" }, -+ { "-confout", FALSE, etBOOL, {&bConfout}, -+ "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" }, -+ { "-stepout", FALSE, etINT, {&nstepout}, -+ "HIDDENFrequency of writing the remaining wall clock time for the run" }, -+ { "-resetstep", FALSE, etINT, {&resetstep}, -+ "HIDDENReset cycle counters after these many time steps" }, -+ { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay}, -+ "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" } -+ }; -+ unsigned long Flags, PCA_Flags; -+ ivec ddxyz; -+ int dd_node_order; -+ gmx_bool bAddPart; -+ FILE *fplog, *fpmulti; -+ int sim_part, sim_part_fn; -+ const char *part_suffix = ".part"; -+ char suffix[STRLEN]; -+ int rc; -+ char **multidir = NULL; -+ -+ -+ cr = init_commrec(); -+ -+ PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET)); -+ -+ /* Comment this in to do fexist calls only on master -+ * works not with rerun or tables at the moment -+ * also comment out the version of init_forcerec in md.c -+ * with NULL instead of opt2fn -+ */ -+ /* -+ if (!MASTER(cr)) -+ { -+ PCA_Flags |= PCA_NOT_READ_NODE; -+ } -+ */ -+ -+ if (!parse_common_args(&argc, argv, PCA_Flags, NFILE, fnm, asize(pa), pa, -+ asize(desc), desc, 0, NULL, &oenv)) -+ { -+ return 0; -+ } -+ -+ -+ /* we set these early because they might be used in init_multisystem() -+ Note that there is the potential for npme>nnodes until the number of -+ threads is set later on, if there's thread parallelization. That shouldn't -+ lead to problems. */ -+ dd_node_order = nenum(ddno_opt); -+ cr->npmenodes = npme; -+ -+ hw_opt.thread_affinity = nenum(thread_aff_opt); -+ -+ /* now check the -multi and -multidir option */ -+ if (opt2bSet("-multidir", NFILE, fnm)) -+ { -+ if (nmultisim > 0) -+ { -+ gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive."); -+ } -+ nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm); -+ } -+ -+ -+ if (repl_ex_nst != 0 && nmultisim < 2) -+ { -+ gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multi)"); -+ } -+ -+ if (repl_ex_nex < 0) -+ { -+ gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive"); -+ } -+ -+ if (nmultisim > 1) -+ { -+#ifndef GMX_THREAD_MPI -+ gmx_bool bParFn = (multidir == NULL); -+ init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn); -+#else -+ gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library. 
" -+ "Please compile GROMACS with MPI support"); -+#endif -+ } -+ -+ bAddPart = !bAppendFiles; -+ -+ /* Check if there is ANY checkpoint file available */ -+ sim_part = 1; -+ sim_part_fn = sim_part; -+ if (opt2bSet("-cpi", NFILE, fnm)) -+ { -+ if (bSepPot && bAppendFiles) -+ { -+ gmx_fatal(FARGS, "Output file appending is not supported with -seppot"); -+ } -+ -+ bAppendFiles = -+ read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE, -+ fnm, cr), -+ &sim_part_fn, NULL, cr, -+ bAppendFiles, NFILE, fnm, -+ part_suffix, &bAddPart); -+ if (sim_part_fn == 0 && MULTIMASTER(cr)) -+ { -+ fprintf(stdout, "No previous checkpoint file present, assuming this is a new run.\n"); -+ } -+ else -+ { -+ sim_part = sim_part_fn + 1; -+ } -+ -+ if (MULTISIM(cr) && MASTER(cr)) -+ { -+ if (MULTIMASTER(cr)) -+ { -+ /* Log file is not yet available, so if there's a -+ * problem we can only write to stderr. */ -+ fpmulti = stderr; -+ } -+ else -+ { -+ fpmulti = NULL; -+ } -+ check_multi_int(fpmulti, cr->ms, sim_part, "simulation part", TRUE); -+ } -+ } -+ else -+ { -+ bAppendFiles = FALSE; -+ } -+ -+ if (!bAppendFiles) -+ { -+ sim_part_fn = sim_part; -+ } -+ -+ if (bAddPart) -+ { -+ /* Rename all output files (except checkpoint files) */ -+ /* create new part name first (zero-filled) */ -+ sprintf(suffix, "%s%04d", part_suffix, sim_part_fn); -+ -+ add_suffix_to_output_names(fnm, NFILE, suffix); -+ if (MULTIMASTER(cr)) -+ { -+ fprintf(stdout, "Checkpoint file is from part %d, new output files will be suffixed '%s'.\n", sim_part-1, suffix); -+ } -+ } -+ -+ Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0; -+ Flags = Flags | (bSepPot ? MD_SEPPOT : 0); -+ Flags = Flags | (bDDBondCheck ? MD_DDBONDCHECK : 0); -+ Flags = Flags | (bDDBondComm ? MD_DDBONDCOMM : 0); -+ Flags = Flags | (bTunePME ? MD_TUNEPME : 0); -+ Flags = Flags | (bTestVerlet ? MD_TESTVERLET : 0); -+ Flags = Flags | (bConfout ? MD_CONFOUT : 0); -+ Flags = Flags | (bRerunVSite ? MD_RERUN_VSITE : 0); -+ Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0); -+ Flags = Flags | (bAppendFiles ? MD_APPENDFILES : 0); -+ Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0); -+ Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0); -+ Flags = Flags | (sim_part > 1 ? MD_STARTFROMCPT : 0); -+ Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0); -+ Flags = Flags | (bIMDwait ? MD_IMDWAIT : 0); -+ Flags = Flags | (bIMDterm ? MD_IMDTERM : 0); -+ Flags = Flags | (bIMDpull ? MD_IMDPULL : 0); -+ -+ /* We postpone opening the log file if we are appending, so we can -+ first truncate the old log file and append to the correct position -+ there instead. 
*/ -+ if ((MASTER(cr) || bSepPot) && !bAppendFiles) -+ { -+ gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, -+ !bSepPot, Flags & MD_APPENDFILES, &fplog); -+ please_cite(fplog, "Hess2008b"); -+ please_cite(fplog, "Spoel2005a"); -+ please_cite(fplog, "Lindahl2001a"); -+ please_cite(fplog, "Berendsen95a"); -+ } -+ else if (!MASTER(cr) && bSepPot) -+ { -+ gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, !bSepPot, Flags, &fplog); -+ } -+ else -+ { -+ fplog = NULL; -+ } -+ -+ ddxyz[XX] = (int)(realddxyz[XX] + 0.5); -+ ddxyz[YY] = (int)(realddxyz[YY] + 0.5); -+ ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); -+ -+ rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact, -+ nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr, -+ dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz, -+ nbpu_opt[0], nstlist, -+ nsteps, nstepout, resetstep, -+ nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed, -+ pforce, cpt_period, max_hours, deviceOptions, imdport, Flags); -+ -+ /* Log file has to be closed in mdrunner if we are appending to it -+ (fplog not set here) */ -+ if (MASTER(cr) && !bAppendFiles) -+ { -+ gmx_log_close(fplog); -+ } -+ -+ return rc; -+} -diff --git a/src/programs/mdrun/repl_ex.c b/src/programs/mdrun/repl_ex.c -index 46a9bc0..cfb0b7f 100644 ---- a/src/programs/mdrun/repl_ex.c -+++ b/src/programs/mdrun/repl_ex.c -@@ -51,6 +51,12 @@ - #include "domdec.h" - #include "gromacs/random/random.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+/* END PLUMED */ -+ - #define PROBABILITYCUTOFF 100 - /* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ - -@@ -112,14 +118,16 @@ static gmx_bool repl_quantity(const gmx_multisim_t *ms, - qall[re->repl] = q; - gmx_sum_sim(ms->nsim, qall, ms); - -- bDiff = FALSE; -- for (s = 1; s < ms->nsim; s++) -- { -- if (qall[s] != qall[0]) -- { -+ /* PLUMED */ -+ //bDiff = FALSE; -+ //for (s = 1; s < ms->nsim; s++) -+ //{ -+ // if (qall[s] != qall[0]) -+ // { - bDiff = TRUE; -- } -- } -+ // } -+ //} -+ /* END PLUMED */ - - if (bDiff) - { -@@ -269,6 +277,10 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - re->ind[i] = i; - } - -+ /* PLUMED */ -+ // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD) -+ // in those cases replicas can share the same temperature. -+ /* - if (re->type < ereENDSINGLE) - { - -@@ -277,11 +289,12 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - for (j = i+1; j < re->nrepl; j++) - { - if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) -- { -+ {*/ - /* Unordered replicas are supposed to work, but there - * is still an issues somewhere. - * Note that at this point still re->ind[i]=i. - */ -+ /* - gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", - i, j, - erename[re->type], -@@ -299,6 +312,8 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - } - } - } -+ */ -+ /* END PLUMED */ - - /* keep track of all the swaps, starting with the initial placement. 
*/ - snew(re->allswaps, re->nrepl); -@@ -982,6 +997,10 @@ test_for_replica_exchange(FILE *fplog, - pind[i] = re->ind[i]; - } - -+ /* PLUMED */ -+ int plumed_test_exchange_pattern=0; -+ /* END PLUMED */ -+ - if (bMultiEx) - { - /* multiple random switch exchange */ -@@ -1057,6 +1076,31 @@ test_for_replica_exchange(FILE *fplog, - /* standard nearest neighbor replica exchange */ - - m = (step / re->nst) % 2; -+ /* PLUMED */ -+ if(plumedswitch){ -+ int partner=re->repl; -+ plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern); -+ if(plumed_test_exchange_pattern>0){ -+ int *list; -+ snew(list,re->nrepl); -+ plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl)); -+ plumed_cmd(plumedmain,"getExchangesList",list); -+ for(i=0; inrepl; i++) re->ind[i]=list[i]; -+ sfree(list); -+ } -+ -+ for(i=1; inrepl; i++) { -+ if (i % 2 != m) continue; -+ a = re->ind[i-1]; -+ b = re->ind[i]; -+ if(re->repl==a) partner=b; -+ if(re->repl==b) partner=a; -+ } -+ plumed_cmd(plumedmain,"GREX setPartner",&partner); -+ plumed_cmd(plumedmain,"GREX calculate",NULL); -+ plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL); -+ } -+ /* END PLUMED */ - for (i = 1; i < re->nrepl; i++) - { - a = re->ind[i-1]; -@@ -1066,6 +1110,18 @@ test_for_replica_exchange(FILE *fplog, - if (i % 2 == m) - { - delta = calc_delta(fplog, bPrint, re, a, b, a, b); -+ /* PLUMED */ -+ if(plumedswitch){ -+ real adb,bdb,dplumed; -+ char buf[300]; -+ sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb); -+ sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb); -+ dplumed=adb*re->beta[a]+bdb*re->beta[b]; -+ delta+=dplumed; -+ if (bPrint) -+ fprintf(fplog,"dplumed = %10.3e dE_Term = %10.3e (kT)\n",dplumed,delta); -+ } -+ /* END PLUMED */ - if (delta <= 0) - { - /* accepted */ -@@ -1092,11 +1148,22 @@ test_for_replica_exchange(FILE *fplog, - - if (bEx[i]) - { -+ /* PLUMED */ -+ if(!plumed_test_exchange_pattern) { -+ /* standard neighbour swapping */ - /* swap these two */ - tmp = pind[i-1]; - pind[i-1] = pind[i]; - pind[i] = tmp; - re->nexchange[i]++; /* statistics for back compatibility */ -+ } else { -+ /* alternative swapping patterns */ -+ tmp = pind[a]; -+ pind[a] = pind[b]; -+ pind[b] = tmp; -+ re->nexchange[i]++; /* statistics for back compatibility */ -+ } -+ /* END PLUMED */ - } - } - else -@@ -1112,6 +1179,15 @@ test_for_replica_exchange(FILE *fplog, - re->nattempt[m]++; - } - -+ /* PLUMED */ -+ if(plumed_test_exchange_pattern>0) { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->ind[i] = i; -+ } -+ } -+ /* END PLUMED */ -+ - /* record which moves were made and accepted */ - for (i = 0; i < re->nrepl; i++) - { -@@ -1316,6 +1392,10 @@ gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex * - /* The order in which multiple exchanges will occur. */ - gmx_bool bThisReplicaExchanged = FALSE; - -+ /* PLUMED */ -+ if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL); -+ /* END PLUMED */ -+ - if (MASTER(cr)) - { - replica_id = re->repl; -diff --git a/src/programs/mdrun/repl_ex.c.preplumed b/src/programs/mdrun/repl_ex.c.preplumed -new file mode 100644 -index 0000000..46a9bc0 ---- /dev/null -+++ b/src/programs/mdrun/repl_ex.c.preplumed -@@ -0,0 +1,1439 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. 
-+ * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include "repl_ex.h" -+#include "network.h" -+#include "gromacs/random/random.h" -+#include "gromacs/utility/smalloc.h" -+#include "physics.h" -+#include "copyrite.h" -+#include "macros.h" -+#include "vec.h" -+#include "names.h" -+#include "domdec.h" -+#include "gromacs/random/random.h" -+ -+#define PROBABILITYCUTOFF 100 -+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ -+ -+enum { -+ ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR -+}; -+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"}; -+/* end_single_marker merely notes the end of single variable replica exchange. All types higher than -+ it are multiple replica exchange methods */ -+/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?; -+ Let's wait until we feel better about the pressure control methods giving exact ensembles. 
Right now, we assume constant pressure */ -+ -+typedef struct gmx_repl_ex -+{ -+ int repl; -+ int nrepl; -+ real temp; -+ int type; -+ real **q; -+ gmx_bool bNPT; -+ real *pres; -+ int *ind; -+ int *allswaps; -+ int nst; -+ int nex; -+ int seed; -+ int nattempt[2]; -+ real *prob_sum; -+ int **nmoves; -+ int *nexchange; -+ gmx_rng_t rng; -+ -+ /* these are helper arrays for replica exchange; allocated here so they -+ don't have to be allocated each time */ -+ int *destinations; -+ int **cyclic; -+ int **order; -+ int *tmpswap; -+ gmx_bool *incycle; -+ gmx_bool *bEx; -+ -+ /* helper arrays to hold the quantities that are exchanged */ -+ real *prob; -+ real *Epot; -+ real *beta; -+ real *Vol; -+ real **de; -+ -+} t_gmx_repl_ex; -+ -+static gmx_bool repl_quantity(const gmx_multisim_t *ms, -+ struct gmx_repl_ex *re, int ere, real q) -+{ -+ real *qall; -+ gmx_bool bDiff; -+ int i, s; -+ -+ snew(qall, ms->nsim); -+ qall[re->repl] = q; -+ gmx_sum_sim(ms->nsim, qall, ms); -+ -+ bDiff = FALSE; -+ for (s = 1; s < ms->nsim; s++) -+ { -+ if (qall[s] != qall[0]) -+ { -+ bDiff = TRUE; -+ } -+ } -+ -+ if (bDiff) -+ { -+ /* Set the replica exchange type and quantities */ -+ re->type = ere; -+ -+ snew(re->q[ere], re->nrepl); -+ for (s = 0; s < ms->nsim; s++) -+ { -+ re->q[ere][s] = qall[s]; -+ } -+ } -+ sfree(qall); -+ return bDiff; -+} -+ -+gmx_repl_ex_t init_replica_exchange(FILE *fplog, -+ const gmx_multisim_t *ms, -+ const t_state *state, -+ const t_inputrec *ir, -+ int nst, int nex, int init_seed) -+{ -+ real temp, pres; -+ int i, j, k; -+ struct gmx_repl_ex *re; -+ gmx_bool bTemp; -+ gmx_bool bLambda = FALSE; -+ -+ fprintf(fplog, "\nInitializing Replica Exchange\n"); -+ -+ if (ms == NULL || ms->nsim == 1) -+ { -+ gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?"); -+ } -+ if (!EI_DYNAMICS(ir->eI)) -+ { -+ gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations"); -+ /* Note that PAR(cr) is defined by cr->nnodes > 1, which is -+ * distinct from MULTISIM(cr). A multi-simulation only runs -+ * with real MPI parallelism, but this does not imply PAR(cr) -+ * is true! -+ * -+ * Since we are using a dynamical integrator, the only -+ * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are -+ * synonymous. The only way for cr->nnodes > 1 to be true is -+ * if we are using DD. 
*/ -+ } -+ -+ snew(re, 1); -+ -+ re->repl = ms->sim; -+ re->nrepl = ms->nsim; -+ snew(re->q, ereENDSINGLE); -+ -+ fprintf(fplog, "Repl There are %d replicas:\n", re->nrepl); -+ -+ check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE); -+ check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE); -+ check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE); -+ check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst, -+ "first exchange step: init_step/-replex", FALSE); -+ check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE); -+ check_multi_int(fplog, ms, ir->opts.ngtc, -+ "the number of temperature coupling groups", FALSE); -+ check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE); -+ check_multi_int(fplog, ms, ir->efep, "free energy", FALSE); -+ check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE); -+ -+ re->temp = ir->opts.ref_t[0]; -+ for (i = 1; (i < ir->opts.ngtc); i++) -+ { -+ if (ir->opts.ref_t[i] != re->temp) -+ { -+ fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); -+ fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); -+ } -+ } -+ -+ re->type = -1; -+ bTemp = repl_quantity(ms, re, ereTEMP, re->temp); -+ if (ir->efep != efepNO) -+ { -+ bLambda = repl_quantity(ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state); -+ } -+ if (re->type == -1) /* nothing was assigned */ -+ { -+ gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl); -+ } -+ if (bLambda && bTemp) -+ { -+ re->type = ereTL; -+ } -+ -+ if (bTemp) -+ { -+ please_cite(fplog, "Sugita1999a"); -+ if (ir->epc != epcNO) -+ { -+ re->bNPT = TRUE; -+ fprintf(fplog, "Repl Using Constant Pressure REMD.\n"); -+ please_cite(fplog, "Okabe2001a"); -+ } -+ if (ir->etc == etcBERENDSEN) -+ { -+ gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead", -+ ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE)); -+ } -+ } -+ if (bLambda) -+ { -+ if (ir->fepvals->delta_lambda != 0) /* check this? */ -+ { -+ gmx_fatal(FARGS, "delta_lambda is not zero"); -+ } -+ } -+ if (re->bNPT) -+ { -+ snew(re->pres, re->nrepl); -+ if (ir->epct == epctSURFACETENSION) -+ { -+ pres = ir->ref_p[ZZ][ZZ]; -+ } -+ else -+ { -+ pres = 0; -+ j = 0; -+ for (i = 0; i < DIM; i++) -+ { -+ if (ir->compress[i][i] != 0) -+ { -+ pres += ir->ref_p[i][i]; -+ j++; -+ } -+ } -+ pres /= j; -+ } -+ re->pres[re->repl] = pres; -+ gmx_sum_sim(re->nrepl, re->pres, ms); -+ } -+ -+ /* Make an index for increasing replica order */ -+ /* only makes sense if one or the other is varying, not both! -+ if both are varying, we trust the order the person gave. */ -+ snew(re->ind, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->ind[i] = i; -+ } -+ -+ if (re->type < ereENDSINGLE) -+ { -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = i+1; j < re->nrepl; j++) -+ { -+ if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) -+ { -+ /* Unordered replicas are supposed to work, but there -+ * is still an issues somewhere. -+ * Note that at this point still re->ind[i]=i. 
-+ */ -+ gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", -+ i, j, -+ erename[re->type], -+ re->q[re->type][i], re->q[re->type][j], -+ erename[re->type]); -+ -+ k = re->ind[i]; -+ re->ind[i] = re->ind[j]; -+ re->ind[j] = k; -+ } -+ else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]]) -+ { -+ gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]); -+ } -+ } -+ } -+ } -+ -+ /* keep track of all the swaps, starting with the initial placement. */ -+ snew(re->allswaps, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->allswaps[i] = re->ind[i]; -+ } -+ -+ switch (re->type) -+ { -+ case ereTEMP: -+ fprintf(fplog, "\nReplica exchange in temperature\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ case ereLAMBDA: -+ fprintf(fplog, "\nReplica exchange in lambda\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ case ereTL: -+ fprintf(fplog, "\nReplica exchange in temperature and lambda state\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ default: -+ gmx_incons("Unknown replica exchange quantity"); -+ } -+ if (re->bNPT) -+ { -+ fprintf(fplog, "\nRepl p"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.2f", re->pres[re->ind[i]]); -+ } -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]])) -+ { -+ fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); -+ fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); -+ } -+ } -+ } -+ re->nst = nst; -+ if (init_seed == -1) -+ { -+ if (MASTERSIM(ms)) -+ { -+ re->seed = (int)gmx_rng_make_seed(); -+ } -+ else -+ { -+ re->seed = 0; -+ } -+ gmx_sumi_sim(1, &(re->seed), ms); -+ } -+ else -+ { -+ re->seed = init_seed; -+ } -+ fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst); -+ fprintf(fplog, "\nReplica random seed: %d\n", re->seed); -+ re->rng = gmx_rng_init(re->seed); -+ -+ re->nattempt[0] = 0; -+ re->nattempt[1] = 0; -+ -+ snew(re->prob_sum, re->nrepl); -+ snew(re->nexchange, re->nrepl); -+ snew(re->nmoves, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->nmoves[i], re->nrepl); -+ } -+ fprintf(fplog, "Replica exchange information below: x=exchange, pr=probability\n"); -+ -+ /* generate space for the helper functions so we don't have to snew each time */ -+ -+ snew(re->destinations, re->nrepl); -+ snew(re->incycle, re->nrepl); -+ snew(re->tmpswap, re->nrepl); -+ snew(re->cyclic, re->nrepl); -+ snew(re->order, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->cyclic[i], re->nrepl); -+ snew(re->order[i], re->nrepl); -+ } -+ /* allocate space for the functions storing the data for the replicas */ -+ /* not all of these arrays needed in all cases, but they don't take -+ up much space, since the max size is nrepl**2 */ -+ snew(re->prob, re->nrepl); -+ snew(re->bEx, re->nrepl); -+ snew(re->beta, re->nrepl); -+ snew(re->Vol, re->nrepl); -+ snew(re->Epot, re->nrepl); -+ snew(re->de, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->de[i], re->nrepl); 
-+ } -+ re->nex = nex; -+ return re; -+} -+ -+static void exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n) -+{ -+ real *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+ -+static void exchange_ints(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, int *v, int n) -+{ -+ int *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n) -+{ -+ double *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n) -+{ -+ rvec *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v[0], n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, -+ buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ copy_rvec(buf[i], v[i]); -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state) -+{ -+ /* When t_state changes, this code should be updated. 
*/ -+ int ngtc, nnhpres; -+ ngtc = state->ngtc * state->nhchainlength; -+ nnhpres = state->nnhpres* state->nhchainlength; -+ exchange_rvecs(ms, b, state->box, DIM); -+ exchange_rvecs(ms, b, state->box_rel, DIM); -+ exchange_rvecs(ms, b, state->boxv, DIM); -+ exchange_reals(ms, b, &(state->veta), 1); -+ exchange_reals(ms, b, &(state->vol0), 1); -+ exchange_rvecs(ms, b, state->svir_prev, DIM); -+ exchange_rvecs(ms, b, state->fvir_prev, DIM); -+ exchange_rvecs(ms, b, state->pres_prev, DIM); -+ exchange_doubles(ms, b, state->nosehoover_xi, ngtc); -+ exchange_doubles(ms, b, state->nosehoover_vxi, ngtc); -+ exchange_doubles(ms, b, state->nhpres_xi, nnhpres); -+ exchange_doubles(ms, b, state->nhpres_vxi, nnhpres); -+ exchange_doubles(ms, b, state->therm_integral, state->ngtc); -+ exchange_rvecs(ms, b, state->x, state->natoms); -+ exchange_rvecs(ms, b, state->v, state->natoms); -+ exchange_rvecs(ms, b, state->sd_X, state->natoms); -+} -+ -+static void copy_rvecs(rvec *s, rvec *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ copy_rvec(s[i], d[i]); -+ } -+ } -+} -+ -+static void copy_doubles(const double *s, double *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+static void copy_reals(const real *s, real *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+static void copy_ints(const int *s, int *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+#define scopy_rvecs(v, n) copy_rvecs(state->v, state_local->v, n); -+#define scopy_doubles(v, n) copy_doubles(state->v, state_local->v, n); -+#define scopy_reals(v, n) copy_reals(state->v, state_local->v, n); -+#define scopy_ints(v, n) copy_ints(state->v, state_local->v, n); -+ -+static void copy_state_nonatomdata(t_state *state, t_state *state_local) -+{ -+ /* When t_state changes, this code should be updated. 
*/ -+ int ngtc, nnhpres; -+ ngtc = state->ngtc * state->nhchainlength; -+ nnhpres = state->nnhpres* state->nhchainlength; -+ scopy_rvecs(box, DIM); -+ scopy_rvecs(box_rel, DIM); -+ scopy_rvecs(boxv, DIM); -+ state_local->veta = state->veta; -+ state_local->vol0 = state->vol0; -+ scopy_rvecs(svir_prev, DIM); -+ scopy_rvecs(fvir_prev, DIM); -+ scopy_rvecs(pres_prev, DIM); -+ scopy_doubles(nosehoover_xi, ngtc); -+ scopy_doubles(nosehoover_vxi, ngtc); -+ scopy_doubles(nhpres_xi, nnhpres); -+ scopy_doubles(nhpres_vxi, nnhpres); -+ scopy_doubles(therm_integral, state->ngtc); -+ scopy_rvecs(x, state->natoms); -+ scopy_rvecs(v, state->natoms); -+ scopy_rvecs(sd_X, state->natoms); -+ copy_ints(&(state->fep_state), &(state_local->fep_state), 1); -+ scopy_reals(lambda, efptNR); -+} -+ -+static void scale_velocities(t_state *state, real fac) -+{ -+ int i; -+ -+ if (state->v) -+ { -+ for (i = 0; i < state->natoms; i++) -+ { -+ svmul(fac, state->v[i], state->v[i]); -+ } -+ } -+} -+ -+static void print_transition_matrix(FILE *fplog, int n, int **nmoves, int *nattempt) -+{ -+ int i, j, ntot; -+ float Tprint; -+ -+ ntot = nattempt[0] + nattempt[1]; -+ fprintf(fplog, "\n"); -+ fprintf(fplog, "Repl"); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, " "); /* put the title closer to the center */ -+ } -+ fprintf(fplog, "Empirical Transition Matrix\n"); -+ -+ fprintf(fplog, "Repl"); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%8d", (i+1)); -+ } -+ fprintf(fplog, "\n"); -+ -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "Repl"); -+ for (j = 0; j < n; j++) -+ { -+ Tprint = 0.0; -+ if (nmoves[i][j] > 0) -+ { -+ Tprint = nmoves[i][j]/(2.0*ntot); -+ } -+ fprintf(fplog, "%8.4f", Tprint); -+ } -+ fprintf(fplog, "%3d\n", i); -+ } -+} -+ -+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx) -+{ -+ int i; -+ -+ fprintf(fplog, "Repl %2s %2d", leg, ind[0]); -+ for (i = 1; i < n; i++) -+ { -+ fprintf(fplog, " %c %2d", (bEx != 0 && bEx[i]) ? 'x' : ' ', ind[i]); -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap) -+{ -+ int i; -+ -+ for (i = 0; i < n; i++) -+ { -+ tmpswap[i] = allswaps[i]; -+ } -+ for (i = 0; i < n; i++) -+ { -+ allswaps[i] = tmpswap[pind[i]]; -+ } -+ -+ fprintf(fplog, "\nAccepted Exchanges: "); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%d ", pind[i]); -+ } -+ fprintf(fplog, "\n"); -+ -+ /* the "Order After Exchange" is the state label corresponding to the configuration that -+ started in state listed in order, i.e. -+ -+ 3 0 1 2 -+ -+ means that the: -+ configuration starting in simulation 3 is now in simulation 0, -+ configuration starting in simulation 0 is now in simulation 1, -+ configuration starting in simulation 1 is now in simulation 2, -+ configuration starting in simulation 2 is now in simulation 3 -+ */ -+ fprintf(fplog, "Order After Exchange: "); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%d ", allswaps[i]); -+ } -+ fprintf(fplog, "\n\n"); -+} -+ -+static void print_prob(FILE *fplog, const char *leg, int n, real *prob) -+{ -+ int i; -+ char buf[8]; -+ -+ fprintf(fplog, "Repl %2s ", leg); -+ for (i = 1; i < n; i++) -+ { -+ if (prob[i] >= 0) -+ { -+ sprintf(buf, "%4.2f", prob[i]); -+ fprintf(fplog, " %3s", buf[0] == '1' ? 
"1.0" : buf+1); -+ } -+ else -+ { -+ fprintf(fplog, " "); -+ } -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static void print_count(FILE *fplog, const char *leg, int n, int *count) -+{ -+ int i; -+ -+ fprintf(fplog, "Repl %2s ", leg); -+ for (i = 1; i < n; i++) -+ { -+ fprintf(fplog, " %4d", count[i]); -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp) -+{ -+ -+ real ediff, dpV, delta = 0; -+ real *Epot = re->Epot; -+ real *Vol = re->Vol; -+ real **de = re->de; -+ real *beta = re->beta; -+ -+ /* Two cases; we are permuted and not. In all cases, setting ap = a and bp = b will reduce -+ to the non permuted case */ -+ -+ switch (re->type) -+ { -+ case ereTEMP: -+ /* -+ * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439 -+ */ -+ ediff = Epot[b] - Epot[a]; -+ delta = -(beta[bp] - beta[ap])*ediff; -+ break; -+ case ereLAMBDA: -+ /* two cases: when we are permuted, and not. */ -+ /* non-permuted: -+ ediff = E_new - E_old -+ = [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)] -+ = [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)] -+ = de[b][a] + de[a][b] */ -+ -+ /* permuted: -+ ediff = E_new - E_old -+ = [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)] -+ = [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)] -+ = [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)] -+ = [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)] -+ = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */ -+ /* but, in the current code implementation, we flip configurations, not indices . . . -+ So let's examine that. -+ = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)] -+ = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_pb)] -+ = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp] -+ So, if we exchange b<=> bp and a<=> ap, we return to the same result. -+ So the simple solution is to flip the -+ position of perturbed and original indices in the tests. -+ */ -+ -+ ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]); -+ delta = ediff*beta[a]; /* assume all same temperature in this case */ -+ break; -+ case ereTL: -+ /* not permuted: */ -+ /* delta = reduced E_new - reduced E_old -+ = [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)] -+ = [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)] -+ = [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] + -+ [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)] -+ = [beta_b dH_b(x_a) + [beta_a dH_a(x_b) + -+ beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b)) -+ = beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */ -+ /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */ -+ /* permuted (big breath!) 
*/ -+ /* delta = reduced E_new - reduced E_old -+ = [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)] -+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] -+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] -+ - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a) -+ - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b) -+ = [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] + -+ [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))] -+ + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b) -+ = [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] + -+ [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))] -+ + beta_pb (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b)) -+ = ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b] - beta_bp de[bp][b]) -+ + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b)) */ -+ delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]); -+ break; -+ default: -+ gmx_incons("Unknown replica exchange quantity"); -+ } -+ if (bPrint) -+ { -+ fprintf(fplog, "Repl %d <-> %d dE_term = %10.3e (kT)\n", a, b, delta); -+ } -+ if (re->bNPT) -+ { -+ /* revist the calculation for 5.0. Might be some improvements. */ -+ dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC; -+ if (bPrint) -+ { -+ fprintf(fplog, " dpV = %10.3e d = %10.3e\n", dpV, delta + dpV); -+ } -+ delta += dpV; -+ } -+ return delta; -+} -+ -+static void -+test_for_replica_exchange(FILE *fplog, -+ const gmx_multisim_t *ms, -+ struct gmx_repl_ex *re, -+ gmx_enerdata_t *enerd, -+ real vol, -+ gmx_int64_t step, -+ real time) -+{ -+ int m, i, j, a, b, ap, bp, i0, i1, tmp; -+ real ediff = 0, delta = 0, dpV = 0; -+ gmx_bool bPrint, bMultiEx; -+ gmx_bool *bEx = re->bEx; -+ real *prob = re->prob; -+ int *pind = re->destinations; /* permuted index */ -+ gmx_bool bEpot = FALSE; -+ gmx_bool bDLambda = FALSE; -+ gmx_bool bVol = FALSE; -+ gmx_rng_t rng; -+ -+ bMultiEx = (re->nex > 1); /* multiple exchanges at each state */ -+ fprintf(fplog, "Replica exchange at step " "%"GMX_PRId64 " time %.5f\n", step, time); -+ -+ if (re->bNPT) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->Vol[i] = 0; -+ } -+ bVol = TRUE; -+ re->Vol[re->repl] = vol; -+ } -+ if ((re->type == ereTEMP || re->type == ereTL)) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->Epot[i] = 0; -+ } -+ bEpot = TRUE; -+ re->Epot[re->repl] = enerd->term[F_EPOT]; -+ /* temperatures of different states*/ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ); -+ } -+ } -+ else -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->beta[i] = 1.0/(re->temp*BOLTZ); /* we have a single temperature */ -+ } -+ } -+ if (re->type == ereLAMBDA || re->type == ereTL) -+ { -+ bDLambda = TRUE; -+ /* lambda differences. 
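For the common temperature-only case handled by calc_delta() above (ereTEMP), the exchange quantity reduces to the standard parallel-tempering expression from Okabe et al., delta = (beta_a - beta_b)*(Epot_b - Epot_a), and a swap is accepted with probability min(1, exp(-delta)). A self-contained sketch of just that case follows; the function names, the Boltzmann constant value, and the numbers in main() are illustrative and not taken from the GROMACS sources.

    #include <math.h>
    #include <stdio.h>

    /* Boltzmann constant in GROMACS energy units, kJ/(mol K) -- approximate
     * value, assumed here only to make the example runnable. */
    #define BOLTZ 0.0083144621

    /* Exchange quantity for two temperature replicas:
     *   delta = (beta_a - beta_b) * (Epot_b - Epot_a)
     * which is what calc_delta() evaluates in the ereTEMP branch. */
    static double temp_exchange_delta(double temp_a, double temp_b,
                                      double epot_a, double epot_b)
    {
        double beta_a = 1.0 / (BOLTZ * temp_a);
        double beta_b = 1.0 / (BOLTZ * temp_b);

        return (beta_a - beta_b) * (epot_b - epot_a);
    }

    int main(void)
    {
        /* Illustrative numbers: neighbouring replicas at 300 K and 310 K. */
        double delta = temp_exchange_delta(300.0, 310.0, -5.000e4, -4.990e4);

        printf("delta = %.3f (in units of kT), p(accept) = %.3f\n",
               delta, delta <= 0 ? 1.0 : exp(-delta));
        return 0;
    }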
*/ -+ /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian -+ minus the energy of the jth simulation in the jth Hamiltonian */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = 0; j < re->nrepl; j++) -+ { -+ re->de[i][j] = 0; -+ } -+ } -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]); -+ } -+ } -+ -+ /* now actually do the communication */ -+ if (bVol) -+ { -+ gmx_sum_sim(re->nrepl, re->Vol, ms); -+ } -+ if (bEpot) -+ { -+ gmx_sum_sim(re->nrepl, re->Epot, ms); -+ } -+ if (bDLambda) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ gmx_sum_sim(re->nrepl, re->de[i], ms); -+ } -+ } -+ -+ /* make a duplicate set of indices for shuffling */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ pind[i] = re->ind[i]; -+ } -+ -+ if (bMultiEx) -+ { -+ /* multiple random switch exchange */ -+ int nself = 0; -+ for (i = 0; i < re->nex + nself; i++) -+ { -+ double rnd[2]; -+ -+ gmx_rng_cycle_2uniform(step, i*2, re->seed, RND_SEED_REPLEX, rnd); -+ /* randomly select a pair */ -+ /* in theory, could reduce this by identifying only which switches had a nonneglibible -+ probability of occurring (log p > -100) and only operate on those switches */ -+ /* find out which state it is from, and what label that state currently has. Likely -+ more work that useful. */ -+ i0 = (int)(re->nrepl*rnd[0]); -+ i1 = (int)(re->nrepl*rnd[1]); -+ if (i0 == i1) -+ { -+ nself++; -+ continue; /* self-exchange, back up and do it again */ -+ } -+ -+ a = re->ind[i0]; /* what are the indices of these states? */ -+ b = re->ind[i1]; -+ ap = pind[i0]; -+ bp = pind[i1]; -+ -+ bPrint = FALSE; /* too noisy */ -+ /* calculate the energy difference */ -+ /* if the code changes to flip the STATES, rather than the configurations, -+ use the commented version of the code */ -+ /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */ -+ delta = calc_delta(fplog, bPrint, re, ap, bp, a, b); -+ -+ /* we actually only use the first space in the prob and bEx array, -+ since there are actually many switches between pairs. 
*/ -+ -+ if (delta <= 0) -+ { -+ /* accepted */ -+ prob[0] = 1; -+ bEx[0] = TRUE; -+ } -+ else -+ { -+ if (delta > PROBABILITYCUTOFF) -+ { -+ prob[0] = 0; -+ } -+ else -+ { -+ prob[0] = exp(-delta); -+ } -+ /* roll a number to determine if accepted */ -+ gmx_rng_cycle_2uniform(step, i*2+1, re->seed, RND_SEED_REPLEX, rnd); -+ bEx[0] = rnd[0] < prob[0]; -+ } -+ re->prob_sum[0] += prob[0]; -+ -+ if (bEx[0]) -+ { -+ /* swap the states */ -+ tmp = pind[i0]; -+ pind[i0] = pind[i1]; -+ pind[i1] = tmp; -+ } -+ } -+ re->nattempt[0]++; /* keep track of total permutation trials here */ -+ print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap); -+ } -+ else -+ { -+ /* standard nearest neighbor replica exchange */ -+ -+ m = (step / re->nst) % 2; -+ for (i = 1; i < re->nrepl; i++) -+ { -+ a = re->ind[i-1]; -+ b = re->ind[i]; -+ -+ bPrint = (re->repl == a || re->repl == b); -+ if (i % 2 == m) -+ { -+ delta = calc_delta(fplog, bPrint, re, a, b, a, b); -+ if (delta <= 0) -+ { -+ /* accepted */ -+ prob[i] = 1; -+ bEx[i] = TRUE; -+ } -+ else -+ { -+ double rnd[2]; -+ -+ if (delta > PROBABILITYCUTOFF) -+ { -+ prob[i] = 0; -+ } -+ else -+ { -+ prob[i] = exp(-delta); -+ } -+ /* roll a number to determine if accepted */ -+ gmx_rng_cycle_2uniform(step, i, re->seed, RND_SEED_REPLEX, rnd); -+ bEx[i] = rnd[0] < prob[i]; -+ } -+ re->prob_sum[i] += prob[i]; -+ -+ if (bEx[i]) -+ { -+ /* swap these two */ -+ tmp = pind[i-1]; -+ pind[i-1] = pind[i]; -+ pind[i] = tmp; -+ re->nexchange[i]++; /* statistics for back compatibility */ -+ } -+ } -+ else -+ { -+ prob[i] = -1; -+ bEx[i] = FALSE; -+ } -+ } -+ /* print some statistics */ -+ print_ind(fplog, "ex", re->nrepl, re->ind, bEx); -+ print_prob(fplog, "pr", re->nrepl, prob); -+ fprintf(fplog, "\n"); -+ re->nattempt[m]++; -+ } -+ -+ /* record which moves were made and accepted */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->nmoves[re->ind[i]][pind[i]] += 1; -+ re->nmoves[pind[i]][re->ind[i]] += 1; -+ } -+ fflush(fplog); /* make sure we can see what the last exchange was */ -+} -+ -+static void write_debug_x(t_state *state) -+{ -+ int i; -+ -+ if (debug) -+ { -+ for (i = 0; i < state->natoms; i += 10) -+ { -+ fprintf(debug, "dx %5d %10.5f %10.5f %10.5f\n", i, state->x[i][XX], state->x[i][YY], state->x[i][ZZ]); -+ } -+ } -+} -+ -+static void -+cyclic_decomposition(const int *destinations, -+ int **cyclic, -+ gmx_bool *incycle, -+ const int nrepl, -+ int *nswap) -+{ -+ -+ int i, j, c, p; -+ int maxlen = 1; -+ for (i = 0; i < nrepl; i++) -+ { -+ incycle[i] = FALSE; -+ } -+ for (i = 0; i < nrepl; i++) /* one cycle for each replica */ -+ { -+ if (incycle[i]) -+ { -+ cyclic[i][0] = -1; -+ continue; -+ } -+ cyclic[i][0] = i; -+ incycle[i] = TRUE; -+ c = 1; -+ p = i; -+ for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */ -+ { -+ p = destinations[p]; /* start permuting */ -+ if (p == i) -+ { -+ cyclic[i][c] = -1; -+ if (c > maxlen) -+ { -+ maxlen = c; -+ } -+ break; /* we've reached the original element, the cycle is complete, and we marked the end. 
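The acceptance test that appears in both the multiple-exchange loop and the nearest-neighbour loop above is an ordinary Metropolis criterion, with a cutoff so that exp() is never evaluated for hopeless moves. A compact restatement, with rand() standing in for the counter-based gmx_rng_cycle_2uniform() draw used in the real code; the names and the cutoff value below are assumptions of this sketch.

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Beyond this reduced-energy difference exp(-delta) is treated as zero;
     * the GROMACS source guards with a similar constant (PROBABILITYCUTOFF).
     * The exact value here is an assumption. */
    #define PROB_CUTOFF 100.0

    /* Metropolis test: accept downhill moves outright, uphill moves with
     * probability exp(-delta).  `uniform` is a uniform random number in [0,1). */
    static int accept_exchange(double delta, double uniform, double *prob)
    {
        if (delta <= 0)
        {
            *prob = 1.0;
            return 1;
        }
        *prob = (delta > PROB_CUTOFF) ? 0.0 : exp(-delta);
        return uniform < *prob;
    }

    int main(void)
    {
        double prob;
        int    accepted = accept_exchange(1.3,
                                          (double)rand() / ((double)RAND_MAX + 1.0),
                                          &prob);
        printf("p(accept) = %.3f, accepted = %d\n", prob, accepted);
        return 0;
    }

Returning the probability separately mirrors how the original code accumulates prob_sum[] for the statistics printed at the end of the run.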
*/ -+ } -+ else -+ { -+ cyclic[i][c] = p; /* each permutation gives a new member of the cycle */ -+ incycle[p] = TRUE; -+ c++; -+ } -+ } -+ } -+ *nswap = maxlen - 1; -+ -+ if (debug) -+ { -+ for (i = 0; i < nrepl; i++) -+ { -+ fprintf(debug, "Cycle %d:", i); -+ for (j = 0; j < nrepl; j++) -+ { -+ if (cyclic[i][j] < 0) -+ { -+ break; -+ } -+ fprintf(debug, "%2d", cyclic[i][j]); -+ } -+ fprintf(debug, "\n"); -+ } -+ fflush(debug); -+ } -+} -+ -+static void -+compute_exchange_order(FILE *fplog, -+ int **cyclic, -+ int **order, -+ const int nrepl, -+ const int maxswap) -+{ -+ int i, j; -+ -+ for (j = 0; j < maxswap; j++) -+ { -+ for (i = 0; i < nrepl; i++) -+ { -+ if (cyclic[i][j+1] >= 0) -+ { -+ order[cyclic[i][j+1]][j] = cyclic[i][j]; -+ order[cyclic[i][j]][j] = cyclic[i][j+1]; -+ } -+ } -+ for (i = 0; i < nrepl; i++) -+ { -+ if (order[i][j] < 0) -+ { -+ order[i][j] = i; /* if it's not exchanging, it should stay this round*/ -+ } -+ } -+ } -+ -+ if (debug) -+ { -+ fprintf(fplog, "Replica Exchange Order\n"); -+ for (i = 0; i < nrepl; i++) -+ { -+ fprintf(fplog, "Replica %d:", i); -+ for (j = 0; j < maxswap; j++) -+ { -+ if (order[i][j] < 0) -+ { -+ break; -+ } -+ fprintf(debug, "%2d", order[i][j]); -+ } -+ fprintf(fplog, "\n"); -+ } -+ fflush(fplog); -+ } -+} -+ -+static void -+prepare_to_do_exchange(FILE *fplog, -+ struct gmx_repl_ex *re, -+ const int replica_id, -+ int *maxswap, -+ gmx_bool *bThisReplicaExchanged) -+{ -+ int i, j; -+ /* Hold the cyclic decomposition of the (multiple) replica -+ * exchange. */ -+ gmx_bool bAnyReplicaExchanged = FALSE; -+ *bThisReplicaExchanged = FALSE; -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ if (re->destinations[i] != re->ind[i]) -+ { -+ /* only mark as exchanged if the index has been shuffled */ -+ bAnyReplicaExchanged = TRUE; -+ break; -+ } -+ } -+ if (bAnyReplicaExchanged) -+ { -+ /* reinitialize the placeholder arrays */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = 0; j < re->nrepl; j++) -+ { -+ re->cyclic[i][j] = -1; -+ re->order[i][j] = -1; -+ } -+ } -+ -+ /* Identify the cyclic decomposition of the permutation (very -+ * fast if neighbor replica exchange). */ -+ cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap); -+ -+ /* Now translate the decomposition into a replica exchange -+ * order at each step. */ -+ compute_exchange_order(fplog, re->cyclic, re->order, re->nrepl, *maxswap); -+ -+ /* Did this replica do any exchange at any point? */ -+ for (j = 0; j < *maxswap; j++) -+ { -+ if (replica_id != re->order[replica_id][j]) -+ { -+ *bThisReplicaExchanged = TRUE; -+ break; -+ } -+ } -+ } -+} -+ -+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re, -+ t_state *state, gmx_enerdata_t *enerd, -+ t_state *state_local, gmx_int64_t step, real time) -+{ -+ int i, j; -+ int replica_id = 0; -+ int exchange_partner; -+ int maxswap = 0; -+ /* Number of rounds of exchanges needed to deal with any multiple -+ * exchanges. */ -+ /* Where each replica ends up after the exchange attempt(s). */ -+ /* The order in which multiple exchanges will occur. */ -+ gmx_bool bThisReplicaExchanged = FALSE; -+ -+ if (MASTER(cr)) -+ { -+ replica_id = re->repl; -+ test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time); -+ prepare_to_do_exchange(fplog, re, replica_id, &maxswap, &bThisReplicaExchanged); -+ } -+ /* Do intra-simulation broadcast so all processors belonging to -+ * each simulation know whether they need to participate in -+ * collecting the state. 
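The cyclic_decomposition()/compute_exchange_order() pair shown above turns the shuffled destination array into an explicit swap schedule: each disjoint cycle of the permutation with length L is realised by L-1 rounds of pairwise state exchanges, and maxswap is the largest such count. A toy illustration of the decomposition step, using fixed-size arrays instead of the GROMACS bookkeeping structures (the permutation chosen here is arbitrary):

    #include <stdio.h>

    #define NREPL 4

    int main(void)
    {
        /* Example permutation: replica 0 -> 2, 1 -> 0, 2 -> 1, 3 -> 3 */
        int destinations[NREPL] = {2, 0, 1, 3};
        int incycle[NREPL]      = {0};
        int i, p, len, maxswap  = 0;

        for (i = 0; i < NREPL; i++)
        {
            if (incycle[i])
            {
                continue;           /* already part of an earlier cycle */
            }
            printf("cycle:");
            len = 0;
            p   = i;
            do
            {
                printf(" %d", p);
                incycle[p] = 1;
                p          = destinations[p];
                len++;
            }
            while (p != i);
            printf("\n");
            if (len - 1 > maxswap)
            {
                maxswap = len - 1;  /* a cycle of length L needs L-1 swap rounds */
            }
        }
        printf("swap rounds needed: %d\n", maxswap);
        return 0;
    }

With nearest-neighbour exchange the permutation only contains transpositions, so maxswap is 1 and a single round suffices, which is why the code comment above calls the decomposition "very fast if neighbor replica exchange".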
Otherwise, they might as well get on with -+ * the next thing to do. */ -+ if (DOMAINDECOMP(cr)) -+ { -+#ifdef GMX_MPI -+ MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), -+ cr->mpi_comm_mygroup); -+#endif -+ } -+ -+ if (bThisReplicaExchanged) -+ { -+ /* Exchange the states */ -+ /* Collect the global state on the master node */ -+ if (DOMAINDECOMP(cr)) -+ { -+ dd_collect_state(cr->dd, state_local, state); -+ } -+ else -+ { -+ copy_state_nonatomdata(state_local, state); -+ } -+ -+ if (MASTER(cr)) -+ { -+ /* There will be only one swap cycle with standard replica -+ * exchange, but there may be multiple swap cycles if we -+ * allow multiple swaps. */ -+ -+ for (j = 0; j < maxswap; j++) -+ { -+ exchange_partner = re->order[replica_id][j]; -+ -+ if (exchange_partner != replica_id) -+ { -+ /* Exchange the global states between the master nodes */ -+ if (debug) -+ { -+ fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner); -+ } -+ exchange_state(cr->ms, exchange_partner, state); -+ } -+ } -+ /* For temperature-type replica exchange, we need to scale -+ * the velocities. */ -+ if (re->type == ereTEMP || re->type == ereTL) -+ { -+ scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]])); -+ } -+ -+ } -+ -+ /* With domain decomposition the global state is distributed later */ -+ if (!DOMAINDECOMP(cr)) -+ { -+ /* Copy the global state to the local state data structure */ -+ copy_state_nonatomdata(state, state_local); -+ } -+ } -+ -+ return bThisReplicaExchanged; -+} -+ -+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re) -+{ -+ int i; -+ -+ fprintf(fplog, "\nReplica exchange statistics\n"); -+ -+ if (re->nex == 0) -+ { -+ fprintf(fplog, "Repl %d attempts, %d odd, %d even\n", -+ re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]); -+ -+ fprintf(fplog, "Repl average probabilities:\n"); -+ for (i = 1; i < re->nrepl; i++) -+ { -+ if (re->nattempt[i%2] == 0) -+ { -+ re->prob[i] = 0; -+ } -+ else -+ { -+ re->prob[i] = re->prob_sum[i]/re->nattempt[i%2]; -+ } -+ } -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_prob(fplog, "", re->nrepl, re->prob); -+ -+ fprintf(fplog, "Repl number of exchanges:\n"); -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_count(fplog, "", re->nrepl, re->nexchange); -+ -+ fprintf(fplog, "Repl average number of exchanges:\n"); -+ for (i = 1; i < re->nrepl; i++) -+ { -+ if (re->nattempt[i%2] == 0) -+ { -+ re->prob[i] = 0; -+ } -+ else -+ { -+ re->prob[i] = ((real)re->nexchange[i])/re->nattempt[i%2]; -+ } -+ } -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_prob(fplog, "", re->nrepl, re->prob); -+ -+ fprintf(fplog, "\n"); -+ } -+ /* print the transition matrix */ -+ print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt); -+} diff --git a/g/GROMACS/gromacs-5.1.4-plumed-2.3.0-mpi.patch b/g/GROMACS/gromacs-5.1.4-plumed-2.3.0-mpi.patch deleted file mode 100644 index e91a0aef..00000000 --- a/g/GROMACS/gromacs-5.1.4-plumed-2.3.0-mpi.patch +++ /dev/null @@ -1,9575 +0,0 @@ -diff --git a/Plumed.cmake b/Plumed.cmake -new file mode 100644 -index 0000000..01472f0 ---- /dev/null -+++ b/Plumed.cmake -@@ -0,0 +1,3 @@ -+# PLUMED: shared installation -+set(PLUMED_LOAD /apps/all/PLUMED/2.3.0-foss-2017a/lib/plumed///src/lib/libplumed.so -ldl ) -+set(PLUMED_DEPENDENCIES /apps/all/PLUMED/2.3.0-foss-2017a/lib/plumed///src/lib/libplumed.so) -diff --git a/Plumed.h b/Plumed.h -new file mode 100644 -index 0000000..16da74a ---- 
/dev/null -+++ b/Plumed.h -@@ -0,0 +1,494 @@ -+/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -+ Copyright (c) 2011-2014 The plumed team -+ (see the PEOPLE file at the root of the distribution for a list of names) -+ -+ See http://www.plumed-code.org for more information. -+ -+ This file is part of plumed, version 2. -+ -+ plumed is free software: you can redistribute it and/or modify -+ it under the terms of the GNU Lesser General Public License as published by -+ the Free Software Foundation, either version 3 of the License, or -+ (at your option) any later version. -+ -+ plumed is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public License -+ along with plumed. If not, see . -++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ -+#ifndef __PLUMED_wrapper_Plumed_h -+#define __PLUMED_wrapper_Plumed_h -+ -+/** -+\page ReferencePlumedH Reference for interfacing MD codes with PLUMED -+ -+ Plumed.h and Plumed.c contain the external plumed interface, which is used to -+ integrate it with MD engines. This interface is very general, and is expected -+ not to change across plumed versions. Plumed.c also implements a dummy version -+ of the interface, so as to allow a code to be fully linked even if the plumed -+ library is not available yet. These files could be directly included in the official -+ host MD distribution. In this manner, it will be sufficient to link the plumed -+ library at link time (on all systems) or directly at runtime (on system where -+ dynamic loading is enabled) to include plumed features. -+ -+ Why is Plumed.c written in C and not C++? The reason is that the resulting Plumed.o -+ needs to be linked with the host MD code immediately (whereas the rest of plumed -+ could be linked a posteriori). Imagine the MD code is written in FORTRAN: when we -+ link the Plumed.o file we would like not to need any C++ library linked. In this -+ manner, we do not need to know which C++ compiler will be used to compile plumed. -+ The C++ library is only linked to the "rest" of plumed, which actually use it. -+ Anyway, Plumed.c is written in such a manner to allow its compilation also in C++ -+ (C++ is a bit stricter than C; compatibility is checked when PlumedStatic.cpp, -+ which basically includes Plumed.c, is compiled with the C++ compiler). This will -+ allow e.g. MD codes written in C++ to just incorporate Plumed.c (maybe renamed into -+ Plumed.cpp), without the need of configuring a plain C compiler. -+ -+ Plumed interface can be used from C, C++ and FORTRAN. Everything concerning plumed -+ is hidden inside a single object type, which is described in C by a structure -+ (struct \ref plumed), in C++ by a class (PLMD::Plumed) and in FORTRAN by a -+ fixed-length string (CHARACTER(LEN=32)). Obviously C++ can use both struct -+ and class interfaces, but the first should be preferred. The reference interface -+ is the C one, whereas FORTRAN and C++ interfaces are implemented as wrappers -+ around it. -+ -+ In the C++ interface, all the routines are implemented as methods of PLMD::Plumed. -+ In the C and FORTRAN interfaces, all the routines are named plumed_*, to -+ avoid potential name clashes. 
Notice that the entire plumed library -+ is implemented in C++, and it is hidden inside the PLMD namespace. -+ -+ Handlers to the plumed object can be converted among different representations, -+ to allow inter-operability among languages. In C, there are tools to convert -+ to/from FORTRAN, whereas in C++ there are tools to convert to/from FORTRAN and C. -+ -+ These handlers only contain a pointer to the real structure, so that -+ when a plumed object is brought from one language to another, -+ it brings a reference to the same environment. -+ -+ Moreover, to simplify life in all cases where a single Plumed object is -+ required for the entire simulation (which covers most of the practical -+ applications with conventional MD codes) it is possible to take advantage -+ of a global interface, which is implicitly referring to a unique global instance. -+ The global object should still be initialized and finalized properly. -+ -+ The basic method to send a message to plumed is -+\verbatim -+ (C) plumed_cmd -+ (C++) PLMD::Plumed::cmd -+ (FORTRAN) PLUMED_F_CMD -+\endverbatim -+ -+ To initialize a plumed object, use: -+\verbatim -+ (C) plumed_create -+ (C++) (constructor of PLMD::Plumed) -+ (FORTRAN) PLUMED_F_CREATE -+\endverbatim -+ -+ To finalize it, use -+\verbatim -+ (C) plumed_finalize -+ (C++) (destructor of PLMD::Plumed) -+ (FORTRAN) PLUMED_F_FINALIZE -+\endverbatim -+ -+ To access to the global-object, use -+\verbatim -+ (C) plumed_gcreate, plumed_gfinalize, plumed_gcmd -+ (C++) PLMD::Plumed::gcreate, PLMD::Plumed::gfinalize, PLMD::Plumed::gcmd -+ (FORTRAN) PLUMED_F_GCREATE, PLUMED_F_GFINALIZE, PLUMED_F_GCMD -+\endverbatim -+ -+ To check if the global object has been initialized, use -+\verbatim -+ (C) plumed_ginitialized -+ (C++) PLMD::Plumed::ginitialized -+ (FORTRAN) PLUMED_F_GINITIALIZED -+\endverbatim -+ -+ To check if plumed library is available (this is useful for runtime linking), use -+\verbatim -+ (C) plumed_installed -+ (C++) PLMD::Plumed::installed -+ (FORTRAN) PLUMED_F_INSTALLED -+\endverbatim -+ -+ To convert handlers use -+\verbatim -+ (C) plumed_c2f (C to FORTRAN) -+ (C) plumed_f2c (FORTRAN to C) -+ (C++) Plumed(plumed) constructor (C to C++) -+ (C++) operator plumed() cast (C++ to C) -+ (C++) Plumed(char*) constructor (FORTRAN to C++) -+ (C++) toFortran(char*) (C++ to FORTRAN) -+\endverbatim -+ -+\verbatim -+ FORTRAN interface -+ SUBROUTINE PLUMED_F_INSTALLED(i) -+ INTEGER, INTENT(OUT) :: i -+ SUBROUTINE PLUMED_F_GINITIALIZED(i) -+ INTEGER, INTENT(OUT) :: i -+ SUBROUTINE PLUMED_F_GCREATE() -+ SUBROUTINE PLUMED_F_GCMD(key,val) -+ CHARACTER(LEN=*), INTENT(IN) :: key -+ UNSPECIFIED_TYPE, INTENT(INOUT) :: val(*) -+ SUBROUTINE PLUMED_F_GFINALIZE() -+ SUBROUTINE PLUMED_F_GLOBAL(p) -+ CHARACTER(LEN=32), INTENT(OUT) :: p -+ SUBROUTINE PLUMED_F_CREATE(p) -+ CHARACTER(LEN=32), INTENT(OUT) :: p -+ SUBROUTINE PLUMED_F_CMD(p,key,val) -+ CHARACTER(LEN=32), INTENT(IN) :: p -+ CHARACTER(LEN=*), INTENT(IN) :: key -+ UNSPECIFIED_TYPE, INTENT(INOUT) :: val(*) -+ SUBROUTINE PLUMED_F_FINALIZE(p) -+ CHARACTER(LEN=32), INTENT(IN) :: p -+\endverbatim -+ -+ The main routine is "cmd", which accepts two arguments: -+ key is a string containing the name of the command -+ val is the argument. it is declared const so as to use allow passing const objects, but in practice plumed -+ is going to modify val in several cases (using a const_cast). -+ In some cases val can be omitted: just pass a NULL pointer (in C++, val is optional and can be omitted). 
-+ The set of possible keys is the real API of the plumed library, and will be expanded with time. -+ New commands will be added, but backward compatibility will be retained as long as possible. -+ -+ To pass plumed a callback function use the following syntax (not available in FORTRAN yet) -+\verbatim -+ plumed_function_holder ff; -+ ff.p=your_function; -+ plumed_cmd(plumed,"xxxx",&ff); -+\endverbatim -+ (this is passing the your_function() function to the "xxxx" command) -+*/ -+ -+#ifdef __cplusplus -+ extern "C" { -+#endif -+ -+/* Generic function pointer */ -+typedef void (*plumed_function_pointer)(void); -+ -+/** -+ \brief Holder for function pointer. -+ -+ To pass plumed a callback function use the following syntax: -+\verbatim -+ plumed_function_holder ff; -+ ff.p=your_function; -+ plumed_cmd(plumed,"xxxx",&ff); -+\endverbatim -+ (this is going to pass the your_function() function to the "xxxx" command) -+*/ -+ -+typedef struct { -+ plumed_function_pointer p; -+} plumed_function_holder; -+ -+/** -+ \brief Main plumed object -+ -+ This is an object containing a Plumed instance, which should be used in -+ the MD engine. It should first be initialized with plumed_create(), -+ then it communicates with the MD engine using plumed_cmd(). Finally, -+ before the termination, it should be deallocated with plumed_finalize(). -+ Its interface is very simple and general, and is expected -+ not to change across plumed versions. See \ref ReferencePlumedH. -+*/ -+typedef struct { -+/** -+ \private -+ \brief Void pointer holding the real PlumedMain structure -+*/ -+ void*p; -+} plumed; -+ -+/** \relates plumed -+ \brief Constructor -+ -+ \return The constructed plumed object -+*/ -+plumed plumed_create(void); -+ -+/** \relates plumed -+ \brief Tells p to execute a command -+ -+ \param p The plumed object on which command is acting -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like plumed_cmd(p,"A","B"), -+ but for some choice of key it can change the content -+*/ -+void plumed_cmd(plumed p,const char*key,const void*val); -+ -+/** \relates plumed -+ \brief Destructor -+ -+ \param p The plumed object to be deallocated -+*/ -+void plumed_finalize(plumed p); -+ -+/** \relates plumed -+ \brief Check if plumed is installed (for runtime binding) -+ -+ \return 1 if plumed is installed, to 0 otherwise -+*/ -+int plumed_installed(void); -+ -+/** \relates plumed -+ \brief Retrieves an handler to the global structure. -+*/ -+plumed plumed_global(void); -+ -+/** \relates plumed -+ \brief Check if the global interface has been initialized -+ -+ \return 1 if plumed has been initialized, 0 otherwise -+*/ -+int plumed_ginitialized(void); -+ -+/* global C interface, working on a global object */ -+ -+/** \relates plumed -+ \brief Constructor for the global interface. -+ -+ \note Equivalent to plumed_create(), but initialize a static global plumed object -+*/ -+void plumed_gcreate(void); -+ -+/** \relates plumed -+ \brief Tells to the global interface to execute a command. -+ -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like plumed_gcmd("A","B"), -+ but for some choice of key it can change the content -+ -+ \note Equivalent to plumed_cmd(), but skipping the plumed argument -+*/ -+void plumed_gcmd(const char* key,const void* val); -+ -+/** \relates plumed -+ \brief Destructor for the global interface. 
-+ -+ \note Equivalent to plumed_finalize(), but skipping the plumed argument -+*/ -+void plumed_gfinalize(void); -+ -+/* routines to convert char handler from/to plumed objects */ -+ -+/** \related plumed -+ \brief Converts a C handler to a FORTRAN handler -+ -+ \param p The C handler -+ \param c The FORTRAN handler (a char[32]) -+*/ -+void plumed_c2f(plumed p,char* c); -+ -+/** \related plumed -+ \brief Converts a FORTRAN handler to a C handler -+ \param c The FORTRAN handler (a char[32]) -+ \return The C handler -+*/ -+plumed plumed_f2c(const char* c); -+ -+#ifdef __cplusplus -+ } -+#endif -+ -+#ifdef __cplusplus -+ -+/* this is to include the NULL pointer */ -+#include -+ -+/* C++ interface is hidden in PLMD namespace (same as plumed library) */ -+namespace PLMD { -+ -+/** -+ C++ wrapper for \ref plumed. -+ -+ This class provides a C++ interface to PLUMED. -+*/ -+ -+class Plumed{ -+ plumed main; -+/** -+ keeps track if the object was created from scratch using -+ the defaults destructor (cloned=false) or if it was imported -+ from C or FORTRAN (cloned-true). In the latter case, the -+ plumed_finalize() method is not called when destructing the object, -+ since it is expected to be finalized in the C/FORTRAN code -+*/ -+ bool cloned; -+public: -+/** -+ Check if plumed is installed (for runtime binding) -+ \return true if plumed is installed, false otherwise -+*/ -+ static bool installed(); -+/** -+ Check if global-plumed has been initialized -+ \return true if global plumed object (see global()) is initialized (i.e. if gcreate() has been -+ called), false otherwise. -+*/ -+ static bool ginitialized(); -+/** -+ Initialize global-plumed -+*/ -+ static void gcreate(); -+/** -+ Send a command to global-plumed -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like gcmd("A","B"), -+ but for some choice of key it can change the content -+*/ -+ static void gcmd(const char* key,const void* val); -+/** -+ Finalize global-plumed -+*/ -+ static void gfinalize(); -+/** -+ Returns the Plumed global object -+ \return The Plumed global object -+*/ -+ static Plumed global(); -+/** -+ Constructor -+*/ -+ Plumed(); -+/** -+ Clone a Plumed object from a FORTRAN char* handler -+ \param c The FORTRAN handler (a char[32]). -+ -+ \attention The Plumed object created in this manner -+ will not finalize the corresponding plumed structure. -+ It is expected that the FORTRAN code calls plumed_c_finalize for it -+*/ -+ Plumed(const char*c); -+/** -+ Clone a Plumed object from a C plumed structure -+ \param p The C plumed structure. -+ -+ \attention The Plumed object created in this manner -+ will not finalize the corresponding plumed structure. -+ It is expected that the C code calls plumed_finalize for it -+*/ -+ Plumed(plumed p); -+private: -+/** Copy constructor is disabled (private and unimplemented) -+ The problem here is that after copying it will not be clear who is -+ going to finalize the corresponding plumed structure. -+*/ -+ Plumed(const Plumed&); -+/** Assignment operator is disabled (private and unimplemented) -+ The problem here is that after copying it will not be clear who is -+ going to finalize the corresponding plumed structure. -+*/ -+ Plumed&operator=(const Plumed&); -+public: -+/** -+ Retrieve the C plumed structure for this object -+*/ -+ operator plumed()const; -+/** -+ Retrieve a FORTRAN handler for this object -+ \param c The FORTRAN handler (a char[32]). 
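The Plumed.h header reproduced in this hunk documents a deliberately small C interface: create an object, drive it with string-keyed cmd() calls, and finalize it. A minimal sketch of that calling pattern follows; the command keys and the natoms value are only representative placeholders, since (as the header itself notes) the set of valid keys is defined by the PLUMED library rather than by this wrapper.

    #include <stdio.h>
    #include "Plumed.h"   /* the wrapper header added at the GROMACS source root by this patch */

    int main(void)
    {
        int natoms = 1000;   /* illustrative value */

        if (!plumed_installed())
        {
            fprintf(stderr, "PLUMED kernel not available at runtime\n");
            return 1;
        }

        plumed p = plumed_create();

        /* All communication goes through string-keyed commands; the keys used
         * here ("setNatoms", "init") are shown only as examples of the pattern. */
        plumed_cmd(p, "setNatoms", &natoms);
        plumed_cmd(p, "init", NULL);

        plumed_finalize(p);
        return 0;
    }

The same pattern is available through the PLMD::Plumed C++ wrapper (p.cmd(key, val)) and the PLUMED_F_* FORTRAN entry points declared in the header.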
-+*/ -+ void toFortran(char*c)const; -+/** -+ Send a command to this plumed object -+ \param key The name of the command to be executed -+ \param val The argument. It is declared as const to allow calls like p.cmd("A","B"), -+ but for some choice of key it can change the content -+*/ -+ void cmd(const char*key,const void*val=NULL); -+/** -+ Destructor -+ -+ Destructor is virtual so as to allow correct inheritance from Plumed object. -+ To avoid linking problems with g++, I specify "inline" also here (in principle -+ it should be enough to specify it down in the definition of the function, but -+ for some reason that I do not understand g++ does not inline it properly in that -+ case and complains when Plumed.h is included but Plumed.o is not linked. Anyway, the -+ way it is done here seems to work properly). -+*/ -+ inline virtual ~Plumed(); -+}; -+ -+/* All methods are inlined so as to avoid the compilation of an extra c++ file */ -+ -+inline -+bool Plumed::installed(){ -+ return plumed_installed(); -+} -+ -+inline -+Plumed::Plumed(): -+ main(plumed_create()), -+ cloned(false) -+{} -+ -+inline -+Plumed::Plumed(const char*c): -+ main(plumed_f2c(c)), -+ cloned(true) -+{} -+ -+inline -+Plumed::Plumed(plumed p): -+ main(p), -+ cloned(true) -+{} -+ -+inline -+Plumed::operator plumed()const{ -+ return main; -+} -+ -+inline -+void Plumed::toFortran(char*c)const{ -+ plumed_c2f(main,c); -+} -+ -+inline -+void Plumed::cmd(const char*key,const void*val){ -+ plumed_cmd(main,key,val); -+} -+ -+inline -+Plumed::~Plumed(){ -+ if(!cloned)plumed_finalize(main); -+} -+ -+inline -+bool Plumed::ginitialized(){ -+ return plumed_ginitialized(); -+} -+ -+inline -+void Plumed::gcreate(){ -+ plumed_gcreate(); -+} -+ -+inline -+void Plumed::gcmd(const char* key,const void* val){ -+ plumed_gcmd(key,val); -+} -+ -+inline -+void Plumed::gfinalize(){ -+ plumed_gfinalize(); -+} -+ -+inline -+Plumed Plumed::global(){ -+ return plumed_global(); -+} -+ -+} -+ -+#endif -+ -+ -+#endif -diff --git a/Plumed.inc b/Plumed.inc -new file mode 100644 -index 0000000..e1e29a7 ---- /dev/null -+++ b/Plumed.inc -@@ -0,0 +1,3 @@ -+# PLUMED: shared installation -+PLUMED_LOAD= /apps/all/PLUMED/2.3.0-foss-2017a/lib/plumed///src/lib/libplumed.so -ldl -+PLUMED_DEPENDENCIES= /apps/all/PLUMED/2.3.0-foss-2017a/lib/plumed///src/lib/libplumed.so -diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt -index 6db37e2..cc97aa8 100644 ---- a/src/gromacs/CMakeLists.txt -+++ b/src/gromacs/CMakeLists.txt -@@ -32,6 +32,8 @@ - # To help us fund GROMACS development, we humbly ask that you cite - # the research papers on the package. Check out http://www.gromacs.org. - -+include(${CMAKE_SOURCE_DIR}/Plumed.cmake) -+ - set(LIBGROMACS_SOURCES) - - function (gmx_install_headers DESTINATION) -@@ -189,7 +191,7 @@ target_link_libraries(libgromacs - ${TNG_IO_LIBRARIES} - ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} - ${XML_LIBRARIES} -- ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}) -+ ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${PLUMED_LOAD}) - set_target_properties(libgromacs PROPERTIES - OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" - SOVERSION ${LIBRARY_SOVERSION} -diff --git a/src/gromacs/CMakeLists.txt.preplumed b/src/gromacs/CMakeLists.txt.preplumed -new file mode 100644 -index 0000000..6db37e2 ---- /dev/null -+++ b/src/gromacs/CMakeLists.txt.preplumed -@@ -0,0 +1,232 @@ -+# -+# This file is part of the GROMACS molecular simulation package. 
-+# -+# Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by -+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+# and including many others, as listed in the AUTHORS file in the -+# top-level source directory and at http://www.gromacs.org. -+# -+# GROMACS is free software; you can redistribute it and/or -+# modify it under the terms of the GNU Lesser General Public License -+# as published by the Free Software Foundation; either version 2.1 -+# of the License, or (at your option) any later version. -+# -+# GROMACS is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+# Lesser General Public License for more details. -+# -+# You should have received a copy of the GNU Lesser General Public -+# License along with GROMACS; if not, see -+# http://www.gnu.org/licenses, or write to the Free Software Foundation, -+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+# -+# If you want to redistribute modifications to GROMACS, please -+# consider that scientific software is very special. Version -+# control is crucial - bugs must be traceable. We will be happy to -+# consider code for inclusion in the official distribution, but -+# derived work must not be called official GROMACS. Details are found -+# in the README & COPYING files - if they are missing, get the -+# official version at http://www.gromacs.org. -+# -+# To help us fund GROMACS development, we humbly ask that you cite -+# the research papers on the package. Check out http://www.gromacs.org. -+ -+set(LIBGROMACS_SOURCES) -+ -+function (gmx_install_headers DESTINATION) -+ if (NOT GMX_BUILD_MDRUN_ONLY) -+ if (DESTINATION) -+ set(DESTINATION ${INCL_INSTALL_DIR}/gromacs/${DESTINATION}) -+ else() -+ set(DESTINATION ${INCL_INSTALL_DIR}/gromacs) -+ endif() -+ install(FILES ${ARGN} DESTINATION ${DESTINATION} COMPONENT development) -+ endif() -+endfunction () -+ -+if(GMX_USE_TNG) -+ option(GMX_EXTERNAL_TNG "Use external TNG instead of compiling the version shipped with GROMACS." -+ OFF) -+ # Detect TNG if GMX_EXTERNAL_TNG is explicitly ON -+ if(GMX_EXTERNAL_TNG) -+ find_package(TNG_IO 1.6.0) -+ if(NOT TNG_IO_FOUND) -+ message(FATAL_ERROR -+ "TNG >= 1.6.0 not found. " -+ "You can set GMX_EXTERNAL_TNG=OFF to compile TNG.") -+ endif() -+ include_directories(${TNG_IO_INCLUDE_DIRS}) -+ endif() -+ if(NOT GMX_EXTERNAL_TNG) -+ include(${CMAKE_SOURCE_DIR}/src/external/tng_io/BuildTNG.cmake) -+ tng_get_source_list(TNG_SOURCES TNG_IO_DEFINITIONS) -+ list(APPEND LIBGROMACS_SOURCES ${TNG_SOURCES}) -+ tng_set_source_properties(WITH_ZLIB ${HAVE_ZLIB}) -+ -+ if (HAVE_ZLIB) -+ list(APPEND GMX_EXTRA_LIBRARIES ${ZLIB_LIBRARIES}) -+ include_directories(${ZLIB_INCLUDE_DIRS}) -+ endif() -+ endif() -+else() -+ # We still need to get tng/tng_io_fwd.h from somewhere! 
-+ include_directories(BEFORE ${CMAKE_SOURCE_DIR}/src/external/tng_io/include) -+endif() -+ -+add_subdirectory(gmxlib) -+add_subdirectory(mdlib) -+add_subdirectory(gmxpreprocess) -+add_subdirectory(commandline) -+add_subdirectory(fft) -+add_subdirectory(linearalgebra) -+add_subdirectory(math) -+add_subdirectory(random) -+add_subdirectory(onlinehelp) -+add_subdirectory(options) -+add_subdirectory(timing) -+add_subdirectory(utility) -+add_subdirectory(fileio) -+add_subdirectory(swap) -+add_subdirectory(essentialdynamics) -+add_subdirectory(pulling) -+add_subdirectory(simd) -+add_subdirectory(imd) -+if (NOT GMX_BUILD_MDRUN_ONLY) -+ add_subdirectory(legacyheaders) -+ add_subdirectory(gmxana) -+ add_subdirectory(statistics) -+ add_subdirectory(analysisdata) -+ add_subdirectory(selection) -+ add_subdirectory(trajectoryanalysis) -+ add_subdirectory(tools) -+endif() -+ -+list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES}) -+ -+# This would be the standard way to include thread_mpi, but -+# we want libgromacs to link the functions directly -+#if(GMX_THREAD_MPI) -+# add_subdirectory(thread_mpi) -+#endif() -+#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB}) -+ -+tmpi_get_source_list(THREAD_MPI_SOURCES ${CMAKE_SOURCE_DIR}/src/external/thread_mpi/src) -+list(APPEND LIBGROMACS_SOURCES ${THREAD_MPI_SOURCES}) -+ -+file(GLOB LIBGROMACS_HEADERS *.h) -+configure_file(version.h.cmakein version.h) -+gmx_install_headers("" ${LIBGROMACS_HEADERS}) -+gmx_install_headers("" ${CMAKE_CURRENT_BINARY_DIR}/version.h) -+ -+# Add target that generates baseversion-gen.c every time make is run -+# if git version info is requested, or create it statically. -+# This code is here instead of utility/CMakeLists.txt because CMake -+# ignores set_source_file_properties from subdirectories. -+set(GENERATED_VERSION_FILE -+ ${CMAKE_CURRENT_BINARY_DIR}/utility/baseversion-gen.c) -+set(GENERATED_VERSION_FILE_SOURCE -+ ${CMAKE_CURRENT_SOURCE_DIR}/utility/baseversion-gen.c.cmakein) -+if (GMX_GIT_VERSION_INFO) -+ add_custom_target(gmx-version ALL -+ COMMAND ${CMAKE_COMMAND} -+ -D GIT_EXECUTABLE="${GIT_EXECUTABLE}" -+ -D PROJECT_VERSION="${PROJECT_VERSION}" -+ -D PROJECT_SOURCE_DIR="${PROJECT_SOURCE_DIR}" -+ -D VERSION_CMAKEIN=${GENERATED_VERSION_FILE_SOURCE} -+ -D VERSION_OUT=${GENERATED_VERSION_FILE} -+ -P ${CMAKE_SOURCE_DIR}/cmake/gmxGenerateVersionInfo.cmake -+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} -+ DEPENDS ${GENERATED_VERSION_FILE_SOURCE} -+ COMMENT "Generating git version information") -+ set_source_files_properties(${GENERATED_VERSION_FILE} -+ PROPERTIES GENERATED true) -+else() -+ set(GMX_PROJECT_VERSION_STR ${PROJECT_VERSION}) -+ configure_file(${GENERATED_VERSION_FILE_SOURCE} ${GENERATED_VERSION_FILE}) -+endif() -+list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE}) -+ -+# apply gcc 4.4.x bug workaround -+if(GMX_USE_GCC44_BUG_WORKAROUND) -+ include(gmxGCC44O3BugWorkaround) -+ gmx_apply_gcc44_bug_workaround("gmxlib/bondfree.c") -+ gmx_apply_gcc44_bug_workaround("mdlib/force.c") -+ gmx_apply_gcc44_bug_workaround("mdlib/constr.c") -+endif() -+ -+add_library(libgromacs ${LIBGROMACS_SOURCES}) -+if (GMX_GIT_VERSION_INFO) -+ add_dependencies(libgromacs gmx-version) -+endif() -+ -+# Recent versions of gcc and clang give warnings on scanner.cpp, which -+# is a generated source file. These are awkward to suppress inline, so -+# we do it in the compilation command (after testing that the compiler -+# supports the suppressions). 
Setting the properties only works after -+# the related target has been created, e.g. after when the file is -+# used with add_library(). -+include(CheckCXXCompilerFlag) -+check_cxx_compiler_flag(-Wno-unused-parameter HAS_NO_UNUSED_PARAMETER) -+if (HAS_NO_UNUSED_PARAMETER) -+ set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-unused-parameter") -+endif() -+check_cxx_compiler_flag(-Wno-deprecated-register HAS_NO_DEPRECATED_REGISTER) -+if (HAS_NO_DEPRECATED_REGISTER) -+ set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated-register") -+else() -+ check_cxx_compiler_flag(-Wno-deprecated HAS_NO_DEPRECATED) -+ if (HAS_NO_DEPRECATED) -+ set(_scanner_cpp_compiler_flags "${_scanner_cpp_compiler_flags} -Wno-deprecated") -+ endif() -+endif() -+set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}") -+ -+target_link_libraries(libgromacs -+ ${EXTRAE_LIBRARIES} -+ ${GMX_GPU_LIBRARIES} -+ ${GMX_EXTRA_LIBRARIES} -+ ${TNG_IO_LIBRARIES} -+ ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES} -+ ${XML_LIBRARIES} -+ ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS}) -+set_target_properties(libgromacs PROPERTIES -+ OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}" -+ SOVERSION ${LIBRARY_SOVERSION} -+ VERSION ${LIBRARY_VERSION} -+ COMPILE_FLAGS "${OpenMP_C_FLAGS}") -+ -+# Only install the library in mdrun-only mode if it is actually necessary -+# for the binary -+if (NOT GMX_BUILD_MDRUN_ONLY OR BUILD_SHARED_LIBS) -+ install(TARGETS libgromacs -+ LIBRARY DESTINATION ${LIB_INSTALL_DIR} -+ RUNTIME DESTINATION ${BIN_INSTALL_DIR} -+ ARCHIVE DESTINATION ${LIB_INSTALL_DIR} -+ COMPONENT libraries) -+endif() -+ -+if (NOT GMX_BUILD_MDRUN_ONLY) -+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgromacs.pc.cmakein -+ ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc @ONLY) -+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc -+ DESTINATION ${LIB_INSTALL_DIR}/pkgconfig -+ RENAME "libgromacs${GMX_LIBS_SUFFIX}.pc" -+ COMPONENT development) -+endif() -+ -+if (INSTALL_CUDART_LIB) #can be set manual by user -+ if (GMX_GPU) -+ foreach(CUDA_LIB ${CUDA_LIBRARIES}) -+ string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB}) -+ if(IS_CUDART) #libcuda should not be installed -+ #install also name-links (linker uses those) -+ file(GLOB CUDA_LIBS ${CUDA_LIB}*) -+ install(FILES ${CUDA_LIBS} DESTINATION -+ ${LIB_INSTALL_DIR} COMPONENT libraries) -+ endif() -+ endforeach() -+ else() -+ message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU") -+ endif() -+endif() -diff --git a/src/gromacs/mdlib/force.c b/src/gromacs/mdlib/force.c -index 5230983..8227d5b 100644 ---- a/src/gromacs/mdlib/force.c -+++ b/src/gromacs/mdlib/force.c -@@ -67,6 +67,14 @@ - #include "gromacs/timing/wallcycle.h" - #include "gmx_fatal.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+int plumedswitch=0; -+plumed plumedmain; -+void(*plumedcmd)(plumed,const char*,const void*)=NULL; -+/* END PLUMED */ -+ -+ - void ns(FILE *fp, - t_forcerec *fr, - matrix box, -@@ -737,6 +745,13 @@ void do_force_lowlevel(FILE *fplog, gmx_int64_t step, - pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); - } - -+ /* PLUMED */ -+ if(plumedswitch){ -+ int plumedNeedsEnergy; -+ (*plumedcmd)(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ if(!plumedNeedsEnergy) (*plumedcmd)(plumedmain,"performCalc",NULL); -+ } -+ /* END PLUMED */ - } - - void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) -diff --git a/src/gromacs/mdlib/force.c.preplumed b/src/gromacs/mdlib/force.c.preplumed -new file mode 
100644 -index 0000000..5230983 ---- /dev/null -+++ b/src/gromacs/mdlib/force.c.preplumed -@@ -0,0 +1,1018 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. -+ * Copyright (c) 2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. 
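Stepping back from the verbatim listing: the only coupling this patch adds to the force loop is what the force.c hunk above shows -- a global plumed handle, an on/off switch, and a function pointer to plumed_cmd(), queried at the end of do_force_lowlevel(). A stripped-down restatement of that hook is below; the wrapper function name is made up for this sketch, while the globals and command keys are the ones visible in the diff and are assumed to be initialised elsewhere in the full patch (not shown in this excerpt).

    #include <stddef.h>
    #include "Plumed.h"

    /* Globals added by the plumed patch (declared in force.c in the hunk above). */
    int    plumedswitch = 0;                                   /* 0 = PLUMED disabled */
    plumed plumedmain;                                         /* handle set up at start-up */
    void (*plumedcmd)(plumed, const char *, const void *) = NULL;

    /* Stand-in for the tail of do_force_lowlevel(): once short-range and bonded
     * forces are done, trigger the PLUMED bias calculation -- but only if PLUMED
     * does not first need the summed potential energy. */
    static void call_plumed_after_forces(void)
    {
        if (plumedswitch)
        {
            int plumedNeedsEnergy = 0;
            (*plumedcmd)(plumedmain, "isEnergyNeeded", &plumedNeedsEnergy);
            if (!plumedNeedsEnergy)
            {
                (*plumedcmd)(plumedmain, "performCalc", NULL);
            }
        }
    }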
-+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include -+#include -+#include "sysstuff.h" -+#include "typedefs.h" -+#include "macros.h" -+#include "gromacs/utility/smalloc.h" -+#include "macros.h" -+#include "physics.h" -+#include "force.h" -+#include "nonbonded.h" -+#include "names.h" -+#include "network.h" -+#include "pbc.h" -+#include "ns.h" -+#include "nrnb.h" -+#include "bondf.h" -+#include "mshift.h" -+#include "txtdump.h" -+#include "coulomb.h" -+#include "pme.h" -+#include "mdrun.h" -+#include "domdec.h" -+#include "qmmm.h" -+#include "gmx_omp_nthreads.h" -+ -+#include "gromacs/timing/wallcycle.h" -+#include "gmx_fatal.h" -+ -+void ns(FILE *fp, -+ t_forcerec *fr, -+ matrix box, -+ gmx_groups_t *groups, -+ gmx_localtop_t *top, -+ t_mdatoms *md, -+ t_commrec *cr, -+ t_nrnb *nrnb, -+ gmx_bool bFillGrid, -+ gmx_bool bDoLongRangeNS) -+{ -+ char *ptr; -+ int nsearch; -+ -+ -+ if (!fr->ns.nblist_initialized) -+ { -+ init_neighbor_list(fp, fr, md->homenr); -+ } -+ -+ if (fr->bTwinRange) -+ { -+ fr->nlr = 0; -+ } -+ -+ nsearch = search_neighbours(fp, fr, box, top, groups, cr, nrnb, md, -+ bFillGrid, bDoLongRangeNS); -+ if (debug) -+ { -+ fprintf(debug, "nsearch = %d\n", nsearch); -+ } -+ -+ /* Check whether we have to do dynamic load balancing */ -+ /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0)) -+ count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr, -+ &(top->idef),opts->ngener); -+ */ -+ if (fr->ns.dump_nl > 0) -+ { -+ dump_nblist(fp, cr, fr, fr->ns.dump_nl); -+ } -+} -+ -+static void reduce_thread_forces(int n, rvec *f, -+ tensor vir_q, tensor vir_lj, -+ real *Vcorr_q, real *Vcorr_lj, -+ real *dvdl_q, real *dvdl_lj, -+ int nthreads, f_thread_t *f_t) -+{ -+ int t, i; -+ int nthreads_loop gmx_unused; -+ -+ /* This reduction can run over any number of threads */ -+ nthreads_loop = gmx_omp_nthreads_get(emntBonded); -+#pragma omp parallel for num_threads(nthreads_loop) private(t) schedule(static) -+ for (i = 0; i < n; i++) -+ { -+ for (t = 1; t < nthreads; t++) -+ { -+ rvec_inc(f[i], f_t[t].f[i]); -+ } -+ } -+ for (t = 1; t < nthreads; t++) -+ { -+ *Vcorr_q += f_t[t].Vcorr_q; -+ *Vcorr_lj += f_t[t].Vcorr_lj; -+ *dvdl_q += f_t[t].dvdl[efptCOUL]; -+ *dvdl_lj += f_t[t].dvdl[efptVDW]; -+ m_add(vir_q, f_t[t].vir_q, vir_q); -+ m_add(vir_lj, f_t[t].vir_lj, vir_lj); -+ } -+} -+ -+void gmx_print_sepdvdl(FILE *fplog, const char *s, real v, real dvdlambda) -+{ -+ fprintf(fplog, " %-30s V %12.5e dVdl %12.5e\n", s, v, dvdlambda); -+} -+ -+void do_force_lowlevel(FILE *fplog, gmx_int64_t step, -+ t_forcerec *fr, t_inputrec *ir, -+ t_idef *idef, t_commrec *cr, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ t_mdatoms *md, -+ rvec x[], history_t *hist, -+ rvec f[], -+ rvec f_longrange[], -+ gmx_enerdata_t *enerd, -+ t_fcdata *fcd, -+ gmx_localtop_t *top, -+ gmx_genborn_t *born, -+ t_atomtypes *atype, -+ gmx_bool bBornRadii, -+ matrix box, -+ t_lambda *fepvals, -+ real *lambda, -+ t_graph *graph, -+ t_blocka *excl, -+ rvec mu_tot[], -+ int flags, -+ float *cycles_pme) -+{ -+ int i, j; -+ int donb_flags; -+ gmx_bool bDoEpot, bSepDVDL, bSB; -+ int pme_flags; -+ matrix boxs; -+ rvec box_size; -+ t_pbc pbc; -+ char buf[22]; -+ double clam_i, vlam_i; -+ real dvdl_dum[efptNR], dvdl_nb[efptNR], lam_i[efptNR]; -+ real dvdl_q, dvdl_lj; -+ -+#ifdef GMX_MPI -+ double t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */ -+#endif -+ -+#define PRINT_SEPDVDL(s, v, dvdlambda) if (bSepDVDL) { gmx_print_sepdvdl(fplog, s, v, dvdlambda); } -+ -+ set_pbc(&pbc, fr->ePBC, box); -+ -+ /* reset 
free energy components */ -+ for (i = 0; i < efptNR; i++) -+ { -+ dvdl_nb[i] = 0; -+ dvdl_dum[i] = 0; -+ } -+ -+ /* Reset box */ -+ for (i = 0; (i < DIM); i++) -+ { -+ box_size[i] = box[i][i]; -+ } -+ -+ bSepDVDL = (fr->bSepDVDL && do_per_step(step, ir->nstlog)); -+ debug_gmx(); -+ -+ /* do QMMM first if requested */ -+ if (fr->bQMMM) -+ { -+ enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr); -+ } -+ -+ if (bSepDVDL) -+ { -+ fprintf(fplog, "Step %s: non-bonded V and dVdl for rank %d:\n", -+ gmx_step_str(step, buf), cr->nodeid); -+ } -+ -+ /* Call the short range functions all in one go. */ -+ -+#ifdef GMX_MPI -+ /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/ -+#define TAKETIME FALSE -+ if (TAKETIME) -+ { -+ MPI_Barrier(cr->mpi_comm_mygroup); -+ t0 = MPI_Wtime(); -+ } -+#endif -+ -+ if (ir->nwall) -+ { -+ /* foreign lambda component for walls */ -+ real dvdl_walls = do_walls(ir, fr, box, md, x, f, lambda[efptVDW], -+ enerd->grpp.ener[egLJSR], nrnb); -+ PRINT_SEPDVDL("Walls", 0.0, dvdl_walls); -+ enerd->dvdl_lin[efptVDW] += dvdl_walls; -+ } -+ -+ /* If doing GB, reset dvda and calculate the Born radii */ -+ if (ir->implicit_solvent) -+ { -+ wallcycle_sub_start(wcycle, ewcsNONBONDED); -+ -+ for (i = 0; i < born->nr; i++) -+ { -+ fr->dvda[i] = 0; -+ } -+ -+ if (bBornRadii) -+ { -+ calc_gb_rad(cr, fr, ir, top, x, &(fr->gblist), born, md, nrnb); -+ } -+ -+ wallcycle_sub_stop(wcycle, ewcsNONBONDED); -+ } -+ -+ where(); -+ /* We only do non-bonded calculation with group scheme here, the verlet -+ * calls are done from do_force_cutsVERLET(). */ -+ if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED)) -+ { -+ donb_flags = 0; -+ /* Add short-range interactions */ -+ donb_flags |= GMX_NONBONDED_DO_SR; -+ -+ /* Currently all group scheme kernels always calculate (shift-)forces */ -+ if (flags & GMX_FORCE_FORCES) -+ { -+ donb_flags |= GMX_NONBONDED_DO_FORCE; -+ } -+ if (flags & GMX_FORCE_VIRIAL) -+ { -+ donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; -+ } -+ if (flags & GMX_FORCE_ENERGY) -+ { -+ donb_flags |= GMX_NONBONDED_DO_POTENTIAL; -+ } -+ if (flags & GMX_FORCE_DO_LR) -+ { -+ donb_flags |= GMX_NONBONDED_DO_LR; -+ } -+ -+ wallcycle_sub_start(wcycle, ewcsNONBONDED); -+ do_nonbonded(fr, x, f, f_longrange, md, excl, -+ &enerd->grpp, nrnb, -+ lambda, dvdl_nb, -1, -1, donb_flags); -+ -+ /* If we do foreign lambda and we have soft-core interactions -+ * we have to recalculate the (non-linear) energies contributions. -+ */ -+ if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) -+ { -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ for (j = 0; j < efptNR; j++) -+ { -+ lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); -+ } -+ reset_foreign_enerdata(enerd); -+ do_nonbonded(fr, x, f, f_longrange, md, excl, -+ &(enerd->foreign_grpp), nrnb, -+ lam_i, dvdl_dum, -1, -1, -+ (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA); -+ sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); -+ enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; -+ } -+ } -+ wallcycle_sub_stop(wcycle, ewcsNONBONDED); -+ where(); -+ } -+ -+ /* If we are doing GB, calculate bonded forces and apply corrections -+ * to the solvation forces */ -+ /* MRS: Eventually, many need to include free energy contribution here! 
*/ -+ if (ir->implicit_solvent) -+ { -+ wallcycle_sub_start(wcycle, ewcsBONDED); -+ calc_gb_forces(cr, md, born, top, x, f, fr, idef, -+ ir->gb_algorithm, ir->sa_algorithm, nrnb, &pbc, graph, enerd); -+ wallcycle_sub_stop(wcycle, ewcsBONDED); -+ } -+ -+#ifdef GMX_MPI -+ if (TAKETIME) -+ { -+ t1 = MPI_Wtime(); -+ fr->t_fnbf += t1-t0; -+ } -+#endif -+ -+ if (fepvals->sc_alpha != 0) -+ { -+ enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; -+ } -+ else -+ { -+ enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; -+ } -+ -+ if (fepvals->sc_alpha != 0) -+ -+ /* even though coulomb part is linear, we already added it, beacuse we -+ need to go through the vdw calculation anyway */ -+ { -+ enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; -+ } -+ else -+ { -+ enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; -+ } -+ -+ if (bSepDVDL) -+ { -+ real V_short_range = 0; -+ real dvdl_short_range = 0; -+ -+ for (i = 0; i < enerd->grpp.nener; i++) -+ { -+ V_short_range += -+ (fr->bBHAM ? -+ enerd->grpp.ener[egBHAMSR][i] : -+ enerd->grpp.ener[egLJSR][i]) -+ + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i]; -+ } -+ dvdl_short_range = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL]; -+ PRINT_SEPDVDL("VdW and Coulomb SR particle-p.", -+ V_short_range, -+ dvdl_short_range); -+ } -+ debug_gmx(); -+ -+ -+ if (debug) -+ { -+ pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS); -+ } -+ -+ /* Shift the coordinates. Must be done before bonded forces and PPPM, -+ * but is also necessary for SHAKE and update, therefore it can NOT -+ * go when no bonded forces have to be evaluated. -+ */ -+ -+ /* Here sometimes we would not need to shift with NBFonly, -+ * but we do so anyhow for consistency of the returned coordinates. -+ */ -+ if (graph) -+ { -+ shift_self(graph, box, x); -+ if (TRICLINIC(box)) -+ { -+ inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes); -+ } -+ else -+ { -+ inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes); -+ } -+ } -+ /* Check whether we need to do bondeds or correct for exclusions */ -+ if (fr->bMolPBC && -+ ((flags & GMX_FORCE_BONDED) -+ || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype))) -+ { -+ /* Since all atoms are in the rectangular or triclinic unit-cell, -+ * only single box vector shifts (2 in x) are required. -+ */ -+ set_pbc_dd(&pbc, fr->ePBC, cr->dd, TRUE, box); -+ } -+ debug_gmx(); -+ -+ if (flags & GMX_FORCE_BONDED) -+ { -+ wallcycle_sub_start(wcycle, ewcsBONDED); -+ calc_bonds(fplog, cr->ms, -+ idef, x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd, -+ DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born, -+ flags, -+ fr->bSepDVDL && do_per_step(step, ir->nstlog), step); -+ -+ /* Check if we have to determine energy differences -+ * at foreign lambda's. -+ */ -+ if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && -+ idef->ilsort != ilsortNO_FE) -+ { -+ if (idef->ilsort != ilsortFE_SORTED) -+ { -+ gmx_incons("The bonded interactions are not sorted for free energy"); -+ } -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ reset_foreign_enerdata(enerd); -+ for (j = 0; j < efptNR; j++) -+ { -+ lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); -+ } -+ calc_bonds_lambda(fplog, idef, x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md, -+ fcd, DOMAINDECOMP(cr) ? 
cr->dd->gatindex : NULL); -+ sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); -+ enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; -+ } -+ } -+ debug_gmx(); -+ -+ wallcycle_sub_stop(wcycle, ewcsBONDED); -+ } -+ -+ where(); -+ -+ *cycles_pme = 0; -+ if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)) -+ { -+ real Vlr = 0, Vcorr = 0; -+ real dvdl_long_range = 0; -+ int status = 0; -+ -+ bSB = (ir->nwall == 2); -+ if (bSB) -+ { -+ copy_mat(box, boxs); -+ svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]); -+ box_size[ZZ] *= ir->wall_ewald_zfac; -+ } -+ } -+ -+ /* Do long-range electrostatics and/or LJ-PME, including related short-range -+ * corrections. -+ */ -+ -+ clear_mat(fr->vir_el_recip); -+ clear_mat(fr->vir_lj_recip); -+ -+ if (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype)) -+ { -+ real Vlr_q = 0, Vlr_lj = 0, Vcorr_q = 0, Vcorr_lj = 0; -+ real dvdl_long_range_q = 0, dvdl_long_range_lj = 0; -+ int status = 0; -+ -+ if (EEL_PME_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype)) -+ { -+ real dvdl_long_range_correction_q = 0; -+ real dvdl_long_range_correction_lj = 0; -+ /* With the Verlet scheme exclusion forces are calculated -+ * in the non-bonded kernel. -+ */ -+ /* The TPI molecule does not have exclusions with the rest -+ * of the system and no intra-molecular PME grid -+ * contributions will be calculated in -+ * gmx_pme_calc_energy. -+ */ -+ if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) || -+ ir->ewald_geometry != eewg3D || -+ ir->epsilon_surface != 0) -+ { -+ int nthreads, t; -+ -+ wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); -+ -+ if (fr->n_tpi > 0) -+ { -+ gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); -+ } -+ -+ nthreads = gmx_omp_nthreads_get(emntBonded); -+#pragma omp parallel for num_threads(nthreads) schedule(static) -+ for (t = 0; t < nthreads; t++) -+ { -+ int s, e, i; -+ rvec *fnv; -+ tensor *vir_q, *vir_lj; -+ real *Vcorrt_q, *Vcorrt_lj, *dvdlt_q, *dvdlt_lj; -+ if (t == 0) -+ { -+ fnv = fr->f_novirsum; -+ vir_q = &fr->vir_el_recip; -+ vir_lj = &fr->vir_lj_recip; -+ Vcorrt_q = &Vcorr_q; -+ Vcorrt_lj = &Vcorr_lj; -+ dvdlt_q = &dvdl_long_range_correction_q; -+ dvdlt_lj = &dvdl_long_range_correction_lj; -+ } -+ else -+ { -+ fnv = fr->f_t[t].f; -+ vir_q = &fr->f_t[t].vir_q; -+ vir_lj = &fr->f_t[t].vir_lj; -+ Vcorrt_q = &fr->f_t[t].Vcorr_q; -+ Vcorrt_lj = &fr->f_t[t].Vcorr_lj; -+ dvdlt_q = &fr->f_t[t].dvdl[efptCOUL]; -+ dvdlt_lj = &fr->f_t[t].dvdl[efptVDW]; -+ for (i = 0; i < fr->natoms_force; i++) -+ { -+ clear_rvec(fnv[i]); -+ } -+ clear_mat(*vir_q); -+ clear_mat(*vir_lj); -+ } -+ *dvdlt_q = 0; -+ *dvdlt_lj = 0; -+ -+ ewald_LRcorrection(fr->excl_load[t], fr->excl_load[t+1], -+ cr, t, fr, -+ md->chargeA, md->chargeB, -+ md->sqrt_c6A, md->sqrt_c6B, -+ md->sigmaA, md->sigmaB, -+ md->sigma3A, md->sigma3B, -+ md->nChargePerturbed || md->nTypePerturbed, -+ ir->cutoff_scheme != ecutsVERLET, -+ excl, x, bSB ? 
boxs : box, mu_tot, -+ ir->ewald_geometry, -+ ir->epsilon_surface, -+ fnv, *vir_q, *vir_lj, -+ Vcorrt_q, Vcorrt_lj, -+ lambda[efptCOUL], lambda[efptVDW], -+ dvdlt_q, dvdlt_lj); -+ } -+ if (nthreads > 1) -+ { -+ reduce_thread_forces(fr->natoms_force, fr->f_novirsum, -+ fr->vir_el_recip, fr->vir_lj_recip, -+ &Vcorr_q, &Vcorr_lj, -+ &dvdl_long_range_correction_q, -+ &dvdl_long_range_correction_lj, -+ nthreads, fr->f_t); -+ } -+ wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); -+ } -+ -+ if (EEL_PME_EWALD(fr->eeltype) && fr->n_tpi == 0) -+ { -+ Vcorr_q += ewald_charge_correction(cr, fr, lambda[efptCOUL], box, -+ &dvdl_long_range_correction_q, -+ fr->vir_el_recip); -+ } -+ -+ PRINT_SEPDVDL("Ewald excl./charge/dip. corr.", Vcorr_q, dvdl_long_range_correction_q); -+ PRINT_SEPDVDL("Ewald excl. corr. LJ", Vcorr_lj, dvdl_long_range_correction_lj); -+ enerd->dvdl_lin[efptCOUL] += dvdl_long_range_correction_q; -+ enerd->dvdl_lin[efptVDW] += dvdl_long_range_correction_lj; -+ } -+ -+ if ((EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))) -+ { -+ if (cr->duty & DUTY_PME) -+ { -+ /* Do reciprocal PME for Coulomb and/or LJ. */ -+ assert(fr->n_tpi >= 0); -+ if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) -+ { -+ pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE; -+ if (EEL_PME(fr->eeltype)) -+ { -+ pme_flags |= GMX_PME_DO_COULOMB; -+ } -+ if (EVDW_PME(fr->vdwtype)) -+ { -+ pme_flags |= GMX_PME_DO_LJ; -+ } -+ if (flags & GMX_FORCE_FORCES) -+ { -+ pme_flags |= GMX_PME_CALC_F; -+ } -+ if (flags & GMX_FORCE_VIRIAL) -+ { -+ pme_flags |= GMX_PME_CALC_ENER_VIR; -+ } -+ if (fr->n_tpi > 0) -+ { -+ /* We don't calculate f, but we do want the potential */ -+ pme_flags |= GMX_PME_CALC_POT; -+ } -+ wallcycle_start(wcycle, ewcPMEMESH); -+ status = gmx_pme_do(fr->pmedata, -+ 0, md->homenr - fr->n_tpi, -+ x, fr->f_novirsum, -+ md->chargeA, md->chargeB, -+ md->sqrt_c6A, md->sqrt_c6B, -+ md->sigmaA, md->sigmaB, -+ bSB ? boxs : box, cr, -+ DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0, -+ DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0, -+ nrnb, wcycle, -+ fr->vir_el_recip, fr->ewaldcoeff_q, -+ fr->vir_lj_recip, fr->ewaldcoeff_lj, -+ &Vlr_q, &Vlr_lj, -+ lambda[efptCOUL], lambda[efptVDW], -+ &dvdl_long_range_q, &dvdl_long_range_lj, pme_flags); -+ *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH); -+ if (status != 0) -+ { -+ gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status); -+ } -+ /* We should try to do as little computation after -+ * this as possible, because parallel PME synchronizes -+ * the nodes, so we want all load imbalance of the -+ * rest of the force calculation to be before the PME -+ * call. DD load balancing is done on the whole time -+ * of the force call (without PME). -+ */ -+ } -+ if (fr->n_tpi > 0) -+ { -+ if (EVDW_PME(ir->vdwtype)) -+ { -+ -+ gmx_fatal(FARGS, "Test particle insertion not implemented with LJ-PME"); -+ } -+ /* Determine the PME grid energy of the test molecule -+ * with the PME grid potential of the other charges. 
-+ */ -+ gmx_pme_calc_energy(fr->pmedata, fr->n_tpi, -+ x + md->homenr - fr->n_tpi, -+ md->chargeA + md->homenr - fr->n_tpi, -+ &Vlr_q); -+ } -+ PRINT_SEPDVDL("PME mesh", Vlr_q + Vlr_lj, dvdl_long_range_q+dvdl_long_range_lj); -+ } -+ } -+ -+ if (!EEL_PME(fr->eeltype) && EEL_PME_EWALD(fr->eeltype)) -+ { -+ Vlr_q = do_ewald(ir, x, fr->f_novirsum, -+ md->chargeA, md->chargeB, -+ box_size, cr, md->homenr, -+ fr->vir_el_recip, fr->ewaldcoeff_q, -+ lambda[efptCOUL], &dvdl_long_range_q, fr->ewald_table); -+ PRINT_SEPDVDL("Ewald long-range", Vlr_q, dvdl_long_range_q); -+ } -+ -+ /* Note that with separate PME nodes we get the real energies later */ -+ enerd->dvdl_lin[efptCOUL] += dvdl_long_range_q; -+ enerd->dvdl_lin[efptVDW] += dvdl_long_range_lj; -+ enerd->term[F_COUL_RECIP] = Vlr_q + Vcorr_q; -+ enerd->term[F_LJ_RECIP] = Vlr_lj + Vcorr_lj; -+ if (debug) -+ { -+ fprintf(debug, "Vlr_q = %g, Vcorr_q = %g, Vlr_corr_q = %g\n", -+ Vlr_q, Vcorr_q, enerd->term[F_COUL_RECIP]); -+ pr_rvecs(debug, 0, "vir_el_recip after corr", fr->vir_el_recip, DIM); -+ pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS); -+ fprintf(debug, "Vlr_lj: %g, Vcorr_lj = %g, Vlr_corr_lj = %g\n", -+ Vlr_lj, Vcorr_lj, enerd->term[F_LJ_RECIP]); -+ pr_rvecs(debug, 0, "vir_lj_recip after corr", fr->vir_lj_recip, DIM); -+ } -+ } -+ else -+ { -+ /* Is there a reaction-field exclusion correction needed? */ -+ if (EEL_RF(fr->eeltype) && eelRF_NEC != fr->eeltype) -+ { -+ /* With the Verlet scheme, exclusion forces are calculated -+ * in the non-bonded kernel. -+ */ -+ if (ir->cutoff_scheme != ecutsVERLET) -+ { -+ real dvdl_rf_excl = 0; -+ enerd->term[F_RF_EXCL] = -+ RF_excl_correction(fr, graph, md, excl, x, f, -+ fr->fshift, &pbc, lambda[efptCOUL], &dvdl_rf_excl); -+ -+ enerd->dvdl_lin[efptCOUL] += dvdl_rf_excl; -+ PRINT_SEPDVDL("RF exclusion correction", -+ enerd->term[F_RF_EXCL], dvdl_rf_excl); -+ } -+ } -+ } -+ where(); -+ debug_gmx(); -+ -+ if (debug) -+ { -+ print_nrnb(debug, nrnb); -+ } -+ debug_gmx(); -+ -+#ifdef GMX_MPI -+ if (TAKETIME) -+ { -+ t2 = MPI_Wtime(); -+ MPI_Barrier(cr->mpi_comm_mygroup); -+ t3 = MPI_Wtime(); -+ fr->t_wait += t3-t2; -+ if (fr->timesteps == 11) -+ { -+ fprintf(stderr, "* PP load balancing info: rank %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n", -+ cr->nodeid, gmx_step_str(fr->timesteps, buf), -+ 100*fr->t_wait/(fr->t_wait+fr->t_fnbf), -+ (fr->t_fnbf+fr->t_wait)/fr->t_fnbf); -+ } -+ fr->timesteps++; -+ } -+#endif -+ -+ if (debug) -+ { -+ pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS); -+ } -+ -+} -+ -+void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd) -+{ -+ int i, n2; -+ -+ for (i = 0; i < F_NRE; i++) -+ { -+ enerd->term[i] = 0; -+ enerd->foreign_term[i] = 0; -+ } -+ -+ -+ for (i = 0; i < efptNR; i++) -+ { -+ enerd->dvdl_lin[i] = 0; -+ enerd->dvdl_nonlin[i] = 0; -+ } -+ -+ n2 = ngener*ngener; -+ if (debug) -+ { -+ fprintf(debug, "Creating %d sized group matrix for energies\n", n2); -+ } -+ enerd->grpp.nener = n2; -+ enerd->foreign_grpp.nener = n2; -+ for (i = 0; (i < egNR); i++) -+ { -+ snew(enerd->grpp.ener[i], n2); -+ snew(enerd->foreign_grpp.ener[i], n2); -+ } -+ -+ if (n_lambda) -+ { -+ enerd->n_lambda = 1 + n_lambda; -+ snew(enerd->enerpart_lambda, enerd->n_lambda); -+ } -+ else -+ { -+ enerd->n_lambda = 0; -+ } -+} -+ -+void destroy_enerdata(gmx_enerdata_t *enerd) -+{ -+ int i; -+ -+ for (i = 0; (i < egNR); i++) -+ { -+ sfree(enerd->grpp.ener[i]); -+ } -+ -+ for (i = 0; (i < egNR); i++) -+ { -+ 
sfree(enerd->foreign_grpp.ener[i]); -+ } -+ -+ if (enerd->n_lambda) -+ { -+ sfree(enerd->enerpart_lambda); -+ } -+} -+ -+static real sum_v(int n, real v[]) -+{ -+ real t; -+ int i; -+ -+ t = 0.0; -+ for (i = 0; (i < n); i++) -+ { -+ t = t + v[i]; -+ } -+ -+ return t; -+} -+ -+void sum_epot(gmx_grppairener_t *grpp, real *epot) -+{ -+ int i; -+ -+ /* Accumulate energies */ -+ epot[F_COUL_SR] = sum_v(grpp->nener, grpp->ener[egCOULSR]); -+ epot[F_LJ] = sum_v(grpp->nener, grpp->ener[egLJSR]); -+ epot[F_LJ14] = sum_v(grpp->nener, grpp->ener[egLJ14]); -+ epot[F_COUL14] = sum_v(grpp->nener, grpp->ener[egCOUL14]); -+ epot[F_COUL_LR] = sum_v(grpp->nener, grpp->ener[egCOULLR]); -+ epot[F_LJ_LR] = sum_v(grpp->nener, grpp->ener[egLJLR]); -+ /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */ -+ epot[F_GBPOL] += sum_v(grpp->nener, grpp->ener[egGB]); -+ -+/* lattice part of LR doesnt belong to any group -+ * and has been added earlier -+ */ -+ epot[F_BHAM] = sum_v(grpp->nener, grpp->ener[egBHAMSR]); -+ epot[F_BHAM_LR] = sum_v(grpp->nener, grpp->ener[egBHAMLR]); -+ -+ epot[F_EPOT] = 0; -+ for (i = 0; (i < F_EPOT); i++) -+ { -+ if (i != F_DISRESVIOL && i != F_ORIRESDEV) -+ { -+ epot[F_EPOT] += epot[i]; -+ } -+ } -+} -+ -+void sum_dhdl(gmx_enerdata_t *enerd, real *lambda, t_lambda *fepvals) -+{ -+ int i, j, index; -+ double dlam; -+ -+ enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW]; /* include dispersion correction */ -+ enerd->term[F_DVDL] = 0.0; -+ for (i = 0; i < efptNR; i++) -+ { -+ if (fepvals->separate_dvdl[i]) -+ { -+ /* could this be done more readably/compactly? */ -+ switch (i) -+ { -+ case (efptMASS): -+ index = F_DKDL; -+ break; -+ case (efptCOUL): -+ index = F_DVDL_COUL; -+ break; -+ case (efptVDW): -+ index = F_DVDL_VDW; -+ break; -+ case (efptBONDED): -+ index = F_DVDL_BONDED; -+ break; -+ case (efptRESTRAINT): -+ index = F_DVDL_RESTRAINT; -+ break; -+ default: -+ index = F_DVDL; -+ break; -+ } -+ enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; -+ if (debug) -+ { -+ fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n", -+ efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); -+ } -+ } -+ else -+ { -+ enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i]; -+ if (debug) -+ { -+ fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n", -+ efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]); -+ } -+ } -+ } -+ -+ /* Notes on the foreign lambda free energy difference evaluation: -+ * Adding the potential and ekin terms that depend linearly on lambda -+ * as delta lam * dvdl to the energy differences is exact. -+ * For the constraints this is not exact, but we have no other option -+ * without literally changing the lengths and reevaluating the energies at each step. -+ * (try to remedy this post 4.6 - MRS) -+ * For the non-bonded LR term we assume that the soft-core (if present) -+ * no longer affects the energy beyond the short-range cut-off, -+ * which is a very good approximation (except for exotic settings). -+ * (investigate how to overcome this post 4.6 - MRS) -+ */ -+ if (fepvals->separate_dvdl[efptBONDED]) -+ { -+ enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR]; -+ } -+ else -+ { -+ enerd->term[F_DVDL] += enerd->term[F_DVDL_CONSTR]; -+ } -+ enerd->term[F_DVDL_CONSTR] = 0; -+ -+ for (i = 0; i < fepvals->n_lambda; i++) -+ { -+ /* note we are iterating over fepvals here! 
-+ For the current lam, dlam = 0 automatically, -+ so we don't need to add anything to the -+ enerd->enerpart_lambda[0] */ -+ -+ /* we don't need to worry about dvdl_lin contributions to dE at -+ current lambda, because the contributions to the current -+ lambda are automatically zeroed */ -+ -+ for (j = 0; j < efptNR; j++) -+ { -+ /* Note that this loop is over all dhdl components, not just the separated ones */ -+ dlam = (fepvals->all_lambda[j][i]-lambda[j]); -+ enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j]; -+ if (debug) -+ { -+ fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n", -+ fepvals->all_lambda[j][i], efpt_names[j], -+ (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]), -+ dlam, enerd->dvdl_lin[j]); -+ } -+ } -+ } -+} -+ -+ -+void reset_foreign_enerdata(gmx_enerdata_t *enerd) -+{ -+ int i, j; -+ -+ /* First reset all foreign energy components. Foreign energies always called on -+ neighbor search steps */ -+ for (i = 0; (i < egNR); i++) -+ { -+ for (j = 0; (j < enerd->grpp.nener); j++) -+ { -+ enerd->foreign_grpp.ener[i][j] = 0.0; -+ } -+ } -+ -+ /* potential energy components */ -+ for (i = 0; (i <= F_EPOT); i++) -+ { -+ enerd->foreign_term[i] = 0.0; -+ } -+} -+ -+void reset_enerdata(t_forcerec *fr, gmx_bool bNS, -+ gmx_enerdata_t *enerd, -+ gmx_bool bMaster) -+{ -+ gmx_bool bKeepLR; -+ int i, j; -+ -+ /* First reset all energy components, except for the long range terms -+ * on the master at non neighbor search steps, since the long range -+ * terms have already been summed at the last neighbor search step. -+ */ -+ bKeepLR = (fr->bTwinRange && !bNS); -+ for (i = 0; (i < egNR); i++) -+ { -+ if (!(bKeepLR && bMaster && (i == egCOULLR || i == egLJLR))) -+ { -+ for (j = 0; (j < enerd->grpp.nener); j++) -+ { -+ enerd->grpp.ener[i][j] = 0.0; -+ } -+ } -+ } -+ for (i = 0; i < efptNR; i++) -+ { -+ enerd->dvdl_lin[i] = 0.0; -+ enerd->dvdl_nonlin[i] = 0.0; -+ } -+ -+ /* Normal potential energy components */ -+ for (i = 0; (i <= F_EPOT); i++) -+ { -+ enerd->term[i] = 0.0; -+ } -+ /* Initialize the dVdlambda term with the long range contribution */ -+ /* Initialize the dvdl term with the long range contribution */ -+ enerd->term[F_DVDL] = 0.0; -+ enerd->term[F_DVDL_COUL] = 0.0; -+ enerd->term[F_DVDL_VDW] = 0.0; -+ enerd->term[F_DVDL_BONDED] = 0.0; -+ enerd->term[F_DVDL_RESTRAINT] = 0.0; -+ enerd->term[F_DKDL] = 0.0; -+ if (enerd->n_lambda > 0) -+ { -+ for (i = 0; i < enerd->n_lambda; i++) -+ { -+ enerd->enerpart_lambda[i] = 0.0; -+ } -+ } -+ /* reset foreign energy data - separate function since we also call it elsewhere */ -+ reset_foreign_enerdata(enerd); -+} -diff --git a/src/gromacs/mdlib/minimize.c b/src/gromacs/mdlib/minimize.c -index 69008f5..5114fa0 100644 ---- a/src/gromacs/mdlib/minimize.c -+++ b/src/gromacs/mdlib/minimize.c -@@ -80,6 +80,13 @@ - #include "gromacs/timing/walltime_accounting.h" - #include "gromacs/imd/imd.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+extern void(*plumedcmd)(plumed,const char*,const void*); -+/* END PLUMED */ -+ - typedef struct { - t_state s; - rvec *f; -@@ -442,6 +449,43 @@ void init_em(FILE *fplog, const char *title, - - clear_rvec(mu_tot); - calc_shifts(ems->s.box, fr->shift_vec); -+ -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(cr->ms && cr->ms->nsim>1) { -+ if(MASTER(cr)) (*plumedcmd) (plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"GREX 
setMPIIntracomm",&cr->dd->mpi_comm_all); -+ }else{ -+ (*plumedcmd) (plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); -+ } -+ } -+ (*plumedcmd) (plumedmain,"GREX init",NULL); -+ } -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); -+ }else{ -+ (*plumedcmd) (plumedmain,"setMPIComm",&cr->mpi_comm_mysim); -+ } -+ } -+ (*plumedcmd) (plumedmain,"setNatoms",&top_global->natoms); -+ (*plumedcmd) (plumedmain,"setMDEngine","gromacs"); -+ (*plumedcmd) (plumedmain,"setLog",fplog); -+ real real_delta_t; -+ real_delta_t=ir->delta_t; -+ (*plumedcmd) (plumedmain,"setTimestep",&real_delta_t); -+ (*plumedcmd) (plumedmain,"init",NULL); -+ -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ } -+ } -+ /* END PLUMED */ - } - - static void finish_em(t_commrec *cr, gmx_mdoutf_t outf, -@@ -737,12 +781,34 @@ static void evaluate_energy(FILE *fplog, t_commrec *cr, - em_dd_partition_system(fplog, count, cr, top_global, inputrec, - ems, top, mdatoms, fr, vsite, constr, - nrnb, wcycle); -+ /* PLUMED */ -+ if(plumedswitch){ -+ (*plumedcmd) (plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ (*plumedcmd) (plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ /* END PLUMED */ - } - - /* Calc force & energy on new trial position */ - /* do_force always puts the charge groups in the box and shifts again - * We do not unshift, so molecules are always whole in congrad.c - */ -+ /* PLUMED */ -+ int plumedNeedsEnergy=0; -+ matrix plumed_vir; -+ if(plumedswitch){ -+ long int lstep=count; (*plumedcmd)(plumedmain,"setStepLong",&count); -+ (*plumedcmd) (plumedmain,"setPositions",&ems->s.x[0][0]); -+ (*plumedcmd) (plumedmain,"setMasses",&mdatoms->massT[0]); -+ (*plumedcmd) (plumedmain,"setCharges",&mdatoms->chargeA[0]); -+ (*plumedcmd) (plumedmain,"setBox",&ems->s.box[0][0]); -+ (*plumedcmd) (plumedmain,"prepareCalc",NULL); -+ (*plumedcmd) (plumedmain,"setForces",&ems->f[0][0]); -+ (*plumedcmd) (plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ clear_mat(plumed_vir); -+ (*plumedcmd) (plumedmain,"setVirial",&plumed_vir[0][0]); -+ } -+ /* END PLUMED */ - do_force(fplog, cr, inputrec, - count, nrnb, wcycle, top, &top_global->groups, - ems->s.box, ems->s.x, &ems->s.hist, -@@ -751,6 +817,19 @@ static void evaluate_energy(FILE *fplog, t_commrec *cr, - GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | - GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | - (bNS ? GMX_FORCE_NS | GMX_FORCE_DO_LR : 0)); -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(plumedNeedsEnergy) { -+ msmul(force_vir,2.0,plumed_vir); -+ (*plumedcmd) (plumedmain,"setEnergy",&enerd->term[F_EPOT]); -+ (*plumedcmd) (plumedmain,"performCalc",NULL); -+ msmul(plumed_vir,0.5,force_vir); -+ } else { -+ msmul(plumed_vir,0.5,plumed_vir); -+ m_add(force_vir,plumed_vir,force_vir); -+ } -+ } -+ /* END PLUMED */ - - /* Clear the unused shake virial and pressure */ - clear_mat(shake_vir); -diff --git a/src/gromacs/mdlib/minimize.c.preplumed b/src/gromacs/mdlib/minimize.c.preplumed -new file mode 100644 -index 0000000..69008f5 ---- /dev/null -+++ b/src/gromacs/mdlib/minimize.c.preplumed -@@ -0,0 +1,2906 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. 
-+ * Copyright (c) 2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include -+#include -+#include "sysstuff.h" -+#include "gromacs/utility/cstringutil.h" -+#include "network.h" -+#include "gromacs/utility/smalloc.h" -+#include "nrnb.h" -+#include "main.h" -+#include "force.h" -+#include "macros.h" -+#include "names.h" -+#include "gmx_fatal.h" -+#include "txtdump.h" -+#include "typedefs.h" -+#include "update.h" -+#include "constr.h" -+#include "vec.h" -+#include "tgroup.h" -+#include "mdebin.h" -+#include "vsite.h" -+#include "force.h" -+#include "mdrun.h" -+#include "md_support.h" -+#include "sim_util.h" -+#include "domdec.h" -+#include "mdatoms.h" -+#include "ns.h" -+#include "mtop_util.h" -+#include "pme.h" -+#include "bondf.h" -+#include "gmx_omp_nthreads.h" -+#include "md_logging.h" -+ -+#include "gromacs/fileio/confio.h" -+#include "gromacs/fileio/trajectory_writing.h" -+#include "gromacs/linearalgebra/mtxio.h" -+#include "gromacs/linearalgebra/sparsematrix.h" -+#include "gromacs/timing/wallcycle.h" -+#include "gromacs/timing/walltime_accounting.h" -+#include "gromacs/imd/imd.h" -+ -+typedef struct { -+ t_state s; -+ rvec *f; -+ real epot; -+ real fnorm; -+ real fmax; -+ int a_fmax; -+} em_state_t; -+ -+static em_state_t *init_em_state() -+{ -+ em_state_t *ems; -+ -+ snew(ems, 1); -+ -+ /* does this need to be here? Should the array be declared differently (staticaly)in the state definition? 
*/ -+ snew(ems->s.lambda, efptNR); -+ -+ return ems; -+} -+ -+static void print_em_start(FILE *fplog, -+ t_commrec *cr, -+ gmx_walltime_accounting_t walltime_accounting, -+ gmx_wallcycle_t wcycle, -+ const char *name) -+{ -+ walltime_accounting_start(walltime_accounting); -+ wallcycle_start(wcycle, ewcRUN); -+ print_start(fplog, cr, walltime_accounting, name); -+} -+static void em_time_end(gmx_walltime_accounting_t walltime_accounting, -+ gmx_wallcycle_t wcycle) -+{ -+ wallcycle_stop(wcycle, ewcRUN); -+ -+ walltime_accounting_end(walltime_accounting); -+} -+ -+static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) -+{ -+ fprintf(out, "\n"); -+ fprintf(out, "%s:\n", minimizer); -+ fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); -+ fprintf(out, " Number of steps = %12d\n", nsteps); -+} -+ -+static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain) -+{ -+ char buffer[2048]; -+ if (bLastStep) -+ { -+ sprintf(buffer, -+ "\nEnergy minimization reached the maximum number " -+ "of steps before the forces reached the requested " -+ "precision Fmax < %g.\n", ftol); -+ } -+ else -+ { -+ sprintf(buffer, -+ "\nEnergy minimization has stopped, but the forces have " -+ "not converged to the requested precision Fmax < %g (which " -+ "may not be possible for your system). It stopped " -+ "because the algorithm tried to make a new step whose size " -+ "was too small, or there was no change in the energy since " -+ "last step. Either way, we regard the minimization as " -+ "converged to within the available machine precision, " -+ "given your starting configuration and EM parameters.\n%s%s", -+ ftol, -+ sizeof(real) < sizeof(double) ? -+ "\nDouble precision normally gives you higher accuracy, but " -+ "this is often not needed for preparing to run molecular " -+ "dynamics.\n" : -+ "", -+ bConstrain ? -+ "You might need to increase your constraint accuracy, or turn\n" -+ "off constraints altogether (set constraints = none in mdp file)\n" : -+ ""); -+ } -+ fputs(wrap_lines(buffer, 78, 0, FALSE), fp); -+} -+ -+ -+ -+static void print_converged(FILE *fp, const char *alg, real ftol, -+ gmx_int64_t count, gmx_bool bDone, gmx_int64_t nsteps, -+ real epot, real fmax, int nfmax, real fnorm) -+{ -+ char buf[STEPSTRSIZE]; -+ -+ if (bDone) -+ { -+ fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", -+ alg, ftol, gmx_step_str(count, buf)); -+ } -+ else if (count < nsteps) -+ { -+ fprintf(fp, "\n%s converged to machine precision in %s steps,\n" -+ "but did not reach the requested Fmax < %g.\n", -+ alg, gmx_step_str(count, buf), ftol); -+ } -+ else -+ { -+ fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", -+ alg, ftol, gmx_step_str(count, buf)); -+ } -+ -+#ifdef GMX_DOUBLE -+ fprintf(fp, "Potential Energy = %21.14e\n", epot); -+ fprintf(fp, "Maximum force = %21.14e on atom %d\n", fmax, nfmax+1); -+ fprintf(fp, "Norm of force = %21.14e\n", fnorm); -+#else -+ fprintf(fp, "Potential Energy = %14.7e\n", epot); -+ fprintf(fp, "Maximum force = %14.7e on atom %d\n", fmax, nfmax+1); -+ fprintf(fp, "Norm of force = %14.7e\n", fnorm); -+#endif -+} -+ -+static void get_f_norm_max(t_commrec *cr, -+ t_grpopts *opts, t_mdatoms *mdatoms, rvec *f, -+ real *fnorm, real *fmax, int *a_fmax) -+{ -+ double fnorm2, *sum; -+ real fmax2, fmax2_0, fam; -+ int la_max, a_max, start, end, i, m, gf; -+ -+ /* This routine finds the largest force and returns it. -+ * On parallel machines the global max is taken. 
-+ */ -+ fnorm2 = 0; -+ fmax2 = 0; -+ la_max = -1; -+ gf = 0; -+ start = 0; -+ end = mdatoms->homenr; -+ if (mdatoms->cFREEZE) -+ { -+ for (i = start; i < end; i++) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ fam = 0; -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ fam += sqr(f[i][m]); -+ } -+ } -+ fnorm2 += fam; -+ if (fam > fmax2) -+ { -+ fmax2 = fam; -+ la_max = i; -+ } -+ } -+ } -+ else -+ { -+ for (i = start; i < end; i++) -+ { -+ fam = norm2(f[i]); -+ fnorm2 += fam; -+ if (fam > fmax2) -+ { -+ fmax2 = fam; -+ la_max = i; -+ } -+ } -+ } -+ -+ if (la_max >= 0 && DOMAINDECOMP(cr)) -+ { -+ a_max = cr->dd->gatindex[la_max]; -+ } -+ else -+ { -+ a_max = la_max; -+ } -+ if (PAR(cr)) -+ { -+ snew(sum, 2*cr->nnodes+1); -+ sum[2*cr->nodeid] = fmax2; -+ sum[2*cr->nodeid+1] = a_max; -+ sum[2*cr->nnodes] = fnorm2; -+ gmx_sumd(2*cr->nnodes+1, sum, cr); -+ fnorm2 = sum[2*cr->nnodes]; -+ /* Determine the global maximum */ -+ for (i = 0; i < cr->nnodes; i++) -+ { -+ if (sum[2*i] > fmax2) -+ { -+ fmax2 = sum[2*i]; -+ a_max = (int)(sum[2*i+1] + 0.5); -+ } -+ } -+ sfree(sum); -+ } -+ -+ if (fnorm) -+ { -+ *fnorm = sqrt(fnorm2); -+ } -+ if (fmax) -+ { -+ *fmax = sqrt(fmax2); -+ } -+ if (a_fmax) -+ { -+ *a_fmax = a_max; -+ } -+} -+ -+static void get_state_f_norm_max(t_commrec *cr, -+ t_grpopts *opts, t_mdatoms *mdatoms, -+ em_state_t *ems) -+{ -+ get_f_norm_max(cr, opts, mdatoms, ems->f, &ems->fnorm, &ems->fmax, &ems->a_fmax); -+} -+ -+void init_em(FILE *fplog, const char *title, -+ t_commrec *cr, t_inputrec *ir, -+ t_state *state_global, gmx_mtop_t *top_global, -+ em_state_t *ems, gmx_localtop_t **top, -+ rvec **f, rvec **f_global, -+ t_nrnb *nrnb, rvec mu_tot, -+ t_forcerec *fr, gmx_enerdata_t **enerd, -+ t_graph **graph, t_mdatoms *mdatoms, gmx_global_stat_t *gstat, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int nfile, const t_filenm fnm[], -+ gmx_mdoutf_t *outf, t_mdebin **mdebin, -+ int imdport, unsigned long gmx_unused Flags, -+ gmx_wallcycle_t wcycle) -+{ -+ int i; -+ real dvdl_constr; -+ -+ if (fplog) -+ { -+ fprintf(fplog, "Initiating %s\n", title); -+ } -+ -+ state_global->ngtc = 0; -+ -+ /* Initialize lambda variables */ -+ initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, NULL); -+ -+ init_nrnb(nrnb); -+ -+ /* Interactive molecular dynamics */ -+ init_IMD(ir, cr, top_global, fplog, 1, state_global->x, -+ nfile, fnm, NULL, imdport, Flags); -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ *top = dd_init_local_top(top_global); -+ -+ dd_init_local_state(cr->dd, state_global, &ems->s); -+ -+ *f = NULL; -+ -+ /* Distribute the charge groups over the nodes from the master node */ -+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ &ems->s, &ems->f, mdatoms, *top, -+ fr, vsite, NULL, constr, -+ nrnb, NULL, FALSE); -+ dd_store_state(cr->dd, &ems->s); -+ -+ if (ir->nstfout) -+ { -+ snew(*f_global, top_global->natoms); -+ } -+ else -+ { -+ *f_global = NULL; -+ } -+ *graph = NULL; -+ } -+ else -+ { -+ snew(*f, top_global->natoms); -+ -+ /* Just copy the state */ -+ ems->s = *state_global; -+ snew(ems->s.x, ems->s.nalloc); -+ snew(ems->f, ems->s.nalloc); -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(state_global->x[i], ems->s.x[i]); -+ } -+ copy_mat(state_global->box, ems->s.box); -+ -+ *top = gmx_mtop_generate_local_top(top_global, ir); -+ *f_global = *f; -+ -+ forcerec_set_excl_load(fr, *top); -+ -+ setup_bonded_threading(fr, &(*top)->idef); -+ -+ if (ir->ePBC != epbcNONE && !fr->bMolPBC) -+ { -+ *graph = 
mk_graph(fplog, &((*top)->idef), 0, top_global->natoms, FALSE, FALSE); -+ } -+ else -+ { -+ *graph = NULL; -+ } -+ -+ atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms); -+ update_mdatoms(mdatoms, state_global->lambda[efptFEP]); -+ -+ if (vsite) -+ { -+ set_vsite_top(vsite, *top, mdatoms, cr); -+ } -+ } -+ -+ if (constr) -+ { -+ if (ir->eConstrAlg == econtSHAKE && -+ gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) -+ { -+ gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", -+ econstr_names[econtSHAKE], econstr_names[econtLINCS]); -+ } -+ -+ if (!DOMAINDECOMP(cr)) -+ { -+ set_constraints(constr, *top, ir, mdatoms, cr); -+ } -+ -+ if (!ir->bContinuation) -+ { -+ /* Constrain the starting coordinates */ -+ dvdl_constr = 0; -+ constrain(PAR(cr) ? NULL : fplog, TRUE, TRUE, constr, &(*top)->idef, -+ ir, NULL, cr, -1, 0, 1.0, mdatoms, -+ ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box, -+ ems->s.lambda[efptFEP], &dvdl_constr, -+ NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); -+ } -+ } -+ -+ if (PAR(cr)) -+ { -+ *gstat = global_stat_init(ir); -+ } -+ -+ *outf = init_mdoutf(fplog, nfile, fnm, 0, cr, ir, top_global, NULL, wcycle); -+ -+ snew(*enerd, 1); -+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, -+ *enerd); -+ -+ if (mdebin != NULL) -+ { -+ /* Init bin for energy stuff */ -+ *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, NULL); -+ } -+ -+ clear_rvec(mu_tot); -+ calc_shifts(ems->s.box, fr->shift_vec); -+} -+ -+static void finish_em(t_commrec *cr, gmx_mdoutf_t outf, -+ gmx_walltime_accounting_t walltime_accounting, -+ gmx_wallcycle_t wcycle) -+{ -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell the PME only node to finish */ -+ gmx_pme_send_finish(cr); -+ } -+ -+ done_mdoutf(outf); -+ -+ em_time_end(walltime_accounting, wcycle); -+} -+ -+static void swap_em_state(em_state_t *ems1, em_state_t *ems2) -+{ -+ em_state_t tmp; -+ -+ tmp = *ems1; -+ *ems1 = *ems2; -+ *ems2 = tmp; -+} -+ -+static void copy_em_coords(em_state_t *ems, t_state *state) -+{ -+ int i; -+ -+ for (i = 0; (i < state->natoms); i++) -+ { -+ copy_rvec(ems->s.x[i], state->x[i]); -+ } -+} -+ -+static void write_em_traj(FILE *fplog, t_commrec *cr, -+ gmx_mdoutf_t outf, -+ gmx_bool bX, gmx_bool bF, const char *confout, -+ gmx_mtop_t *top_global, -+ t_inputrec *ir, gmx_int64_t step, -+ em_state_t *state, -+ t_state *state_global, rvec *f_global) -+{ -+ int mdof_flags; -+ gmx_bool bIMDout = FALSE; -+ -+ -+ /* Shall we do IMD output? 
*/ -+ if (ir->bIMD) -+ { -+ bIMDout = do_per_step(step, IMD_get_step(ir->imd->setup)); -+ } -+ -+ if ((bX || bF || bIMDout || confout != NULL) && !DOMAINDECOMP(cr)) -+ { -+ copy_em_coords(state, state_global); -+ f_global = state->f; -+ } -+ -+ mdof_flags = 0; -+ if (bX) -+ { -+ mdof_flags |= MDOF_X; -+ } -+ if (bF) -+ { -+ mdof_flags |= MDOF_F; -+ } -+ -+ /* If we want IMD output, set appropriate MDOF flag */ -+ if (ir->bIMD) -+ { -+ mdof_flags |= MDOF_IMD; -+ } -+ -+ mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, -+ top_global, step, (double)step, -+ &state->s, state_global, state->f, f_global); -+ -+ if (confout != NULL && MASTER(cr)) -+ { -+ if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) -+ { -+ /* Make molecules whole only for confout writing */ -+ do_pbc_mtop(fplog, ir->ePBC, state_global->box, top_global, -+ state_global->x); -+ } -+ -+ write_sto_conf_mtop(confout, -+ *top_global->name, top_global, -+ state_global->x, NULL, ir->ePBC, state_global->box); -+ } -+} -+ -+static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, -+ gmx_bool bMolPBC, -+ em_state_t *ems1, real a, rvec *f, em_state_t *ems2, -+ gmx_constr_t constr, gmx_localtop_t *top, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_int64_t count) -+ -+{ -+ t_state *s1, *s2; -+ int i; -+ int start, end; -+ rvec *x1, *x2; -+ real dvdl_constr; -+ int nthreads gmx_unused; -+ -+ s1 = &ems1->s; -+ s2 = &ems2->s; -+ -+ if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) -+ { -+ gmx_incons("state mismatch in do_em_step"); -+ } -+ -+ s2->flags = s1->flags; -+ -+ if (s2->nalloc != s1->nalloc) -+ { -+ s2->nalloc = s1->nalloc; -+ srenew(s2->x, s1->nalloc); -+ srenew(ems2->f, s1->nalloc); -+ if (s2->flags & (1<cg_p, s1->nalloc); -+ } -+ } -+ -+ s2->natoms = s1->natoms; -+ copy_mat(s1->box, s2->box); -+ /* Copy free energy state */ -+ for (i = 0; i < efptNR; i++) -+ { -+ s2->lambda[i] = s1->lambda[i]; -+ } -+ copy_mat(s1->box, s2->box); -+ -+ start = 0; -+ end = md->homenr; -+ -+ x1 = s1->x; -+ x2 = s2->x; -+ -+ nthreads = gmx_omp_nthreads_get(emntUpdate); -+#pragma omp parallel num_threads(nthreads) -+ { -+ int gf, i, m; -+ -+ gf = 0; -+#pragma omp for schedule(static) nowait -+ for (i = start; i < end; i++) -+ { -+ if (md->cFREEZE) -+ { -+ gf = md->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (ir->opts.nFreeze[gf][m]) -+ { -+ x2[i][m] = x1[i][m]; -+ } -+ else -+ { -+ x2[i][m] = x1[i][m] + a*f[i][m]; -+ } -+ } -+ } -+ -+ if (s2->flags & (1<cg_p; -+ x2 = s2->cg_p; -+#pragma omp for schedule(static) nowait -+ for (i = start; i < end; i++) -+ { -+ copy_rvec(x1[i], x2[i]); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ s2->ddp_count = s1->ddp_count; -+ if (s2->cg_gl_nalloc < s1->cg_gl_nalloc) -+ { -+#pragma omp barrier -+ s2->cg_gl_nalloc = s1->cg_gl_nalloc; -+ srenew(s2->cg_gl, s2->cg_gl_nalloc); -+#pragma omp barrier -+ } -+ s2->ncg_gl = s1->ncg_gl; -+#pragma omp for schedule(static) nowait -+ for (i = 0; i < s2->ncg_gl; i++) -+ { -+ s2->cg_gl[i] = s1->cg_gl[i]; -+ } -+ s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; -+ } -+ } -+ -+ if (constr) -+ { -+ wallcycle_start(wcycle, ewcCONSTR); -+ dvdl_constr = 0; -+ constrain(NULL, TRUE, TRUE, constr, &top->idef, -+ ir, NULL, cr, count, 0, 1.0, md, -+ s1->x, s2->x, NULL, bMolPBC, s2->box, -+ s2->lambda[efptBONDED], &dvdl_constr, -+ NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); -+ wallcycle_stop(wcycle, ewcCONSTR); -+ } -+} -+ -+static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr, -+ gmx_mtop_t *top_global, t_inputrec 
*ir, -+ em_state_t *ems, gmx_localtop_t *top, -+ t_mdatoms *mdatoms, t_forcerec *fr, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle) -+{ -+ /* Repartition the domain decomposition */ -+ wallcycle_start(wcycle, ewcDOMDEC); -+ dd_partition_system(fplog, step, cr, FALSE, 1, -+ NULL, top_global, ir, -+ &ems->s, &ems->f, -+ mdatoms, top, fr, vsite, NULL, constr, -+ nrnb, wcycle, FALSE); -+ dd_store_state(cr->dd, &ems->s); -+ wallcycle_stop(wcycle, ewcDOMDEC); -+} -+ -+static void evaluate_energy(FILE *fplog, t_commrec *cr, -+ gmx_mtop_t *top_global, -+ em_state_t *ems, gmx_localtop_t *top, -+ t_inputrec *inputrec, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_global_stat_t gstat, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ t_fcdata *fcd, -+ t_graph *graph, t_mdatoms *mdatoms, -+ t_forcerec *fr, rvec mu_tot, -+ gmx_enerdata_t *enerd, tensor vir, tensor pres, -+ gmx_int64_t count, gmx_bool bFirst) -+{ -+ real t; -+ gmx_bool bNS; -+ int nabnsb; -+ tensor force_vir, shake_vir, ekin; -+ real dvdl_constr, prescorr, enercorr, dvdlcorr; -+ real terminate = 0; -+ -+ /* Set the time to the initial time, the time does not change during EM */ -+ t = inputrec->init_t; -+ -+ if (bFirst || -+ (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) -+ { -+ /* This is the first state or an old state used before the last ns */ -+ bNS = TRUE; -+ } -+ else -+ { -+ bNS = FALSE; -+ if (inputrec->nstlist > 0) -+ { -+ bNS = TRUE; -+ } -+ else if (inputrec->nstlist == -1) -+ { -+ nabnsb = natoms_beyond_ns_buffer(inputrec, fr, &top->cgs, NULL, ems->s.x); -+ if (PAR(cr)) -+ { -+ gmx_sumi(1, &nabnsb, cr); -+ } -+ bNS = (nabnsb > 0); -+ } -+ } -+ -+ if (vsite) -+ { -+ construct_vsites(vsite, ems->s.x, 1, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, ems->s.box); -+ } -+ -+ if (DOMAINDECOMP(cr) && bNS) -+ { -+ /* Repartition the domain decomposition */ -+ em_dd_partition_system(fplog, count, cr, top_global, inputrec, -+ ems, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Calc force & energy on new trial position */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole in congrad.c -+ */ -+ do_force(fplog, cr, inputrec, -+ count, nrnb, wcycle, top, &top_global->groups, -+ ems->s.box, ems->s.x, &ems->s.hist, -+ ems->f, force_vir, mdatoms, enerd, fcd, -+ ems->s.lambda, graph, fr, vsite, mu_tot, t, NULL, NULL, TRUE, -+ GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | -+ GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | -+ (bNS ? 
GMX_FORCE_NS | GMX_FORCE_DO_LR : 0)); -+ -+ /* Clear the unused shake virial and pressure */ -+ clear_mat(shake_vir); -+ clear_mat(pres); -+ -+ /* Communicate stuff when parallel */ -+ if (PAR(cr) && inputrec->eI != eiNM) -+ { -+ wallcycle_start(wcycle, ewcMoveE); -+ -+ global_stat(fplog, gstat, cr, enerd, force_vir, shake_vir, mu_tot, -+ inputrec, NULL, NULL, NULL, 1, &terminate, -+ top_global, &ems->s, FALSE, -+ CGLO_ENERGY | -+ CGLO_PRESSURE | -+ CGLO_CONSTRAINT | -+ CGLO_FIRSTITERATE); -+ -+ wallcycle_stop(wcycle, ewcMoveE); -+ } -+ -+ /* Calculate long range corrections to pressure and energy */ -+ calc_dispcorr(fplog, inputrec, fr, count, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW], -+ pres, force_vir, &prescorr, &enercorr, &dvdlcorr); -+ enerd->term[F_DISPCORR] = enercorr; -+ enerd->term[F_EPOT] += enercorr; -+ enerd->term[F_PRES] += prescorr; -+ enerd->term[F_DVDL] += dvdlcorr; -+ -+ ems->epot = enerd->term[F_EPOT]; -+ -+ if (constr) -+ { -+ /* Project out the constraint components of the force */ -+ wallcycle_start(wcycle, ewcCONSTR); -+ dvdl_constr = 0; -+ constrain(NULL, FALSE, FALSE, constr, &top->idef, -+ inputrec, NULL, cr, count, 0, 1.0, mdatoms, -+ ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box, -+ ems->s.lambda[efptBONDED], &dvdl_constr, -+ NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0); -+ if (fr->bSepDVDL && fplog) -+ { -+ gmx_print_sepdvdl(fplog, "Constraints", t, dvdl_constr); -+ } -+ enerd->term[F_DVDL_CONSTR] += dvdl_constr; -+ m_add(force_vir, shake_vir, vir); -+ wallcycle_stop(wcycle, ewcCONSTR); -+ } -+ else -+ { -+ copy_mat(force_vir, vir); -+ } -+ -+ clear_mat(ekin); -+ enerd->term[F_PRES] = -+ calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); -+ -+ sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); -+ -+ if (EI_ENERGY_MINIMIZATION(inputrec->eI)) -+ { -+ get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, ems); -+ } -+} -+ -+static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, -+ gmx_mtop_t *mtop, -+ em_state_t *s_min, em_state_t *s_b) -+{ -+ rvec *fm, *fb, *fmg; -+ t_block *cgs_gl; -+ int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; -+ double partsum; -+ unsigned char *grpnrFREEZE; -+ -+ if (debug) -+ { -+ fprintf(debug, "Doing reorder_partsum\n"); -+ } -+ -+ fm = s_min->f; -+ fb = s_b->f; -+ -+ cgs_gl = dd_charge_groups_global(cr->dd); -+ index = cgs_gl->index; -+ -+ /* Collect fm in a global vector fmg. -+ * This conflicts with the spirit of domain decomposition, -+ * but to fully optimize this a much more complicated algorithm is required. 
-+ */ -+ snew(fmg, mtop->natoms); -+ -+ ncg = s_min->s.ncg_gl; -+ cg_gl = s_min->s.cg_gl; -+ i = 0; -+ for (c = 0; c < ncg; c++) -+ { -+ cg = cg_gl[c]; -+ a0 = index[cg]; -+ a1 = index[cg+1]; -+ for (a = a0; a < a1; a++) -+ { -+ copy_rvec(fm[i], fmg[a]); -+ i++; -+ } -+ } -+ gmx_sum(mtop->natoms*3, fmg[0], cr); -+ -+ /* Now we will determine the part of the sum for the cgs in state s_b */ -+ ncg = s_b->s.ncg_gl; -+ cg_gl = s_b->s.cg_gl; -+ partsum = 0; -+ i = 0; -+ gf = 0; -+ grpnrFREEZE = mtop->groups.grpnr[egcFREEZE]; -+ for (c = 0; c < ncg; c++) -+ { -+ cg = cg_gl[c]; -+ a0 = index[cg]; -+ a1 = index[cg+1]; -+ for (a = a0; a < a1; a++) -+ { -+ if (mdatoms->cFREEZE && grpnrFREEZE) -+ { -+ gf = grpnrFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; -+ } -+ } -+ i++; -+ } -+ } -+ -+ sfree(fmg); -+ -+ return partsum; -+} -+ -+static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, -+ gmx_mtop_t *mtop, -+ em_state_t *s_min, em_state_t *s_b) -+{ -+ rvec *fm, *fb; -+ double sum; -+ int gf, i, m; -+ -+ /* This is just the classical Polak-Ribiere calculation of beta; -+ * it looks a bit complicated since we take freeze groups into account, -+ * and might have to sum it in parallel runs. -+ */ -+ -+ if (!DOMAINDECOMP(cr) || -+ (s_min->s.ddp_count == cr->dd->ddp_count && -+ s_b->s.ddp_count == cr->dd->ddp_count)) -+ { -+ fm = s_min->f; -+ fb = s_b->f; -+ sum = 0; -+ gf = 0; -+ /* This part of code can be incorrect with DD, -+ * since the atom ordering in s_b and s_min might differ. -+ */ -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!opts->nFreeze[gf][m]) -+ { -+ sum += (fb[i][m] - fm[i][m])*fb[i][m]; -+ } -+ } -+ } -+ } -+ else -+ { -+ /* We need to reorder cgs while summing */ -+ sum = reorder_partsum(cr, opts, mdatoms, mtop, s_min, s_b); -+ } -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &sum, cr); -+ } -+ -+ return sum/sqr(s_min->fnorm); -+} -+ -+double do_cg(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ const char *CG = "Polak-Ribiere Conjugate Gradients"; -+ -+ em_state_t *s_min, *s_a, *s_b, *s_c; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ rvec *f_global, *p, *sf, *sfm; -+ double gpa, gpb, gpc, tmp, sum[2], minstep; -+ real fnormn; -+ real stepsize; -+ real a, b, c, beta = 0.0; -+ real epot_repl = 0; -+ real pnorm; -+ t_mdebin *mdebin; -+ gmx_bool converged, foundlower; -+ rvec mu_tot; -+ gmx_bool do_log = FALSE, do_ene = FALSE, do_x, do_f; -+ tensor vir, pres; -+ int number_steps, neval = 0, nstcg = inputrec->nstcgsteep; -+ gmx_mdoutf_t outf; -+ int i, m, gf, step, nminstep; -+ real terminate = 0; -+ 
-+ step = 0; -+ -+ s_min = init_em_state(); -+ s_a = init_em_state(); -+ s_b = init_em_state(); -+ s_c = init_em_state(); -+ -+ /* Init em and store the local state in s_min */ -+ init_em(fplog, CG, cr, inputrec, -+ state_global, top_global, s_min, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle); -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, CG); -+ -+ /* Max number of steps */ -+ number_steps = inputrec->nsteps; -+ -+ if (MASTER(cr)) -+ { -+ sp_header(stderr, CG, inputrec->em_tol, number_steps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, CG, inputrec->em_tol, number_steps); -+ } -+ -+ /* Call the force routine and some auxiliary (neighboursearching etc.) */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole in congrad.c -+ */ -+ evaluate_energy(fplog, cr, -+ top_global, s_min, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ where(); -+ -+ if (MASTER(cr)) -+ { -+ /* Copy stuff to the energy bin for easy printing etc. */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ where(); -+ -+ /* Estimate/guess the initial stepsize */ -+ stepsize = inputrec->em_stepsize/s_min->fnorm; -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, " F-max = %12.5e on atom %d\n", -+ s_min->fmax, s_min->a_fmax+1); -+ fprintf(stderr, " F-Norm = %12.5e\n", -+ s_min->fnorm/sqrt(state_global->natoms)); -+ fprintf(stderr, "\n"); -+ /* and copy to the log file too... */ -+ fprintf(fplog, " F-max = %12.5e on atom %d\n", -+ s_min->fmax, s_min->a_fmax+1); -+ fprintf(fplog, " F-Norm = %12.5e\n", -+ s_min->fnorm/sqrt(state_global->natoms)); -+ fprintf(fplog, "\n"); -+ } -+ /* Start the loop over CG steps. -+ * Each successful step is counted, and we continue until -+ * we either converge or reach the max number of steps. -+ */ -+ converged = FALSE; -+ for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) -+ { -+ -+ /* start taking steps in a new direction -+ * First time we enter the routine, beta=0, and the direction is -+ * simply the negative gradient. -+ */ -+ -+ /* Calculate the new direction in p, and the gradient in this direction, gpa */ -+ p = s_min->s.cg_p; -+ sf = s_min->f; -+ gpa = 0; -+ gf = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ if (!inputrec->opts.nFreeze[gf][m]) -+ { -+ p[i][m] = sf[i][m] + beta*p[i][m]; -+ gpa -= p[i][m]*sf[i][m]; -+ /* f is negative gradient, thus the sign */ -+ } -+ else -+ { -+ p[i][m] = 0; -+ } -+ } -+ } -+ -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpa, cr); -+ } -+ -+ /* Calculate the norm of the search vector */ -+ get_f_norm_max(cr, &(inputrec->opts), mdatoms, p, &pnorm, NULL, NULL); -+ -+ /* Just in case stepsize reaches zero due to numerical precision... 
*/ -+ if (stepsize <= 0) -+ { -+ stepsize = inputrec->em_stepsize/pnorm; -+ } -+ -+ /* -+ * Double check the value of the derivative in the search direction. -+ * If it is positive it must be due to the old information in the -+ * CG formula, so just remove that and start over with beta=0. -+ * This corresponds to a steepest descent step. -+ */ -+ if (gpa > 0) -+ { -+ beta = 0; -+ step--; /* Don't count this step since we are restarting */ -+ continue; /* Go back to the beginning of the big for-loop */ -+ } -+ -+ /* Calculate minimum allowed stepsize, before the average (norm) -+ * relative change in coordinate is smaller than precision -+ */ -+ minstep = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ tmp = fabs(s_min->s.x[i][m]); -+ if (tmp < 1.0) -+ { -+ tmp = 1.0; -+ } -+ tmp = p[i][m]/tmp; -+ minstep += tmp*tmp; -+ } -+ } -+ /* Add up from all CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &minstep, cr); -+ } -+ -+ minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms)); -+ -+ if (stepsize < minstep) -+ { -+ converged = TRUE; -+ break; -+ } -+ -+ /* Write coordinates if necessary */ -+ do_x = do_per_step(step, inputrec->nstxout); -+ do_f = do_per_step(step, inputrec->nstfout); -+ -+ write_em_traj(fplog, cr, outf, do_x, do_f, NULL, -+ top_global, inputrec, step, -+ s_min, state_global, f_global); -+ -+ /* Take a step downhill. -+ * In theory, we should minimize the function along this direction. -+ * That is quite possible, but it turns out to take 5-10 function evaluations -+ * for each line. However, we dont really need to find the exact minimum - -+ * it is much better to start a new CG step in a modified direction as soon -+ * as we are close to it. This will save a lot of energy evaluations. -+ * -+ * In practice, we just try to take a single step. -+ * If it worked (i.e. lowered the energy), we increase the stepsize but -+ * the continue straight to the next CG step without trying to find any minimum. -+ * If it didn't work (higher energy), there must be a minimum somewhere between -+ * the old position and the new one. -+ * -+ * Due to the finite numerical accuracy, it turns out that it is a good idea -+ * to even accept a SMALL increase in energy, if the derivative is still downhill. -+ * This leads to lower final energies in the tests I've done. 
/ Erik -+ */ -+ s_a->epot = s_min->epot; -+ a = 0.0; -+ c = a + stepsize; /* reference position along line is zero */ -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) -+ { -+ em_dd_partition_system(fplog, step, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Take a trial step (new coords in s_c) */ -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, s_min->s.cg_p, s_c, -+ constr, top, nrnb, wcycle, -1); -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ evaluate_energy(fplog, cr, -+ top_global, s_c, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, FALSE); -+ -+ /* Calc derivative along line */ -+ p = s_c->s.cg_p; -+ sf = s_c->f; -+ gpc = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ gpc -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ -+ } -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpc, cr); -+ } -+ -+ /* This is the max amount of increase in energy we tolerate */ -+ tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot); -+ -+ /* Accept the step if the energy is lower, or if it is not significantly higher -+ * and the line derivative is still negative. -+ */ -+ if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) -+ { -+ foundlower = TRUE; -+ /* Great, we found a better energy. Increase step for next iteration -+ * if we are still going down, decrease it otherwise -+ */ -+ if (gpc < 0) -+ { -+ stepsize *= 1.618034; /* The golden section */ -+ } -+ else -+ { -+ stepsize *= 0.618034; /* 1/golden section */ -+ } -+ } -+ else -+ { -+ /* New energy is the same or higher. We will have to do some work -+ * to find a smaller value in the interval. Take smaller step next time! -+ */ -+ foundlower = FALSE; -+ stepsize *= 0.618034; -+ } -+ -+ -+ -+ -+ /* OK, if we didn't find a lower value we will have to locate one now - there must -+ * be one in the interval [a=0,c]. -+ * The same thing is valid here, though: Don't spend dozens of iterations to find -+ * the line minimum. We try to interpolate based on the derivative at the endpoints, -+ * and only continue until we find a lower value. In most cases this means 1-2 iterations. -+ * -+ * I also have a safeguard for potentially really patological functions so we never -+ * take more than 20 steps before we give up ... -+ * -+ * If we already found a lower value we just skip this step and continue to the update. -+ */ -+ if (!foundlower) -+ { -+ nminstep = 0; -+ -+ do -+ { -+ /* Select a new trial point. -+ * If the derivatives at points a & c have different sign we interpolate to zero, -+ * otherwise just do a bisection. 
-+ */ -+ if (gpa < 0 && gpc > 0) -+ { -+ b = a + gpa*(a-c)/(gpc-gpa); -+ } -+ else -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* safeguard if interpolation close to machine accuracy causes errors: -+ * never go outside the interval -+ */ -+ if (b <= a || b >= c) -+ { -+ b = 0.5*(a+c); -+ } -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) -+ { -+ /* Reload the old state */ -+ em_dd_partition_system(fplog, -1, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ -+ /* Take a trial step to this new point - new coords in s_b */ -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, s_min->s.cg_p, s_b, -+ constr, top, nrnb, wcycle, -1); -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ evaluate_energy(fplog, cr, -+ top_global, s_b, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, FALSE); -+ -+ /* p does not change within a step, but since the domain decomposition -+ * might change, we have to use cg_p of s_b here. -+ */ -+ p = s_b->s.cg_p; -+ sf = s_b->f; -+ gpb = 0; -+ for (i = 0; i < mdatoms->homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ gpb -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ -+ } -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpb, cr); -+ } -+ -+ if (debug) -+ { -+ fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", -+ s_a->epot, s_b->epot, s_c->epot, gpb); -+ } -+ -+ epot_repl = s_b->epot; -+ -+ /* Keep one of the intervals based on the value of the derivative at the new point */ -+ if (gpb > 0) -+ { -+ /* Replace c endpoint with b */ -+ swap_em_state(s_b, s_c); -+ c = b; -+ gpc = gpb; -+ } -+ else -+ { -+ /* Replace a endpoint with b */ -+ swap_em_state(s_b, s_a); -+ a = b; -+ gpa = gpb; -+ } -+ -+ /* -+ * Stop search as soon as we find a value smaller than the endpoints. -+ * Never run more than 20 steps, no matter what. -+ */ -+ nminstep++; -+ } -+ while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && -+ (nminstep < 20)); -+ -+ if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || -+ nminstep >= 20) -+ { -+ /* OK. We couldn't find a significantly lower energy. -+ * If beta==0 this was steepest descent, and then we give up. -+ * If not, set beta=0 and restart with steepest descent before quitting. -+ */ -+ if (beta == 0.0) -+ { -+ /* Converged */ -+ converged = TRUE; -+ break; -+ } -+ else -+ { -+ /* Reset memory before giving up */ -+ beta = 0.0; -+ continue; -+ } -+ } -+ -+ /* Select min energy state of A & C, put the best in B. -+ */ -+ if (s_c->epot < s_a->epot) -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", -+ s_c->epot, s_a->epot); -+ } -+ swap_em_state(s_b, s_c); -+ gpb = gpc; -+ b = c; -+ } -+ else -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", -+ s_a->epot, s_c->epot); -+ } -+ swap_em_state(s_b, s_a); -+ gpb = gpa; -+ b = a; -+ } -+ -+ } -+ else -+ { -+ if (debug) -+ { -+ fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", -+ s_c->epot); -+ } -+ swap_em_state(s_b, s_c); -+ gpb = gpc; -+ b = c; -+ } -+ -+ /* new search direction */ -+ /* beta = 0 means forget all memory and restart with steepest descents. */ -+ if (nstcg && ((step % nstcg) == 0)) -+ { -+ beta = 0.0; -+ } -+ else -+ { -+ /* s_min->fnorm cannot be zero, because then we would have converged -+ * and broken out. -+ */ -+ -+ /* Polak-Ribiere update. 
-+ * Change to fnorm2/fnorm2_old for Fletcher-Reeves -+ */ -+ beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); -+ } -+ /* Limit beta to prevent oscillations */ -+ if (fabs(beta) > 5.0) -+ { -+ beta = 0.0; -+ } -+ -+ -+ /* update positions */ -+ swap_em_state(s_min, s_b); -+ gpa = gpb; -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", -+ step, s_min->epot, s_min->fnorm/sqrt(state_global->natoms), -+ s_min->fmax, s_min->a_fmax+1); -+ } -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ do_log = do_per_step(step, inputrec->nstlog); -+ do_ene = do_per_step(step, inputrec->nstenergy); -+ -+ /* Prepare IMD energy record, if bIMD is TRUE. */ -+ IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE); -+ -+ if (do_log) -+ { -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ } -+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, -+ do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Send energies and positions to the IMD client if bIMD is TRUE. */ -+ if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr)) -+ { -+ IMD_send_positions(inputrec->imd); -+ } -+ -+ /* Stop when the maximum force lies below tolerance. -+ * If we have reached machine precision, converged is already set to true. -+ */ -+ converged = converged || (s_min->fmax < inputrec->em_tol); -+ -+ } /* End of the loop */ -+ -+ /* IMD cleanup, if bIMD is TRUE. */ -+ IMD_finalize(inputrec->bIMD, inputrec->imd); -+ -+ if (converged) -+ { -+ step--; /* we never took that last step in this case */ -+ -+ } -+ if (s_min->fmax > inputrec->em_tol) -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); -+ warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); -+ } -+ converged = FALSE; -+ } -+ -+ if (MASTER(cr)) -+ { -+ /* If we printed energy and/or logfile last step (which was the last step) -+ * we don't have to do it again, but otherwise print the final values. -+ */ -+ if (!do_log) -+ { -+ /* Write final value to log since we didn't do anything the last step */ -+ print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); -+ } -+ if (!do_ene || !do_log) -+ { -+ /* Write final energy file entries */ -+ print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, -+ !do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ } -+ -+ /* Print some stuff... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ -+ /* IMPORTANT! -+ * For accurate normal mode calculation it is imperative that we -+ * store the last conformation into the full precision binary trajectory. -+ * -+ * However, we should only do it if we did NOT already write this step -+ * above (which we did if do_x or do_f was true). 
-+ */ -+ do_x = !do_per_step(step, inputrec->nstxout); -+ do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); -+ -+ write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, step, -+ s_min, state_global, f_global); -+ -+ fnormn = s_min->fnorm/sqrt(state_global->natoms); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ -+ fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ walltime_accounting_set_nsteps_done(walltime_accounting, step); -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_lbfgs(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ static const char *LBFGS = "Low-Memory BFGS Minimizer"; -+ em_state_t ems; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ rvec *f_global; -+ int ncorr, nmaxcorr, point, cp, neval, nminstep; -+ double stepsize, gpa, gpb, gpc, tmp, minstep; -+ real *rho, *alpha, *ff, *xx, *p, *s, *lastx, *lastf, **dx, **dg; -+ real *xa, *xb, *xc, *fa, *fb, *fc, *xtmp, *ftmp; -+ real a, b, c, maxdelta, delta; -+ real diag, Epot0, Epot, EpotA, EpotB, EpotC; -+ real dgdx, dgdg, sq, yr, beta; -+ t_mdebin *mdebin; -+ gmx_bool converged, first; -+ rvec mu_tot; -+ real fnorm, fmax; -+ gmx_bool do_log, do_ene, do_x, do_f, foundlower, *frozen; -+ tensor vir, pres; -+ int start, end, number_steps; -+ gmx_mdoutf_t outf; -+ int i, k, m, n, nfmax, gf, step; -+ int mdof_flags; -+ /* not used */ -+ real terminate; -+ -+ if (PAR(cr)) -+ { -+ gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n"); -+ } -+ -+ if (NULL != constr) -+ { -+ gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent)."); -+ } -+ -+ n = 3*state->natoms; -+ nmaxcorr = inputrec->nbfgscorr; -+ -+ /* Allocate memory */ -+ /* Use pointers to real so we dont have to loop over both atoms and -+ * dimensions all the time... -+ * x/f are allocated as rvec *, so make new x0/f0 pointers-to-real -+ * that point to the same memory. 
-+ */ -+ snew(xa, n); -+ snew(xb, n); -+ snew(xc, n); -+ snew(fa, n); -+ snew(fb, n); -+ snew(fc, n); -+ snew(frozen, n); -+ -+ snew(p, n); -+ snew(lastx, n); -+ snew(lastf, n); -+ snew(rho, nmaxcorr); -+ snew(alpha, nmaxcorr); -+ -+ snew(dx, nmaxcorr); -+ for (i = 0; i < nmaxcorr; i++) -+ { -+ snew(dx[i], n); -+ } -+ -+ snew(dg, nmaxcorr); -+ for (i = 0; i < nmaxcorr; i++) -+ { -+ snew(dg[i], n); -+ } -+ -+ step = 0; -+ neval = 0; -+ -+ /* Init em */ -+ init_em(fplog, LBFGS, cr, inputrec, -+ state, top_global, &ems, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle); -+ /* Do_lbfgs is not completely updated like do_steep and do_cg, -+ * so we free some memory again. -+ */ -+ sfree(ems.s.x); -+ sfree(ems.f); -+ -+ xx = (real *)state->x; -+ ff = (real *)f; -+ -+ start = 0; -+ end = mdatoms->homenr; -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS); -+ -+ do_log = do_ene = do_x = do_f = TRUE; -+ -+ /* Max number of steps */ -+ number_steps = inputrec->nsteps; -+ -+ /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ -+ gf = 0; -+ for (i = start; i < end; i++) -+ { -+ if (mdatoms->cFREEZE) -+ { -+ gf = mdatoms->cFREEZE[i]; -+ } -+ for (m = 0; m < DIM; m++) -+ { -+ frozen[3*i+m] = inputrec->opts.nFreeze[gf][m]; -+ } -+ } -+ if (MASTER(cr)) -+ { -+ sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); -+ } -+ -+ if (vsite) -+ { -+ construct_vsites(vsite, state->x, 1, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ } -+ -+ /* Call the force routine and some auxiliary (neighboursearching etc.) */ -+ /* do_force always puts the charge groups in the box and shifts again -+ * We do not unshift, so molecules are always whole -+ */ -+ neval++; -+ ems.s.x = state->x; -+ ems.f = f; -+ evaluate_energy(fplog, cr, -+ top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ where(); -+ -+ if (MASTER(cr)) -+ { -+ /* Copy stuff to the energy bin for easy printing etc. */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ where(); -+ -+ /* This is the starting energy */ -+ Epot = enerd->term[F_EPOT]; -+ -+ fnorm = ems.fnorm; -+ fmax = ems.fmax; -+ nfmax = ems.a_fmax; -+ -+ /* Set the initial step. -+ * since it will be multiplied by the non-normalized search direction -+ * vector (force vector the first time), we scale it by the -+ * norm of the force. -+ */ -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); -+ fprintf(stderr, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); -+ fprintf(stderr, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); -+ fprintf(stderr, "\n"); -+ /* and copy to the log file too... 
*/ -+ fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr); -+ fprintf(fplog, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); -+ fprintf(fplog, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); -+ fprintf(fplog, "\n"); -+ } -+ -+ point = 0; -+ for (i = 0; i < n; i++) -+ { -+ if (!frozen[i]) -+ { -+ dx[point][i] = ff[i]; /* Initial search direction */ -+ } -+ else -+ { -+ dx[point][i] = 0; -+ } -+ } -+ -+ stepsize = 1.0/fnorm; -+ converged = FALSE; -+ -+ /* Start the loop over BFGS steps. -+ * Each successful step is counted, and we continue until -+ * we either converge or reach the max number of steps. -+ */ -+ -+ ncorr = 0; -+ -+ /* Set the gradient from the force */ -+ converged = FALSE; -+ for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) -+ { -+ -+ /* Write coordinates if necessary */ -+ do_x = do_per_step(step, inputrec->nstxout); -+ do_f = do_per_step(step, inputrec->nstfout); -+ -+ mdof_flags = 0; -+ if (do_x) -+ { -+ mdof_flags |= MDOF_X; -+ } -+ -+ if (do_f) -+ { -+ mdof_flags |= MDOF_F; -+ } -+ -+ if (inputrec->bIMD) -+ { -+ mdof_flags |= MDOF_IMD; -+ } -+ -+ mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, -+ top_global, step, (real)step, state, state, f, f); -+ -+ /* Do the linesearching in the direction dx[point][0..(n-1)] */ -+ -+ /* pointer to current direction - point=0 first time here */ -+ s = dx[point]; -+ -+ /* calculate line gradient */ -+ for (gpa = 0, i = 0; i < n; i++) -+ { -+ gpa -= s[i]*ff[i]; -+ } -+ -+ /* Calculate minimum allowed stepsize, before the average (norm) -+ * relative change in coordinate is smaller than precision -+ */ -+ for (minstep = 0, i = 0; i < n; i++) -+ { -+ tmp = fabs(xx[i]); -+ if (tmp < 1.0) -+ { -+ tmp = 1.0; -+ } -+ tmp = s[i]/tmp; -+ minstep += tmp*tmp; -+ } -+ minstep = GMX_REAL_EPS/sqrt(minstep/n); -+ -+ if (stepsize < minstep) -+ { -+ converged = TRUE; -+ break; -+ } -+ -+ /* Store old forces and coordinates */ -+ for (i = 0; i < n; i++) -+ { -+ lastx[i] = xx[i]; -+ lastf[i] = ff[i]; -+ } -+ Epot0 = Epot; -+ -+ first = TRUE; -+ -+ for (i = 0; i < n; i++) -+ { -+ xa[i] = xx[i]; -+ } -+ -+ /* Take a step downhill. -+ * In theory, we should minimize the function along this direction. -+ * That is quite possible, but it turns out to take 5-10 function evaluations -+ * for each line. However, we dont really need to find the exact minimum - -+ * it is much better to start a new BFGS step in a modified direction as soon -+ * as we are close to it. This will save a lot of energy evaluations. -+ * -+ * In practice, we just try to take a single step. -+ * If it worked (i.e. lowered the energy), we increase the stepsize but -+ * the continue straight to the next BFGS step without trying to find any minimum. -+ * If it didn't work (higher energy), there must be a minimum somewhere between -+ * the old position and the new one. -+ * -+ * Due to the finite numerical accuracy, it turns out that it is a good idea -+ * to even accept a SMALL increase in energy, if the derivative is still downhill. -+ * This leads to lower final energies in the tests I've done. / Erik -+ */ -+ foundlower = FALSE; -+ EpotA = Epot0; -+ a = 0.0; -+ c = a + stepsize; /* reference position along line is zero */ -+ -+ /* Check stepsize first. We do not allow displacements -+ * larger than emstep. 
-+ */ -+ do -+ { -+ c = a + stepsize; -+ maxdelta = 0; -+ for (i = 0; i < n; i++) -+ { -+ delta = c*s[i]; -+ if (delta > maxdelta) -+ { -+ maxdelta = delta; -+ } -+ } -+ if (maxdelta > inputrec->em_stepsize) -+ { -+ stepsize *= 0.1; -+ } -+ } -+ while (maxdelta > inputrec->em_stepsize); -+ -+ /* Take a trial step */ -+ for (i = 0; i < n; i++) -+ { -+ xc[i] = lastx[i] + c*s[i]; -+ } -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ ems.s.x = (rvec *)xc; -+ ems.f = (rvec *)fc; -+ evaluate_energy(fplog, cr, -+ top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, step, FALSE); -+ EpotC = ems.epot; -+ -+ /* Calc derivative along line */ -+ for (gpc = 0, i = 0; i < n; i++) -+ { -+ gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */ -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpc, cr); -+ } -+ -+ /* This is the max amount of increase in energy we tolerate */ -+ tmp = sqrt(GMX_REAL_EPS)*fabs(EpotA); -+ -+ /* Accept the step if the energy is lower, or if it is not significantly higher -+ * and the line derivative is still negative. -+ */ -+ if (EpotC < EpotA || (gpc < 0 && EpotC < (EpotA+tmp))) -+ { -+ foundlower = TRUE; -+ /* Great, we found a better energy. Increase step for next iteration -+ * if we are still going down, decrease it otherwise -+ */ -+ if (gpc < 0) -+ { -+ stepsize *= 1.618034; /* The golden section */ -+ } -+ else -+ { -+ stepsize *= 0.618034; /* 1/golden section */ -+ } -+ } -+ else -+ { -+ /* New energy is the same or higher. We will have to do some work -+ * to find a smaller value in the interval. Take smaller step next time! -+ */ -+ foundlower = FALSE; -+ stepsize *= 0.618034; -+ } -+ -+ /* OK, if we didn't find a lower value we will have to locate one now - there must -+ * be one in the interval [a=0,c]. -+ * The same thing is valid here, though: Don't spend dozens of iterations to find -+ * the line minimum. We try to interpolate based on the derivative at the endpoints, -+ * and only continue until we find a lower value. In most cases this means 1-2 iterations. -+ * -+ * I also have a safeguard for potentially really patological functions so we never -+ * take more than 20 steps before we give up ... -+ * -+ * If we already found a lower value we just skip this step and continue to the update. -+ */ -+ -+ if (!foundlower) -+ { -+ -+ nminstep = 0; -+ do -+ { -+ /* Select a new trial point. -+ * If the derivatives at points a & c have different sign we interpolate to zero, -+ * otherwise just do a bisection. 
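Both line searches in this patch (the conjugate-gradient one above and the L-BFGS one continuing below) pick the next trial point b inside the bracket [a, c] the same way: if the directional derivatives at the endpoints change sign, take a secant step towards the zero of the derivative, otherwise bisect, and clamp back into the interval if round-off pushes b outside. A compilable sketch of just that selection rule, with illustrative names:

/* gpa and gpc are d(Epot)/d(step) at the bracket endpoints a and c. */
double trial_point_sketch(double a, double c, double gpa, double gpc)
{
    double b;

    if (gpa < 0 && gpc > 0)
    {
        /* Derivatives bracket a zero: secant interpolation towards the root. */
        b = a + gpa * (a - c) / (gpc - gpa);
    }
    else
    {
        b = 0.5 * (a + c);  /* no sign change: plain bisection */
    }

    /* Safeguard against machine-precision round-off: never leave [a, c]. */
    if (b <= a || b >= c)
    {
        b = 0.5 * (a + c);
    }
    return b;
}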
-+ */ -+ -+ if (gpa < 0 && gpc > 0) -+ { -+ b = a + gpa*(a-c)/(gpc-gpa); -+ } -+ else -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* safeguard if interpolation close to machine accuracy causes errors: -+ * never go outside the interval -+ */ -+ if (b <= a || b >= c) -+ { -+ b = 0.5*(a+c); -+ } -+ -+ /* Take a trial step */ -+ for (i = 0; i < n; i++) -+ { -+ xb[i] = lastx[i] + b*s[i]; -+ } -+ -+ neval++; -+ /* Calculate energy for the trial step */ -+ ems.s.x = (rvec *)xb; -+ ems.f = (rvec *)fb; -+ evaluate_energy(fplog, cr, -+ top_global, &ems, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, step, FALSE); -+ EpotB = ems.epot; -+ -+ fnorm = ems.fnorm; -+ -+ for (gpb = 0, i = 0; i < n; i++) -+ { -+ gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ -+ -+ } -+ /* Sum the gradient along the line across CPUs */ -+ if (PAR(cr)) -+ { -+ gmx_sumd(1, &gpb, cr); -+ } -+ -+ /* Keep one of the intervals based on the value of the derivative at the new point */ -+ if (gpb > 0) -+ { -+ /* Replace c endpoint with b */ -+ EpotC = EpotB; -+ c = b; -+ gpc = gpb; -+ /* swap coord pointers b/c */ -+ xtmp = xb; -+ ftmp = fb; -+ xb = xc; -+ fb = fc; -+ xc = xtmp; -+ fc = ftmp; -+ } -+ else -+ { -+ /* Replace a endpoint with b */ -+ EpotA = EpotB; -+ a = b; -+ gpa = gpb; -+ /* swap coord pointers a/b */ -+ xtmp = xb; -+ ftmp = fb; -+ xb = xa; -+ fb = fa; -+ xa = xtmp; -+ fa = ftmp; -+ } -+ -+ /* -+ * Stop search as soon as we find a value smaller than the endpoints, -+ * or if the tolerance is below machine precision. -+ * Never run more than 20 steps, no matter what. -+ */ -+ nminstep++; -+ } -+ while ((EpotB > EpotA || EpotB > EpotC) && (nminstep < 20)); -+ -+ if (fabs(EpotB-Epot0) < GMX_REAL_EPS || nminstep >= 20) -+ { -+ /* OK. We couldn't find a significantly lower energy. -+ * If ncorr==0 this was steepest descent, and then we give up. -+ * If not, reset memory to restart as steepest descent before quitting. -+ */ -+ if (ncorr == 0) -+ { -+ /* Converged */ -+ converged = TRUE; -+ break; -+ } -+ else -+ { -+ /* Reset memory */ -+ ncorr = 0; -+ /* Search in gradient direction */ -+ for (i = 0; i < n; i++) -+ { -+ dx[point][i] = ff[i]; -+ } -+ /* Reset stepsize */ -+ stepsize = 1.0/fnorm; -+ continue; -+ } -+ } -+ -+ /* Select min energy state of A & C, put the best in xx/ff/Epot -+ */ -+ if (EpotC < EpotA) -+ { -+ Epot = EpotC; -+ /* Use state C */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xc[i]; -+ ff[i] = fc[i]; -+ } -+ stepsize = c; -+ } -+ else -+ { -+ Epot = EpotA; -+ /* Use state A */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xa[i]; -+ ff[i] = fa[i]; -+ } -+ stepsize = a; -+ } -+ -+ } -+ else -+ { -+ /* found lower */ -+ Epot = EpotC; -+ /* Use state C */ -+ for (i = 0; i < n; i++) -+ { -+ xx[i] = xc[i]; -+ ff[i] = fc[i]; -+ } -+ stepsize = c; -+ } -+ -+ /* Update the memory information, and calculate a new -+ * approximation of the inverse hessian -+ */ -+ -+ /* Have new data in Epot, xx, ff */ -+ if (ncorr < nmaxcorr) -+ { -+ ncorr++; -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ dg[point][i] = lastf[i]-ff[i]; -+ dx[point][i] *= stepsize; -+ } -+ -+ dgdg = 0; -+ dgdx = 0; -+ for (i = 0; i < n; i++) -+ { -+ dgdg += dg[point][i]*dg[point][i]; -+ dgdx += dg[point][i]*dx[point][i]; -+ } -+ -+ diag = dgdx/dgdg; -+ -+ rho[point] = 1.0/dgdx; -+ point++; -+ -+ if (point >= nmaxcorr) -+ { -+ point = 0; -+ } -+ -+ /* Update */ -+ for (i = 0; i < n; i++) -+ { -+ p[i] = ff[i]; -+ } -+ -+ cp = point; -+ -+ /* Recursive update. 
First go back over the memory points */ -+ for (k = 0; k < ncorr; k++) -+ { -+ cp--; -+ if (cp < 0) -+ { -+ cp = ncorr-1; -+ } -+ -+ sq = 0; -+ for (i = 0; i < n; i++) -+ { -+ sq += dx[cp][i]*p[i]; -+ } -+ -+ alpha[cp] = rho[cp]*sq; -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] -= alpha[cp]*dg[cp][i]; -+ } -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] *= diag; -+ } -+ -+ /* And then go forward again */ -+ for (k = 0; k < ncorr; k++) -+ { -+ yr = 0; -+ for (i = 0; i < n; i++) -+ { -+ yr += p[i]*dg[cp][i]; -+ } -+ -+ beta = rho[cp]*yr; -+ beta = alpha[cp]-beta; -+ -+ for (i = 0; i < n; i++) -+ { -+ p[i] += beta*dx[cp][i]; -+ } -+ -+ cp++; -+ if (cp >= ncorr) -+ { -+ cp = 0; -+ } -+ } -+ -+ for (i = 0; i < n; i++) -+ { -+ if (!frozen[i]) -+ { -+ dx[point][i] = p[i]; -+ } -+ else -+ { -+ dx[point][i] = 0; -+ } -+ } -+ -+ stepsize = 1.0; -+ -+ /* Test whether the convergence criterion is met */ -+ get_f_norm_max(cr, &(inputrec->opts), mdatoms, f, &fnorm, &fmax, &nfmax); -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", -+ step, Epot, fnorm/sqrt(state->natoms), fmax, nfmax+1); -+ } -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)step, -+ mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, -+ NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ do_log = do_per_step(step, inputrec->nstlog); -+ do_ene = do_per_step(step, inputrec->nstenergy); -+ if (do_log) -+ { -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ } -+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, -+ do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Send x and E to IMD client, if bIMD is TRUE. */ -+ if (do_IMD(inputrec->bIMD, step, cr, TRUE, state->box, state->x, inputrec, 0, wcycle) && MASTER(cr)) -+ { -+ IMD_send_positions(inputrec->imd); -+ } -+ -+ /* Stop when the maximum force lies below tolerance. -+ * If we have reached machine precision, converged is already set to true. -+ */ -+ -+ converged = converged || (fmax < inputrec->em_tol); -+ -+ } /* End of the loop */ -+ -+ /* IMD cleanup, if bIMD is TRUE. */ -+ IMD_finalize(inputrec->bIMD, inputrec->imd); -+ -+ if (converged) -+ { -+ step--; /* we never took that last step in this case */ -+ -+ } -+ if (fmax > inputrec->em_tol) -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); -+ warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); -+ } -+ converged = FALSE; -+ } -+ -+ /* If we printed energy and/or logfile last step (which was the last step) -+ * we don't have to do it again, but otherwise print the final values. -+ */ -+ if (!do_log) /* Write final value to log since we didn't do anythin last step */ -+ { -+ print_ebin_header(fplog, step, step, state->lambda[efptFEP]); -+ } -+ if (!do_ene || !do_log) /* Write final energy file entries */ -+ { -+ print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, -+ !do_log ? fplog : NULL, step, step, eprNORMAL, -+ TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ } -+ -+ /* Print some stuff... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ -+ /* IMPORTANT! -+ * For accurate normal mode calculation it is imperative that we -+ * store the last conformation into the full precision binary trajectory. 
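The backward and forward passes above are the standard L-BFGS two-loop recursion over the ncorr stored displacement/gradient-difference pairs, with dgdx/dgdg playing the role of the initial diagonal inverse-Hessian guess. A compact generic version in flat double arrays follows, for readers who want the algorithm without the ring-buffer bookkeeping; all names here are illustrative rather than taken from the patched source.

#include <stddef.h>

static double dot(const double *x, const double *y, size_t n)
{
    double s = 0.0;
    for (size_t i = 0; i < n; i++)
    {
        s += x[i] * y[i];
    }
    return s;
}

/* g: current gradient; s[k], y[k]: the m stored position and gradient
 * differences, oldest first; alpha: scratch of length m; p: output, an
 * approximation of H^{-1} g (the descent direction is then -p). */
void lbfgs_two_loop_sketch(const double *g, double **s, double **y,
                           size_t m, size_t n, double *alpha, double *p)
{
    for (size_t i = 0; i < n; i++)
    {
        p[i] = g[i];
    }

    for (size_t k = m; k-- > 0; )  /* backward pass, newest pair first */
    {
        double rho = 1.0 / dot(y[k], s[k], n);
        alpha[k] = rho * dot(s[k], p, n);
        for (size_t i = 0; i < n; i++)
        {
            p[i] -= alpha[k] * y[k][i];
        }
    }

    if (m > 0)  /* scale by the initial inverse-Hessian estimate ("diag" above) */
    {
        double gamma = dot(s[m-1], y[m-1], n) / dot(y[m-1], y[m-1], n);
        for (size_t i = 0; i < n; i++)
        {
            p[i] *= gamma;
        }
    }

    for (size_t k = 0; k < m; k++)  /* forward pass, oldest pair first */
    {
        double rho  = 1.0 / dot(y[k], s[k], n);
        double beta = rho * dot(y[k], p, n);
        for (size_t i = 0; i < n; i++)
        {
            p[i] += (alpha[k] - beta) * s[k][i];
        }
    }
}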
-+ * -+ * However, we should only do it if we did NOT already write this step -+ * above (which we did if do_x or do_f was true). -+ */ -+ do_x = !do_per_step(step, inputrec->nstxout); -+ do_f = !do_per_step(step, inputrec->nstfout); -+ write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, step, -+ &ems, state, f); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, LBFGS, inputrec->em_tol, step, converged, -+ number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); -+ print_converged(fplog, LBFGS, inputrec->em_tol, step, converged, -+ number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); -+ -+ fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ walltime_accounting_set_nsteps_done(walltime_accounting, step); -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_steep(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ const char *SD = "Steepest Descents"; -+ em_state_t *s_min, *s_try; -+ rvec *f_global; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ real stepsize, constepsize; -+ real ustep, fnormn; -+ gmx_mdoutf_t outf; -+ t_mdebin *mdebin; -+ gmx_bool bDone, bAbort, do_x, do_f; -+ tensor vir, pres; -+ rvec mu_tot; -+ int nsteps; -+ int count = 0; -+ int steps_accepted = 0; -+ /* not used */ -+ real terminate = 0; -+ -+ s_min = init_em_state(); -+ s_try = init_em_state(); -+ -+ /* Init em and store the local state in s_try */ -+ init_em(fplog, SD, cr, inputrec, -+ state_global, top_global, s_try, &top, &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, &mdebin, imdport, Flags, wcycle); -+ -+ /* Print to log file */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, SD); -+ -+ /* Set variables for stepsize (in nm). This is the largest -+ * step that we are going to make in any direction. 
-+ */ -+ ustep = inputrec->em_stepsize; -+ stepsize = 0; -+ -+ /* Max number of steps */ -+ nsteps = inputrec->nsteps; -+ -+ if (MASTER(cr)) -+ { -+ /* Print to the screen */ -+ sp_header(stderr, SD, inputrec->em_tol, nsteps); -+ } -+ if (fplog) -+ { -+ sp_header(fplog, SD, inputrec->em_tol, nsteps); -+ } -+ -+ /**** HERE STARTS THE LOOP **** -+ * count is the counter for the number of steps -+ * bDone will be TRUE when the minimization has converged -+ * bAbort will be TRUE when nsteps steps have been performed or when -+ * the stepsize becomes smaller than is reasonable for machine precision -+ */ -+ count = 0; -+ bDone = FALSE; -+ bAbort = FALSE; -+ while (!bDone && !bAbort) -+ { -+ bAbort = (nsteps >= 0) && (count == nsteps); -+ -+ /* set new coordinates, except for first step */ -+ if (count > 0) -+ { -+ do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, -+ s_min, stepsize, s_min->f, s_try, -+ constr, top, nrnb, wcycle, count); -+ } -+ -+ evaluate_energy(fplog, cr, -+ top_global, s_try, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, count, count == 0); -+ -+ if (MASTER(cr)) -+ { -+ print_ebin_header(fplog, count, count, s_try->s.lambda[efptFEP]); -+ } -+ -+ if (count == 0) -+ { -+ s_min->epot = s_try->epot + 1; -+ } -+ -+ /* Print it if necessary */ -+ if (MASTER(cr)) -+ { -+ if (bVerbose) -+ { -+ fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", -+ count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, -+ (s_try->epot < s_min->epot) ? '\n' : '\r'); -+ } -+ -+ if (s_try->epot < s_min->epot) -+ { -+ /* Store the new (lower) energies */ -+ upd_mdebin(mdebin, FALSE, FALSE, (double)count, -+ mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals, -+ s_try->s.box, NULL, NULL, vir, pres, NULL, mu_tot, constr); -+ -+ /* Prepare IMD energy record, if bIMD is TRUE. */ -+ IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE); -+ -+ print_ebin(mdoutf_get_fp_ene(outf), TRUE, -+ do_per_step(steps_accepted, inputrec->nstdisreout), -+ do_per_step(steps_accepted, inputrec->nstorireout), -+ fplog, count, count, eprNORMAL, TRUE, -+ mdebin, fcd, &(top_global->groups), &(inputrec->opts)); -+ fflush(fplog); -+ } -+ } -+ -+ /* Now if the new energy is smaller than the previous... -+ * or if this is the first step! -+ * or if we did random steps! -+ */ -+ -+ if ( (count == 0) || (s_try->epot < s_min->epot) ) -+ { -+ steps_accepted++; -+ -+ /* Test whether the convergence criterion is met... */ -+ bDone = (s_try->fmax < inputrec->em_tol); -+ -+ /* Copy the arrays for force, positions and energy */ -+ /* The 'Min' array always holds the coords and forces of the minimal -+ sampled energy */ -+ swap_em_state(s_min, s_try); -+ if (count > 0) -+ { -+ ustep *= 1.2; -+ } -+ -+ /* Write to trn, if necessary */ -+ do_x = do_per_step(steps_accepted, inputrec->nstxout); -+ do_f = do_per_step(steps_accepted, inputrec->nstfout); -+ write_em_traj(fplog, cr, outf, do_x, do_f, NULL, -+ top_global, inputrec, count, -+ s_min, state_global, f_global); -+ } -+ else -+ { -+ /* If energy is not smaller make the step smaller... 
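The steepest-descent driver above boils down to one control rule on the maximum displacement ustep: after an accepted step (lower energy) enlarge it by a factor 1.2, after a rejected step halve it (the branch continuing just below), and give up once ustep falls under a precision-dependent floor (1e-6 nm in single, 1e-12 nm in double) or the force tolerance is met. A minimal sketch of that accept/reject loop against a caller-supplied trial-energy callback; the callback type and all names are invented for illustration.

/* Returns the energy of a trial configuration displaced by at most ustep (nm). */
typedef double (*trial_energy_fn)(double ustep, void *ctx);

int steep_stepsize_sketch(trial_energy_fn ene, void *ctx,
                          double ustep, int nsteps, double e_start)
{
    double e_min    = e_start;
    int    accepted = 0;

    for (int count = 0; count < nsteps && ustep > 1e-6; count++)
    {
        double e_try = ene(ustep, ctx);
        if (e_try < e_min)
        {
            e_min  = e_try;
            ustep *= 1.2;  /* accepted: take slightly bolder steps */
            accepted++;
        }
        else
        {
            ustep *= 0.5;  /* rejected: retreat and shorten the step */
        }
    }
    return accepted;       /* number of energy-lowering steps taken */
}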
*/ -+ ustep *= 0.5; -+ -+ if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) -+ { -+ /* Reload the old state */ -+ em_dd_partition_system(fplog, count, cr, top_global, inputrec, -+ s_min, top, mdatoms, fr, vsite, constr, -+ nrnb, wcycle); -+ } -+ } -+ -+ /* Determine new step */ -+ stepsize = ustep/s_min->fmax; -+ -+ /* Check if stepsize is too small, with 1 nm as a characteristic length */ -+#ifdef GMX_DOUBLE -+ if (count == nsteps || ustep < 1e-12) -+#else -+ if (count == nsteps || ustep < 1e-6) -+#endif -+ { -+ if (MASTER(cr)) -+ { -+ warn_step(stderr, inputrec->em_tol, count == nsteps, constr != NULL); -+ warn_step(fplog, inputrec->em_tol, count == nsteps, constr != NULL); -+ } -+ bAbort = TRUE; -+ } -+ -+ /* Send IMD energies and positions, if bIMD is TRUE. */ -+ if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, state_global->x, inputrec, 0, wcycle) && MASTER(cr)) -+ { -+ IMD_send_positions(inputrec->imd); -+ } -+ -+ count++; -+ } /* End of the loop */ -+ -+ /* IMD cleanup, if bIMD is TRUE. */ -+ IMD_finalize(inputrec->bIMD, inputrec->imd); -+ -+ /* Print some data... */ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\nwriting lowest energy coordinates.\n"); -+ } -+ write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm), -+ top_global, inputrec, count, -+ s_min, state_global, f_global); -+ -+ fnormn = s_min->fnorm/sqrt(state_global->natoms); -+ -+ if (MASTER(cr)) -+ { -+ print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, -+ s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ /* To print the actual number of steps we needed somewhere */ -+ inputrec->nsteps = count; -+ -+ walltime_accounting_set_nsteps_done(walltime_accounting, count); -+ -+ return 0; -+} /* That's all folks */ -+ -+ -+double do_nm(FILE *fplog, t_commrec *cr, -+ int nfile, const t_filenm fnm[], -+ const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, -+ int gmx_unused nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int gmx_unused stepout, -+ t_inputrec *inputrec, -+ gmx_mtop_t *top_global, t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t gmx_unused ed, -+ t_forcerec *fr, -+ int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, -+ gmx_membed_t gmx_unused membed, -+ real gmx_unused cpt_period, real gmx_unused max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long gmx_unused Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ const char *NM = "Normal Mode Analysis"; -+ gmx_mdoutf_t outf; -+ int natoms, atom, d; -+ int nnodes, node; -+ rvec *f_global; -+ gmx_localtop_t *top; -+ gmx_enerdata_t *enerd; -+ rvec *f; -+ gmx_global_stat_t gstat; -+ t_graph *graph; -+ real t, t0, lambda, lam0; -+ gmx_bool bNS; -+ tensor vir, pres; -+ rvec mu_tot; -+ rvec *fneg, *dfdx; -+ gmx_bool bSparse; /* use sparse matrix storage format */ -+ size_t sz = 0; -+ gmx_sparsematrix_t * sparse_matrix = NULL; -+ real * full_matrix = NULL; -+ em_state_t * state_work; -+ -+ /* added with respect to mdrun */ -+ int i, j, k, row, col; -+ real der_range = 10.0*sqrt(GMX_REAL_EPS); -+ real x_min; -+ real fnorm, fmax; -+ -+ if (constr != NULL) -+ { -+ gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this 
combination is not supported"); -+ } -+ -+ state_work = init_em_state(); -+ -+ /* Init em and store the local state in state_minimum */ -+ init_em(fplog, NM, cr, inputrec, -+ state_global, top_global, state_work, &top, -+ &f, &f_global, -+ nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, -+ nfile, fnm, &outf, NULL, imdport, Flags, wcycle); -+ -+ natoms = top_global->natoms; -+ snew(fneg, natoms); -+ snew(dfdx, natoms); -+ -+#ifndef GMX_DOUBLE -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, -+ "NOTE: This version of Gromacs has been compiled in single precision,\n" -+ " which MIGHT not be accurate enough for normal mode analysis.\n" -+ " Gromacs now uses sparse matrix storage, so the memory requirements\n" -+ " are fairly modest even if you recompile in double precision.\n\n"); -+ } -+#endif -+ -+ /* Check if we can/should use sparse storage format. -+ * -+ * Sparse format is only useful when the Hessian itself is sparse, which it -+ * will be when we use a cutoff. -+ * For small systems (n<1000) it is easier to always use full matrix format, though. -+ */ -+ if (EEL_FULL(fr->eeltype) || fr->rlist == 0.0) -+ { -+ md_print_info(cr, fplog, "Non-cutoff electrostatics used, forcing full Hessian format.\n"); -+ bSparse = FALSE; -+ } -+ else if (top_global->natoms < 1000) -+ { -+ md_print_info(cr, fplog, "Small system size (N=%d), using full Hessian format.\n", top_global->natoms); -+ bSparse = FALSE; -+ } -+ else -+ { -+ md_print_info(cr, fplog, "Using compressed symmetric sparse Hessian format.\n"); -+ bSparse = TRUE; -+ } -+ -+ if (MASTER(cr)) -+ { -+ sz = DIM*top_global->natoms; -+ -+ fprintf(stderr, "Allocating Hessian memory...\n\n"); -+ -+ if (bSparse) -+ { -+ sparse_matrix = gmx_sparsematrix_init(sz); -+ sparse_matrix->compressed_symmetric = TRUE; -+ } -+ else -+ { -+ snew(full_matrix, sz*sz); -+ } -+ } -+ -+ /* Initial values */ -+ t0 = inputrec->init_t; -+ lam0 = inputrec->fepvals->init_lambda; -+ t = t0; -+ lambda = lam0; -+ -+ init_nrnb(nrnb); -+ -+ where(); -+ -+ /* Write start time and temperature */ -+ print_em_start(fplog, cr, walltime_accounting, wcycle, NM); -+ -+ /* fudge nr of steps to nr of atoms */ -+ inputrec->nsteps = natoms*2; -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n", -+ *(top_global->name), (int)inputrec->nsteps); -+ } -+ -+ nnodes = cr->nnodes; -+ -+ /* Make evaluate_energy do a single node force calculation */ -+ cr->nnodes = 1; -+ evaluate_energy(fplog, cr, -+ top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, -1, TRUE); -+ cr->nnodes = nnodes; -+ -+ /* if forces are not small, warn user */ -+ get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, state_work); -+ -+ md_print_info(cr, fplog, "Maximum force:%12.5e\n", state_work->fmax); -+ if (state_work->fmax > 1.0e-3) -+ { -+ md_print_info(cr, fplog, -+ "The force is probably not small enough to " -+ "ensure that you are at a minimum.\n" -+ "Be aware that negative eigenvalues may occur\n" -+ "when the resulting matrix is diagonalized.\n\n"); -+ } -+ -+ /*********************************************************** -+ * -+ * Loop over all pairs in matrix -+ * -+ * do_force called twice. 
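The normal-mode loop that follows builds one Hessian row per displaced degree of freedom from a central finite difference of the forces: shift that coordinate by -der_range and +der_range, call do_force both times, and divide the force difference by 2*der_range, with a sign flip because forces are negative gradients. A small sketch of how one row of a dense 3N x 3N Hessian is filled from the two force arrays; names and the row-major layout are illustrative.

#include <stddef.h>

/* fneg/fpos: full force arrays (length n3 = 3*natoms) evaluated with the
 * displaced coordinate at x - h and x + h; H is a dense row-major matrix. */
void hessian_row_sketch(double *H, size_t n3, size_t row,
                        const double *fneg, const double *fpos, double h)
{
    for (size_t col = 0; col < n3; col++)
    {
        /* f = -dE/dx, so -(f(+h) - f(-h)) / (2h) approximates d2E/dx_row dx_col */
        H[row * n3 + col] = -(fpos[col] - fneg[col]) / (2.0 * h);
    }
}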
Once with positive and -+ * once with negative displacement -+ * -+ ************************************************************/ -+ -+ /* Steps are divided one by one over the nodes */ -+ for (atom = cr->nodeid; atom < natoms; atom += nnodes) -+ { -+ -+ for (d = 0; d < DIM; d++) -+ { -+ x_min = state_work->s.x[atom][d]; -+ -+ state_work->s.x[atom][d] = x_min - der_range; -+ -+ /* Make evaluate_energy do a single node force calculation */ -+ cr->nnodes = 1; -+ evaluate_energy(fplog, cr, -+ top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, atom*2, FALSE); -+ -+ for (i = 0; i < natoms; i++) -+ { -+ copy_rvec(state_work->f[i], fneg[i]); -+ } -+ -+ state_work->s.x[atom][d] = x_min + der_range; -+ -+ evaluate_energy(fplog, cr, -+ top_global, state_work, top, -+ inputrec, nrnb, wcycle, gstat, -+ vsite, constr, fcd, graph, mdatoms, fr, -+ mu_tot, enerd, vir, pres, atom*2+1, FALSE); -+ cr->nnodes = nnodes; -+ -+ /* x is restored to original */ -+ state_work->s.x[atom][d] = x_min; -+ -+ for (j = 0; j < natoms; j++) -+ { -+ for (k = 0; (k < DIM); k++) -+ { -+ dfdx[j][k] = -+ -(state_work->f[j][k] - fneg[j][k])/(2*der_range); -+ } -+ } -+ -+ if (!MASTER(cr)) -+ { -+#ifdef GMX_MPI -+#ifdef GMX_DOUBLE -+#define mpi_type MPI_DOUBLE -+#else -+#define mpi_type MPI_FLOAT -+#endif -+ MPI_Send(dfdx[0], natoms*DIM, mpi_type, MASTERNODE(cr), cr->nodeid, -+ cr->mpi_comm_mygroup); -+#endif -+ } -+ else -+ { -+ for (node = 0; (node < nnodes && atom+node < natoms); node++) -+ { -+ if (node > 0) -+ { -+#ifdef GMX_MPI -+ MPI_Status stat; -+ MPI_Recv(dfdx[0], natoms*DIM, mpi_type, node, node, -+ cr->mpi_comm_mygroup, &stat); -+#undef mpi_type -+#endif -+ } -+ -+ row = (atom + node)*DIM + d; -+ -+ for (j = 0; j < natoms; j++) -+ { -+ for (k = 0; k < DIM; k++) -+ { -+ col = j*DIM + k; -+ -+ if (bSparse) -+ { -+ if (col >= row && dfdx[j][k] != 0.0) -+ { -+ gmx_sparsematrix_increment_value(sparse_matrix, -+ row, col, dfdx[j][k]); -+ } -+ } -+ else -+ { -+ full_matrix[row*sz+col] = dfdx[j][k]; -+ } -+ } -+ } -+ } -+ } -+ -+ if (bVerbose && fplog) -+ { -+ fflush(fplog); -+ } -+ } -+ /* write progress */ -+ if (MASTER(cr) && bVerbose) -+ { -+ fprintf(stderr, "\rFinished step %d out of %d", -+ min(atom+nnodes, natoms), natoms); -+ fflush(stderr); -+ } -+ } -+ -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, "\n\nWriting Hessian...\n"); -+ gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); -+ } -+ -+ finish_em(cr, outf, walltime_accounting, wcycle); -+ -+ walltime_accounting_set_nsteps_done(walltime_accounting, natoms*2); -+ -+ return 0; -+} -diff --git a/src/programs/mdrun/md.c b/src/programs/mdrun/md.c -index 3d98d59..b34d23c 100644 ---- a/src/programs/mdrun/md.c -+++ b/src/programs/mdrun/md.c -@@ -96,6 +96,12 @@ - #include "gromacs/swap/swapcoords.h" - #include "gromacs/imd/imd.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+/* END PLUMED */ -+ - #ifdef GMX_FAHCORE - #include "corewrap.h" - #endif -@@ -224,6 +230,12 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - /* Interactive MD */ - gmx_bool bIMDstep = FALSE; - -+ /* PLUMED */ -+ int plumedNeedsEnergy=0; -+ int plumedWantsToStop=0; -+ matrix plumed_vir; -+ /* END PLUMED */ -+ - #ifdef GMX_FAHCORE - /* Temporary addition for FAHCORE checkpointing */ - int chkpt_ret; -@@ -651,6 +663,48 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - 
fprintf(fplog, "\n"); - } - -+ /* PLUMED */ -+ if(plumedswitch){ -+ /* detect plumed API version */ -+ int pversion=0; -+ plumed_cmd(plumedmain,"getApiVersion",&pversion); -+ /* setting kbT is only implemented with api>1) */ -+ real kbT=ir->opts.ref_t[0]*BOLTZ; -+ if(pversion>1) plumed_cmd(plumedmain,"setKbT",&kbT); -+ -+ if(cr->ms && cr->ms->nsim>1) { -+ if(MASTER(cr)) plumed_cmd(plumedmain,"GREX setMPIIntercomm",&cr->ms->mpi_comm_masters); -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->dd->mpi_comm_all); -+ }else{ -+ plumed_cmd(plumedmain,"GREX setMPIIntracomm",&cr->mpi_comm_mysim); -+ } -+ } -+ plumed_cmd(plumedmain,"GREX init",NULL); -+ } -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"setMPIComm",&cr->dd->mpi_comm_all); -+ } -+ } -+ plumed_cmd(plumedmain,"setNatoms",&top_global->natoms); -+ plumed_cmd(plumedmain,"setMDEngine","gromacs"); -+ plumed_cmd(plumedmain,"setLog",fplog); -+ real real_delta_t; -+ real_delta_t=ir->delta_t; -+ plumed_cmd(plumedmain,"setTimestep",&real_delta_t); -+ plumed_cmd(plumedmain,"init",NULL); -+ -+ if(PAR(cr)){ -+ if(DOMAINDECOMP(cr)) { -+ plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ } -+ } -+ /* END PLUMED */ -+ - walltime_accounting_start(walltime_accounting); - wallcycle_start(wcycle, ewcRUN); - print_start(fplog, cr, walltime_accounting, "mdrun"); -@@ -955,6 +1009,13 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - do_verbose && !bPMETuneRunning); - wallcycle_stop(wcycle, ewcDOMDEC); - /* If using an iterative integrator, reallocate space to match the decomposition */ -+ -+ /* PLUMED */ -+ if(plumedswitch){ -+ plumed_cmd(plumedmain,"setAtomsNlocal",&cr->dd->nat_home); -+ plumed_cmd(plumedmain,"setAtomsGatindex",cr->dd->gatindex); -+ } -+ /* END PLUMED */ - } - } - -@@ -1078,12 +1139,45 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], - * This is parallellized as well, and does communication too. - * Check comments in sim_util.c - */ -+ -+ /* PLUMED */ -+ plumedNeedsEnergy=0; -+ if(plumedswitch){ -+ long int lstep=step; plumed_cmd(plumedmain,"setStepLong",&lstep); -+ plumed_cmd(plumedmain,"setPositions",&state->x[0][0]); -+ plumed_cmd(plumedmain,"setMasses",&mdatoms->massT[0]); -+ plumed_cmd(plumedmain,"setCharges",&mdatoms->chargeA[0]); -+ plumed_cmd(plumedmain,"setBox",&state->box[0][0]); -+ plumed_cmd(plumedmain,"prepareCalc",NULL); -+ plumed_cmd(plumedmain,"setStopFlag",&plumedWantsToStop); -+ plumed_cmd(plumedmain,"setForces",&f[0][0]); -+ plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy); -+ clear_mat(plumed_vir); -+ plumed_cmd(plumedmain,"setVirial",&plumed_vir[0][0]); -+ } -+ /* END PLUMED */ - do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, - state->box, state->x, &state->hist, - f, force_vir, mdatoms, enerd, fcd, - state->lambda, graph, - fr, vsite, mu_tot, t, mdoutf_get_fp_field(outf), ed, bBornRadii, - (bNS ? 
GMX_FORCE_NS : 0) | force_flags); -+ /* PLUMED */ -+ if(plumedswitch){ -+ if(plumedNeedsEnergy){ -+ msmul(force_vir,2.0,plumed_vir); -+ plumed_cmd(plumedmain,"setEnergy",&enerd->term[F_EPOT]); -+ plumed_cmd(plumedmain,"performCalc",NULL); -+ msmul(plumed_vir,0.5,force_vir); -+ } else { -+ msmul(plumed_vir,0.5,plumed_vir); -+ m_add(force_vir,plumed_vir,force_vir); -+ } -+ if ((repl_ex_nst > 0) && (step > 0) && !bLastStep && -+ do_per_step(step,repl_ex_nst)) plumed_cmd(plumedmain,"GREX savePositions",NULL); -+ if(plumedWantsToStop) ir->nsteps=step_rel+1; -+ } -+ /* END PLUMED */ - } - - if (bVV && !bStartingFromCpt && !bRerunMD) -diff --git a/src/programs/mdrun/md.c.preplumed b/src/programs/mdrun/md.c.preplumed -new file mode 100644 -index 0000000..3d98d59 ---- /dev/null -+++ b/src/programs/mdrun/md.c.preplumed -@@ -0,0 +1,2058 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. -+ * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. 
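For readers unfamiliar with the hooks spliced into do_md above: PLUMED is driven entirely through plumed_cmd(handle, key, pointer) calls, first at setup time (atom count, engine name, timestep, init) and then once per step to hand over positions, masses, charges, box, force and virial buffers before requesting the calculation. Below is a stripped-down, serial sketch of that call pattern against the public Plumed.h C interface; the buffer names are illustrative, and the real patch splits the per-step part into prepareCalc/performCalc and adds the MPI and replica-exchange (GREX) plumbing shown above.

#include <stddef.h>
#include "Plumed.h"  /* public PLUMED C interface: plumed_create/plumed_cmd/plumed_finalize */

void plumed_pattern_sketch(int natoms, double timestep, int nsteps,
                           double *x, double *masses, double *charges,
                           double *box, double *forces, double *virial)
{
    plumed p = plumed_create();

    /* One-time setup, mirroring the initialisation hunk above */
    plumed_cmd(p, "setNatoms", &natoms);
    plumed_cmd(p, "setMDEngine", "sketch");
    plumed_cmd(p, "setTimestep", &timestep);
    plumed_cmd(p, "init", NULL);

    for (int step = 0; step < nsteps; step++)
    {
        /* Per-step handoff: PLUMED adds its bias forces into forces
         * and its contribution into virial. */
        plumed_cmd(p, "setStep", &step);
        plumed_cmd(p, "setPositions", x);
        plumed_cmd(p, "setMasses", masses);
        plumed_cmd(p, "setCharges", charges);
        plumed_cmd(p, "setBox", box);
        plumed_cmd(p, "setForces", forces);
        plumed_cmd(p, "setVirial", virial);
        plumed_cmd(p, "calc", NULL);
    }

    plumed_finalize(p);
}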
-+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include "typedefs.h" -+#include "gromacs/utility/smalloc.h" -+#include "sysstuff.h" -+#include "vec.h" -+#include "vcm.h" -+#include "mdebin.h" -+#include "nrnb.h" -+#include "calcmu.h" -+#include "index.h" -+#include "vsite.h" -+#include "update.h" -+#include "ns.h" -+#include "mdrun.h" -+#include "md_support.h" -+#include "md_logging.h" -+#include "network.h" -+#include "xvgr.h" -+#include "physics.h" -+#include "names.h" -+#include "force.h" -+#include "disre.h" -+#include "orires.h" -+#include "pme.h" -+#include "mdatoms.h" -+#include "repl_ex.h" -+#include "deform.h" -+#include "qmmm.h" -+#include "domdec.h" -+#include "domdec_network.h" -+#include "gromacs/gmxlib/topsort.h" -+#include "coulomb.h" -+#include "constr.h" -+#include "shellfc.h" -+#include "gromacs/gmxpreprocess/compute_io.h" -+#include "checkpoint.h" -+#include "mtop_util.h" -+#include "sighandler.h" -+#include "txtdump.h" -+#include "gromacs/utility/cstringutil.h" -+#include "pme_loadbal.h" -+#include "bondf.h" -+#include "membed.h" -+#include "types/nlistheuristics.h" -+#include "types/iteratedconstraints.h" -+#include "nbnxn_cuda_data_mgmt.h" -+ -+#include "gromacs/utility/gmxmpi.h" -+#include "gromacs/fileio/confio.h" -+#include "gromacs/fileio/trajectory_writing.h" -+#include "gromacs/fileio/trnio.h" -+#include "gromacs/fileio/trxio.h" -+#include "gromacs/fileio/xtcio.h" -+#include "gromacs/timing/wallcycle.h" -+#include "gromacs/timing/walltime_accounting.h" -+#include "gromacs/pulling/pull.h" -+#include "gromacs/swap/swapcoords.h" -+#include "gromacs/imd/imd.h" -+ -+#ifdef GMX_FAHCORE -+#include "corewrap.h" -+#endif -+ -+static void reset_all_counters(FILE *fplog, t_commrec *cr, -+ gmx_int64_t step, -+ gmx_int64_t *step_rel, t_inputrec *ir, -+ gmx_wallcycle_t wcycle, t_nrnb *nrnb, -+ gmx_walltime_accounting_t walltime_accounting, -+ nbnxn_cuda_ptr_t cu_nbv) -+{ -+ char sbuf[STEPSTRSIZE]; -+ -+ /* Reset all the counters related to performance over the run */ -+ md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n", -+ gmx_step_str(step, sbuf)); -+ -+ if (cu_nbv) -+ { -+ nbnxn_cuda_reset_timings(cu_nbv); -+ } -+ -+ wallcycle_stop(wcycle, ewcRUN); -+ wallcycle_reset_all(wcycle); -+ if (DOMAINDECOMP(cr)) -+ { -+ reset_dd_statistics_counters(cr->dd); -+ } -+ init_nrnb(nrnb); -+ ir->init_step += *step_rel; -+ ir->nsteps -= *step_rel; -+ *step_rel = 0; -+ wallcycle_start(wcycle, ewcRUN); -+ walltime_accounting_start(walltime_accounting); -+ print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime()); -+} -+ -+double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], -+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, -+ int nstglobalcomm, -+ gmx_vsite_t *vsite, gmx_constr_t constr, -+ int stepout, t_inputrec *ir, -+ gmx_mtop_t *top_global, -+ t_fcdata *fcd, -+ t_state *state_global, -+ t_mdatoms *mdatoms, -+ t_nrnb *nrnb, gmx_wallcycle_t wcycle, -+ gmx_edsam_t ed, t_forcerec *fr, -+ int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed, -+ real cpt_period, real max_hours, -+ const char gmx_unused *deviceOptions, -+ int imdport, -+ unsigned long Flags, -+ gmx_walltime_accounting_t walltime_accounting) -+{ -+ gmx_mdoutf_t outf = NULL; -+ gmx_int64_t step, step_rel; -+ double elapsed_time; -+ double t, t0, lam0[efptNR]; -+ gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEner; -+ gmx_bool bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE, -+ bFirstStep, bStateFromCP, 
bStateFromTPX, bInitStep, bLastStep, -+ bBornRadii, bStartingFromCpt; -+ gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; -+ gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE, -+ bForceUpdate = FALSE, bCPT; -+ gmx_bool bMasterState; -+ int force_flags, cglo_flags; -+ tensor force_vir, shake_vir, total_vir, tmp_vir, pres; -+ int i, m; -+ t_trxstatus *status; -+ rvec mu_tot; -+ t_vcm *vcm; -+ t_state *bufstate = NULL; -+ matrix *scale_tot, pcoupl_mu, M, ebox; -+ gmx_nlheur_t nlh; -+ t_trxframe rerun_fr; -+ gmx_repl_ex_t repl_ex = NULL; -+ int nchkpt = 1; -+ gmx_localtop_t *top; -+ t_mdebin *mdebin = NULL; -+ t_state *state = NULL; -+ rvec *f_global = NULL; -+ gmx_enerdata_t *enerd; -+ rvec *f = NULL; -+ gmx_global_stat_t gstat; -+ gmx_update_t upd = NULL; -+ t_graph *graph = NULL; -+ globsig_t gs; -+ gmx_groups_t *groups; -+ gmx_ekindata_t *ekind, *ekind_save; -+ gmx_shellfc_t shellfc; -+ int count, nconverged = 0; -+ real timestep = 0; -+ double tcount = 0; -+ gmx_bool bConverged = TRUE, bOK, bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition; -+ gmx_bool bAppend; -+ gmx_bool bResetCountersHalfMaxH = FALSE; -+ gmx_bool bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter; -+ gmx_bool bUpdateDoLR; -+ real dvdl_constr; -+ rvec *cbuf = NULL; -+ matrix lastbox; -+ real veta_save, scalevir, tracevir; -+ real vetanew = 0; -+ int lamnew = 0; -+ /* for FEP */ -+ int nstfep; -+ double cycles; -+ real saved_conserved_quantity = 0; -+ real last_ekin = 0; -+ int iter_i; -+ t_extmass MassQ; -+ int **trotter_seq; -+ char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; -+ int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/ -+ gmx_iterate_t iterate; -+ gmx_int64_t multisim_nsteps = -1; /* number of steps to do before first multisim -+ simulation stops. If equal to zero, don't -+ communicate any more between multisims.*/ -+ /* PME load balancing data for GPU kernels */ -+ pme_load_balancing_t pme_loadbal = NULL; -+ double cycles_pmes; -+ gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE; -+ -+ /* Interactive MD */ -+ gmx_bool bIMDstep = FALSE; -+ -+#ifdef GMX_FAHCORE -+ /* Temporary addition for FAHCORE checkpointing */ -+ int chkpt_ret; -+#endif -+ -+ /* Check for special mdrun options */ -+ bRerunMD = (Flags & MD_RERUN); -+ bAppend = (Flags & MD_APPENDFILES); -+ if (Flags & MD_RESETCOUNTERSHALFWAY) -+ { -+ if (ir->nsteps > 0) -+ { -+ /* Signal to reset the counters half the simulation steps. */ -+ wcycle_set_reset_counters(wcycle, ir->nsteps/2); -+ } -+ /* Signal to reset the counters halfway the simulation time. */ -+ bResetCountersHalfMaxH = (max_hours > 0); -+ } -+ -+ /* md-vv uses averaged full step velocities for T-control -+ md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control) -+ md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */ -+ bVV = EI_VV(ir->eI); -+ if (bVV) /* to store the initial velocities while computing virial */ -+ { -+ snew(cbuf, top_global->natoms); -+ } -+ /* all the iteratative cases - only if there are constraints */ -+ bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD)); -+ gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to -+ false in this step. 
The correct value, true or false, -+ is set at each step, as it depends on the frequency of temperature -+ and pressure control.*/ -+ bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir))); -+ -+ if (bRerunMD) -+ { -+ /* Since we don't know if the frames read are related in any way, -+ * rebuild the neighborlist at every step. -+ */ -+ ir->nstlist = 1; -+ ir->nstcalcenergy = 1; -+ nstglobalcomm = 1; -+ } -+ -+ check_ir_old_tpx_versions(cr, fplog, ir, top_global); -+ -+ nstglobalcomm = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir); -+ bGStatEveryStep = (nstglobalcomm == 1); -+ -+ if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL) -+ { -+ fprintf(fplog, -+ "To reduce the energy communication with nstlist = -1\n" -+ "the neighbor list validity should not be checked at every step,\n" -+ "this means that exact integration is not guaranteed.\n" -+ "The neighbor list validity is checked after:\n" -+ " - 2*std.dev.(n.list life time) steps.\n" -+ "In most cases this will result in exact integration.\n" -+ "This reduces the energy communication by a factor of 2 to 3.\n" -+ "If you want less energy communication, set nstlist > 3.\n\n"); -+ } -+ -+ if (bRerunMD) -+ { -+ ir->nstxout_compressed = 0; -+ } -+ groups = &top_global->groups; -+ -+ /* Initial values */ -+ init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda, -+ &(state_global->fep_state), lam0, -+ nrnb, top_global, &upd, -+ nfile, fnm, &outf, &mdebin, -+ force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags, wcycle); -+ -+ clear_mat(total_vir); -+ clear_mat(pres); -+ /* Energy terms and groups */ -+ snew(enerd, 1); -+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, -+ enerd); -+ if (DOMAINDECOMP(cr)) -+ { -+ f = NULL; -+ } -+ else -+ { -+ snew(f, top_global->natoms); -+ } -+ -+ /* Kinetic energy data */ -+ snew(ekind, 1); -+ init_ekindata(fplog, top_global, &(ir->opts), ekind); -+ /* needed for iteration of constraints */ -+ snew(ekind_save, 1); -+ init_ekindata(fplog, top_global, &(ir->opts), ekind_save); -+ /* Copy the cos acceleration to the groups struct */ -+ ekind->cosacc.cos_accel = ir->cos_accel; -+ -+ gstat = global_stat_init(ir); -+ debug_gmx(); -+ -+ /* Check for polarizable models and flexible constraints */ -+ shellfc = init_shell_flexcon(fplog, -+ top_global, n_flexible_constraints(constr), -+ (ir->bContinuation || -+ (DOMAINDECOMP(cr) && !MASTER(cr))) ? 
-+ NULL : state_global->x); -+ if (shellfc && ir->nstcalcenergy != 1) -+ { -+ gmx_fatal(FARGS, "You have nstcalcenergy set to a value (%d) that is different from 1.\nThis is not supported in combinations with shell particles.\nPlease make a new tpr file.", ir->nstcalcenergy); -+ } -+ if (shellfc && DOMAINDECOMP(cr)) -+ { -+ gmx_fatal(FARGS, "Shell particles are not implemented with domain decomposition, use a single rank"); -+ } -+ if (shellfc && ir->eI == eiNM) -+ { -+ /* Currently shells don't work with Normal Modes */ -+ gmx_fatal(FARGS, "Normal Mode analysis is not supported with shells.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n"); -+ } -+ -+ if (vsite && ir->eI == eiNM) -+ { -+ /* Currently virtual sites don't work with Normal Modes */ -+ gmx_fatal(FARGS, "Normal Mode analysis is not supported with virtual sites.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n"); -+ } -+ -+ if (DEFORM(*ir)) -+ { -+ tMPI_Thread_mutex_lock(&deform_init_box_mutex); -+ set_deform_reference_box(upd, -+ deform_init_init_step_tpx, -+ deform_init_box_tpx); -+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex); -+ } -+ -+ { -+ double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1); -+ if ((io > 2000) && MASTER(cr)) -+ { -+ fprintf(stderr, -+ "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", -+ io); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ top = dd_init_local_top(top_global); -+ -+ snew(state, 1); -+ dd_init_local_state(cr->dd, state_global, state); -+ -+ if (DDMASTER(cr->dd) && ir->nstfout) -+ { -+ snew(f_global, state_global->natoms); -+ } -+ } -+ else -+ { -+ top = gmx_mtop_generate_local_top(top_global, ir); -+ -+ forcerec_set_excl_load(fr, top); -+ -+ state = serial_init_local_state(state_global); -+ f_global = f; -+ -+ atoms2md(top_global, ir, 0, NULL, top_global->natoms, mdatoms); -+ -+ if (vsite) -+ { -+ set_vsite_top(vsite, top, mdatoms, cr); -+ } -+ -+ if (ir->ePBC != epbcNONE && !fr->bMolPBC) -+ { -+ graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE); -+ } -+ -+ if (shellfc) -+ { -+ make_local_shells(cr, mdatoms, shellfc); -+ } -+ -+ setup_bonded_threading(fr, &top->idef); -+ } -+ -+ /* Set up interactive MD (IMD) */ -+ init_IMD(ir, cr, top_global, fplog, ir->nstcalcenergy, state_global->x, -+ nfile, fnm, oenv, imdport, Flags); -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ /* Distribute the charge groups over the nodes from the master node */ -+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, FALSE); -+ -+ } -+ -+ update_mdatoms(mdatoms, state->lambda[efptMASS]); -+ -+ if (opt2bSet("-cpi", nfile, fnm)) -+ { -+ bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr); -+ } -+ else -+ { -+ bStateFromCP = FALSE; -+ } -+ -+ if (ir->bExpanded) -+ { -+ init_expanded_ensemble(bStateFromCP, ir, &state->dfhist); -+ } -+ -+ if (MASTER(cr)) -+ { -+ if (bStateFromCP) -+ { -+ /* Update mdebin with energy history if appending to output files */ -+ if (Flags & MD_APPENDFILES) -+ { -+ restore_energyhistory_from_state(mdebin, &state_global->enerhist); -+ } -+ else -+ { -+ /* We might have read an energy history from checkpoint, -+ * free the allocated memory and reset the counts. 
-+ */ -+ done_energyhistory(&state_global->enerhist); -+ init_energyhistory(&state_global->enerhist); -+ } -+ } -+ /* Set the initial energy history in state by updating once */ -+ update_energyhistory(&state_global->enerhist, mdebin); -+ } -+ -+ /* Initialize constraints */ -+ if (constr && !DOMAINDECOMP(cr)) -+ { -+ set_constraints(constr, top, ir, mdatoms, cr); -+ } -+ -+ if (repl_ex_nst > 0 && MASTER(cr)) -+ { -+ repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir, -+ repl_ex_nst, repl_ex_nex, repl_ex_seed); -+ } -+ -+ /* PME tuning is only supported with GPUs or PME nodes and not with rerun. -+ * PME tuning is not supported with PME only for LJ and not for Coulomb. -+ */ -+ if ((Flags & MD_TUNEPME) && -+ EEL_PME(fr->eeltype) && -+ ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) && -+ !bRerunMD) -+ { -+ pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata); -+ cycles_pmes = 0; -+ if (cr->duty & DUTY_PME) -+ { -+ /* Start tuning right away, as we can't measure the load */ -+ bPMETuneRunning = TRUE; -+ } -+ else -+ { -+ /* Separate PME nodes, we can measure the PP/PME load balance */ -+ bPMETuneTry = TRUE; -+ } -+ } -+ -+ if (!ir->bContinuation && !bRerunMD) -+ { -+ if (mdatoms->cFREEZE && (state->flags & (1<homenr; i++) -+ { -+ for (m = 0; m < DIM; m++) -+ { -+ if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m]) -+ { -+ state->v[i][m] = 0; -+ } -+ } -+ } -+ } -+ -+ if (constr) -+ { -+ /* Constrain the initial coordinates and velocities */ -+ do_constrain_first(fplog, constr, ir, mdatoms, state, -+ cr, nrnb, fr, top); -+ } -+ if (vsite) -+ { -+ /* Construct the virtual sites for the initial configuration */ -+ construct_vsites(vsite, state->x, ir->delta_t, NULL, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ } -+ } -+ -+ debug_gmx(); -+ -+ /* set free energy calculation frequency as the minimum -+ greatest common denominator of nstdhdl, nstexpanded, and repl_ex_nst*/ -+ nstfep = ir->fepvals->nstdhdl; -+ if (ir->bExpanded) -+ { -+ nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl, nstfep); -+ } -+ if (repl_ex_nst > 0) -+ { -+ nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep); -+ } -+ -+ /* I'm assuming we need global communication the first time! MRS */ -+ cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT -+ | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0) -+ | (bVV ? CGLO_PRESSURE : 0) -+ | (bVV ? CGLO_CONSTRAINT : 0) -+ | (bRerunMD ? CGLO_RERUNMD : 0) -+ | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0)); -+ -+ bSumEkinhOld = FALSE; -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, cglo_flags); -+ if (ir->eI == eiVVAK) -+ { -+ /* a second call to get the half step temperature initialized as well */ -+ /* we do the same call as above, but turn the pressure off -- internally to -+ compute_globals, this is recognized as a velocity verlet half-step -+ kinetic energy calculation. 
This minimized excess variables, but -+ perhaps loses some logic?*/ -+ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE)); -+ } -+ -+ /* Calculate the initial half step temperature, and save the ekinh_old */ -+ if (!(Flags & MD_STARTFROMCPT)) -+ { -+ for (i = 0; (i < ir->opts.ngtc); i++) -+ { -+ copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old); -+ } -+ } -+ if (ir->eI != eiVV) -+ { -+ enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step, -+ and there is no previous step */ -+ } -+ -+ /* if using an iterative algorithm, we need to create a working directory for the state. */ -+ if (bIterativeCase) -+ { -+ bufstate = init_bufstate(state); -+ } -+ -+ /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter -+ temperature control */ -+ trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); -+ -+ if (MASTER(cr)) -+ { -+ if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS) -+ { -+ fprintf(fplog, -+ "RMS relative constraint deviation after constraining: %.2e\n", -+ constr_rmsd(constr, FALSE)); -+ } -+ if (EI_STATE_VELOCITY(ir->eI)) -+ { -+ fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]); -+ } -+ if (bRerunMD) -+ { -+ fprintf(stderr, "starting md rerun '%s', reading coordinates from" -+ " input trajectory '%s'\n\n", -+ *(top_global->name), opt2fn("-rerun", nfile, fnm)); -+ if (bVerbose) -+ { -+ fprintf(stderr, "Calculated time to finish depends on nsteps from " -+ "run input file,\nwhich may not correspond to the time " -+ "needed to process input trajectory.\n\n"); -+ } -+ } -+ else -+ { -+ char tbuf[20]; -+ fprintf(stderr, "starting mdrun '%s'\n", -+ *(top_global->name)); -+ if (ir->nsteps >= 0) -+ { -+ sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); -+ } -+ else -+ { -+ sprintf(tbuf, "%s", "infinite"); -+ } -+ if (ir->init_step > 0) -+ { -+ fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", -+ gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, -+ gmx_step_str(ir->init_step, sbuf2), -+ ir->init_step*ir->delta_t); -+ } -+ else -+ { -+ fprintf(stderr, "%s steps, %s ps.\n", -+ gmx_step_str(ir->nsteps, sbuf), tbuf); -+ } -+ } -+ fprintf(fplog, "\n"); -+ } -+ -+ walltime_accounting_start(walltime_accounting); -+ wallcycle_start(wcycle, ewcRUN); -+ print_start(fplog, cr, walltime_accounting, "mdrun"); -+ -+ /* safest point to do file checkpointing is here. 
More general point would be immediately before integrator call */ -+#ifdef GMX_FAHCORE -+ chkpt_ret = fcCheckPointParallel( cr->nodeid, -+ NULL, 0); -+ if (chkpt_ret == 0) -+ { -+ gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); -+ } -+#endif -+ -+ debug_gmx(); -+ /*********************************************************** -+ * -+ * Loop over MD steps -+ * -+ ************************************************************/ -+ -+ /* if rerunMD then read coordinates and velocities from input trajectory */ -+ if (bRerunMD) -+ { -+ if (getenv("GMX_FORCE_UPDATE")) -+ { -+ bForceUpdate = TRUE; -+ } -+ -+ rerun_fr.natoms = 0; -+ if (MASTER(cr)) -+ { -+ bNotLastFrame = read_first_frame(oenv, &status, -+ opt2fn("-rerun", nfile, fnm), -+ &rerun_fr, TRX_NEED_X | TRX_READ_V); -+ if (rerun_fr.natoms != top_global->natoms) -+ { -+ gmx_fatal(FARGS, -+ "Number of atoms in trajectory (%d) does not match the " -+ "run input file (%d)\n", -+ rerun_fr.natoms, top_global->natoms); -+ } -+ if (ir->ePBC != epbcNONE) -+ { -+ if (!rerun_fr.bBox) -+ { -+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time); -+ } -+ if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong)) -+ { -+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time); -+ } -+ } -+ } -+ -+ if (PAR(cr)) -+ { -+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); -+ } -+ -+ if (ir->ePBC != epbcNONE) -+ { -+ /* Set the shift vectors. -+ * Necessary here when have a static box different from the tpr box. -+ */ -+ calc_shifts(rerun_fr.box, fr->shift_vec); -+ } -+ } -+ -+ /* loop over MD steps or if rerunMD to end of input trajectory */ -+ bFirstStep = TRUE; -+ /* Skip the first Nose-Hoover integration when we get the state from tpx */ -+ bStateFromTPX = !bStateFromCP; -+ bInitStep = bFirstStep && (bStateFromTPX || bVV); -+ bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep; -+ bLastStep = FALSE; -+ bSumEkinhOld = FALSE; -+ bDoReplEx = FALSE; -+ bExchanged = FALSE; -+ bNeedRepartition = FALSE; -+ -+ init_global_signals(&gs, cr, ir, repl_ex_nst); -+ -+ step = ir->init_step; -+ step_rel = 0; -+ -+ if (ir->nstlist == -1) -+ { -+ init_nlistheuristics(&nlh, bGStatEveryStep, step); -+ } -+ -+ if (MULTISIM(cr) && (repl_ex_nst <= 0 )) -+ { -+ /* check how many steps are left in other sims */ -+ multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps); -+ } -+ -+ -+ /* and stop now if we should */ -+ bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) || -+ ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps ))); -+ while (!bLastStep || (bRerunMD && bNotLastFrame)) -+ { -+ -+ wallcycle_start(wcycle, ewcSTEP); -+ -+ if (bRerunMD) -+ { -+ if (rerun_fr.bStep) -+ { -+ step = rerun_fr.step; -+ step_rel = step - ir->init_step; -+ } -+ if (rerun_fr.bTime) -+ { -+ t = rerun_fr.time; -+ } -+ else -+ { -+ t = step; -+ } -+ } -+ else -+ { -+ bLastStep = (step_rel == ir->nsteps); -+ t = t0 + step*ir->delta_t; -+ } -+ -+ if (ir->efep != efepNO || ir->bSimTemp) -+ { -+ /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value, -+ requiring different logic. 
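-+ In a normal run the lambda value simply follows the fepvals schedule for this step;
-+ with -rerun it can instead be taken from the trajectory frame when the frame provides one.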
*/ -+ -+ set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0); -+ bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl); -+ bDoFEP = (do_per_step(step, nstfep) && (ir->efep != efepNO)); -+ bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) -+ && (ir->bExpanded) && (step > 0) && (!bStartingFromCpt)); -+ } -+ -+ bDoReplEx = ((repl_ex_nst > 0) && (step > 0) && !bLastStep && -+ do_per_step(step, repl_ex_nst)); -+ -+ if (bSimAnn) -+ { -+ update_annealing_target_temp(&(ir->opts), t); -+ } -+ -+ if (bRerunMD) -+ { -+ if (!DOMAINDECOMP(cr) || MASTER(cr)) -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(rerun_fr.x[i], state_global->x[i]); -+ } -+ if (rerun_fr.bV) -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ copy_rvec(rerun_fr.v[i], state_global->v[i]); -+ } -+ } -+ else -+ { -+ for (i = 0; i < state_global->natoms; i++) -+ { -+ clear_rvec(state_global->v[i]); -+ } -+ if (bRerunWarnNoV) -+ { -+ fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n" -+ " Ekin, temperature and pressure are incorrect,\n" -+ " the virial will be incorrect when constraints are present.\n" -+ "\n"); -+ bRerunWarnNoV = FALSE; -+ } -+ } -+ } -+ copy_mat(rerun_fr.box, state_global->box); -+ copy_mat(state_global->box, state->box); -+ -+ if (vsite && (Flags & MD_RERUN_VSITE)) -+ { -+ if (DOMAINDECOMP(cr)) -+ { -+ gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented with domain decomposition, use a single rank"); -+ } -+ if (graph) -+ { -+ /* Following is necessary because the graph may get out of sync -+ * with the coordinates if we only have every N'th coordinate set -+ */ -+ mk_mshift(fplog, graph, fr->ePBC, state->box, state->x); -+ shift_self(graph, state->box, state->x); -+ } -+ construct_vsites(vsite, state->x, ir->delta_t, state->v, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ if (graph) -+ { -+ unshift_self(graph, state->box, state->x); -+ } -+ } -+ } -+ -+ /* Stop Center of Mass motion */ -+ bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm)); -+ -+ if (bRerunMD) -+ { -+ /* for rerun MD always do Neighbour Searching */ -+ bNS = (bFirstStep || ir->nstlist != 0); -+ bNStList = bNS; -+ } -+ else -+ { -+ /* Determine whether or not to do Neighbour Searching and LR */ -+ bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); -+ -+ bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP || -+ (ir->nstlist == -1 && nlh.nabnsb > 0)); -+ -+ if (bNS && ir->nstlist == -1) -+ { -+ set_nlistheuristics(&nlh, bFirstStep || bExchanged || bNeedRepartition || bDoFEP, step); -+ } -+ } -+ -+ /* check whether we should stop because another simulation has -+ stopped. 
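-+ (multisim_nsteps is only set above for multi-simulations run without replica exchange,
-+ so in practice this check only triggers in that case.)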
*/ -+ if (MULTISIM(cr)) -+ { -+ if ( (multisim_nsteps >= 0) && (step_rel >= multisim_nsteps) && -+ (multisim_nsteps != ir->nsteps) ) -+ { -+ if (bNS) -+ { -+ if (MASTER(cr)) -+ { -+ fprintf(stderr, -+ "Stopping simulation %d because another one has finished\n", -+ cr->ms->sim); -+ } -+ bLastStep = TRUE; -+ gs.sig[eglsCHKPT] = 1; -+ } -+ } -+ } -+ -+ /* < 0 means stop at next step, > 0 means stop at next NS step */ -+ if ( (gs.set[eglsSTOPCOND] < 0) || -+ ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) ) -+ { -+ bLastStep = TRUE; -+ } -+ -+ /* Determine whether or not to update the Born radii if doing GB */ -+ bBornRadii = bFirstStep; -+ if (ir->implicit_solvent && (step % ir->nstgbradii == 0)) -+ { -+ bBornRadii = TRUE; -+ } -+ -+ do_log = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep; -+ do_verbose = bVerbose && -+ (step % stepout == 0 || bFirstStep || bLastStep); -+ -+ if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD)) -+ { -+ if (bRerunMD) -+ { -+ bMasterState = TRUE; -+ } -+ else -+ { -+ bMasterState = FALSE; -+ /* Correct the new box if it is too skewed */ -+ if (DYNAMIC_BOX(*ir)) -+ { -+ if (correct_box(fplog, step, state->box, graph)) -+ { -+ bMasterState = TRUE; -+ } -+ } -+ if (DOMAINDECOMP(cr) && bMasterState) -+ { -+ dd_collect_state(cr->dd, state, state_global); -+ } -+ } -+ -+ if (DOMAINDECOMP(cr)) -+ { -+ /* Repartition the domain decomposition */ -+ wallcycle_start(wcycle, ewcDOMDEC); -+ dd_partition_system(fplog, step, cr, -+ bMasterState, nstglobalcomm, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, -+ do_verbose && !bPMETuneRunning); -+ wallcycle_stop(wcycle, ewcDOMDEC); -+ /* If using an iterative integrator, reallocate space to match the decomposition */ -+ } -+ } -+ -+ if (MASTER(cr) && do_log) -+ { -+ print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */ -+ } -+ -+ if (ir->efep != efepNO) -+ { -+ update_mdatoms(mdatoms, state->lambda[efptMASS]); -+ } -+ -+ if ((bRerunMD && rerun_fr.bV) || bExchanged) -+ { -+ -+ /* We need the kinetic energy at minus the half step for determining -+ * the full step kinetic energy and possibly for T-coupling.*/ -+ /* This may not be quite working correctly yet . . . . */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); -+ } -+ clear_mat(force_vir); -+ -+ /* We write a checkpoint at this MD step when: -+ * either at an NS step when we signalled through gs, -+ * or at the last step (but not when we do not want confout), -+ * but never at the first step or with rerun. -+ */ -+ bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) || -+ (bLastStep && (Flags & MD_CONFOUT))) && -+ step > ir->init_step && !bRerunMD); -+ if (bCPT) -+ { -+ gs.set[eglsCHKPT] = 0; -+ } -+ -+ /* Determine the energy and pressure: -+ * at nstcalcenergy steps and at energy output steps (set below). -+ */ -+ if (EI_VV(ir->eI) && (!bInitStep)) -+ { -+ /* for vv, the first half of the integration actually corresponds -+ to the previous step. bCalcEner is only required to be evaluated on the 'next' step, -+ but the virial needs to be calculated on both the current step and the 'next' step. Future -+ reorganization may be able to get rid of one of the bCalcVir=TRUE steps. 
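-+ For example, with nstcalcenergy=100 the energies are evaluated when step-1 is a
-+ multiple of 100, while the virial is also needed whenever pressure coupling acts
-+ on either step or step-1.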
*/ -+ -+ bCalcEner = do_per_step(step-1, ir->nstcalcenergy); -+ bCalcVir = bCalcEner || -+ (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple))); -+ } -+ else -+ { -+ bCalcEner = do_per_step(step, ir->nstcalcenergy); -+ bCalcVir = bCalcEner || -+ (ir->epc != epcNO && do_per_step(step, ir->nstpcouple)); -+ } -+ -+ /* Do we need global communication ? */ -+ bGStat = (bCalcVir || bCalcEner || bStopCM || -+ do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) || -+ (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck)); -+ -+ do_ene = (do_per_step(step, ir->nstenergy) || bLastStep); -+ -+ if (do_ene || do_log || bDoReplEx) -+ { -+ bCalcVir = TRUE; -+ bCalcEner = TRUE; -+ bGStat = TRUE; -+ } -+ -+ /* these CGLO_ options remain the same throughout the iteration */ -+ cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) | -+ (bGStat ? CGLO_GSTAT : 0) -+ ); -+ -+ force_flags = (GMX_FORCE_STATECHANGED | -+ ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) | -+ GMX_FORCE_ALLFORCES | -+ GMX_FORCE_SEPLRF | -+ (bCalcVir ? GMX_FORCE_VIRIAL : 0) | -+ (bCalcEner ? GMX_FORCE_ENERGY : 0) | -+ (bDoFEP ? GMX_FORCE_DHDL : 0) -+ ); -+ -+ if (fr->bTwinRange) -+ { -+ if (do_per_step(step, ir->nstcalclr)) -+ { -+ force_flags |= GMX_FORCE_DO_LR; -+ } -+ } -+ -+ if (shellfc) -+ { -+ /* Now is the time to relax the shells */ -+ count = relax_shell_flexcon(fplog, cr, bVerbose, step, -+ ir, bNS, force_flags, -+ top, -+ constr, enerd, fcd, -+ state, f, force_vir, mdatoms, -+ nrnb, wcycle, graph, groups, -+ shellfc, fr, bBornRadii, t, mu_tot, -+ &bConverged, vsite, -+ mdoutf_get_fp_field(outf)); -+ tcount += count; -+ -+ if (bConverged) -+ { -+ nconverged++; -+ } -+ } -+ else -+ { -+ /* The coordinates (x) are shifted (to get whole molecules) -+ * in do_force. -+ * This is parallellized as well, and does communication too. -+ * Check comments in sim_util.c -+ */ -+ do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, -+ state->box, state->x, &state->hist, -+ f, force_vir, mdatoms, enerd, fcd, -+ state->lambda, graph, -+ fr, vsite, mu_tot, t, mdoutf_get_fp_field(outf), ed, bBornRadii, -+ (bNS ? GMX_FORCE_NS : 0) | force_flags); -+ } -+ -+ if (bVV && !bStartingFromCpt && !bRerunMD) -+ /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ -+ { -+ wallcycle_start(wcycle, ewcUPDATE); -+ if (ir->eI == eiVV && bInitStep) -+ { -+ /* if using velocity verlet with full time step Ekin, -+ * take the first half step only to compute the -+ * virial for the first step. From there, -+ * revert back to the initial coordinates -+ * so that the input is actually the initial step. -+ */ -+ copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */ -+ } -+ else -+ { -+ /* this is for NHC in the Ekin(t+dt/2) version of vv */ -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); -+ } -+ -+ /* If we are using twin-range interactions where the long-range component -+ * is only evaluated every nstcalclr>1 steps, we should do a special update -+ * step to combine the long-range forces on these steps. -+ * For nstcalclr=1 this is not done, since the forces would have been added -+ * directly to the short-range forces already. -+ * -+ * TODO Remove various aspects of VV+twin-range in master -+ * branch, because VV integrators did not ever support -+ * twin-range multiple time stepping with constraints. 
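-+ * Until then, bUpdateDoLR below only folds the long-range forces in on the
-+ * steps where they were actually evaluated.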
-+ */ -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, -+ f, bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, bInitStep, etrtVELOCITY1, -+ cr, nrnb, constr, &top->idef); -+ -+ if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep) -+ { -+ gmx_iterate_init(&iterate, TRUE); -+ } -+ /* for iterations, we save these vectors, as we will be self-consistently iterating -+ the calculations */ -+ -+ /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */ -+ -+ /* save the state */ -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); -+ } -+ -+ bFirstIterate = TRUE; -+ while (bFirstIterate || iterate.bIterationActive) -+ { -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); -+ if (bFirstIterate && bTrotter) -+ { -+ /* The first time through, we need a decent first estimate -+ of veta(t+dt) to compute the constraints. Do -+ this by computing the box volume part of the -+ trotter integration at this time. Nothing else -+ should be changed by this routine here. If -+ !(first time), we start with the previous value -+ of veta. */ -+ -+ veta_save = state->veta; -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0); -+ vetanew = state->veta; -+ state->veta = veta_save; -+ } -+ } -+ -+ bOK = TRUE; -+ if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. */ -+ { -+ wallcycle_stop(wcycle, ewcUPDATE); -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, shake_vir, -+ cr, nrnb, wcycle, upd, constr, -+ TRUE, bCalcVir, vetanew); -+ wallcycle_start(wcycle, ewcUPDATE); -+ -+ if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1) -+ { -+ /* Correct the virial for multiple time stepping */ -+ m_sub(shake_vir, fr->vir_twin_constr, shake_vir); -+ } -+ -+ if (!bOK) -+ { -+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); -+ } -+ -+ } -+ else if (graph) -+ { -+ /* Need to unshift here if a do_force has been -+ called in the previous step */ -+ unshift_self(graph, state->box, state->x); -+ } -+ -+ /* if VV, compute the pressure and constraints */ -+ /* For VV2, we strictly only need this if using pressure -+ * control, but we really would like to have accurate pressures -+ * printed out. -+ * Think about ways around this in the future? -+ * For now, keep this choice in comments. -+ */ -+ /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */ -+ /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/ -+ bPres = TRUE; -+ bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK)); -+ if (bCalcEner && ir->eI == eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/ -+ { -+ bSumEkinhOld = TRUE; -+ } -+ /* for vv, the first half of the integration actually corresponds to the previous step. -+ So we need information from the last step in the first half of the integration */ -+ if (bGStat || do_per_step(step-1, nstglobalcomm)) -+ { -+ wallcycle_stop(wcycle, ewcUPDATE); -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ cglo_flags -+ | CGLO_ENERGY -+ | (bTemp ? CGLO_TEMPERATURE : 0) -+ | (bPres ? 
CGLO_PRESSURE : 0) -+ | (bPres ? CGLO_CONSTRAINT : 0) -+ | ((iterate.bIterationActive) ? CGLO_ITERATE : 0) -+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0) -+ | CGLO_SCALEEKIN -+ ); -+ /* explanation of above: -+ a) We compute Ekin at the full time step -+ if 1) we are using the AveVel Ekin, and it's not the -+ initial step, or 2) if we are using AveEkin, but need the full -+ time step kinetic energy for the pressure (always true now, since we want accurate statistics). -+ b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in -+ EkinAveVel because it's needed for the pressure */ -+ wallcycle_start(wcycle, ewcUPDATE); -+ } -+ /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ -+ if (!bInitStep) -+ { -+ if (bTrotter) -+ { -+ m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */ -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2); -+ } -+ else -+ { -+ if (bExchanged) -+ { -+ wallcycle_stop(wcycle, ewcUPDATE); -+ /* We need the kinetic energy at minus the half step for determining -+ * the full step kinetic energy and possibly for T-coupling.*/ -+ /* This may not be quite working correctly yet . . . . */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, -+ constr, NULL, FALSE, state->box, -+ top_global, &bSumEkinhOld, -+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); -+ wallcycle_start(wcycle, ewcUPDATE); -+ } -+ } -+ } -+ -+ if (iterate.bIterationActive && -+ done_iterating(cr, fplog, step, &iterate, bFirstIterate, -+ state->veta, &vetanew)) -+ { -+ break; -+ } -+ bFirstIterate = FALSE; -+ } -+ -+ if (bTrotter && !bInitStep) -+ { -+ copy_mat(shake_vir, state->svir_prev); -+ copy_mat(force_vir, state->fvir_prev); -+ if (IR_NVT_TROTTER(ir) && ir->eI == eiVV) -+ { -+ /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */ -+ enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE); -+ enerd->term[F_EKIN] = trace(ekind->ekin); -+ } -+ } -+ /* if it's the initial step, we performed this first step just to get the constraint virial */ -+ if (bInitStep && ir->eI == eiVV) -+ { -+ copy_rvecn(cbuf, state->v, 0, state->natoms); -+ } -+ wallcycle_stop(wcycle, ewcUPDATE); -+ } -+ -+ /* MRS -- now done iterating -- compute the conserved quantity */ -+ if (bVV) -+ { -+ saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ); -+ if (ir->eI == eiVV) -+ { -+ last_ekin = enerd->term[F_EKIN]; -+ } -+ if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) -+ { -+ saved_conserved_quantity -= enerd->term[F_DISPCORR]; -+ } -+ /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */ -+ if (!bRerunMD) -+ { -+ sum_dhdl(enerd, state->lambda, ir->fepvals); -+ } -+ } -+ -+ /* ######## END FIRST UPDATE STEP ############## */ -+ /* ######## If doing VV, we now have v(dt) ###### */ -+ if (bDoExpanded) -+ { -+ /* perform extended ensemble sampling in lambda - we don't -+ actually move to the new state before outputting -+ statistics, but if performing simulated tempering, we -+ do update the velocities and the tau_t. 
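-+ The new state index (lamnew) is therefore only copied into state->fep_state
-+ after the log and energy output further down.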
*/ -+ -+ lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, &state->dfhist, step, state->v, mdatoms); -+ /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */ -+ copy_df_history(&state_global->dfhist, &state->dfhist); -+ } -+ -+ /* Now we have the energies and forces corresponding to the -+ * coordinates at time t. We must output all of this before -+ * the update. -+ */ -+ do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, -+ ir, state, state_global, top_global, fr, -+ outf, mdebin, ekind, f, f_global, -+ &nchkpt, -+ bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT), -+ bSumEkinhOld); -+ /* Check if IMD step and do IMD communication, if bIMD is TRUE. */ -+ bIMDstep = do_IMD(ir->bIMD, step, cr, bNS, state->box, state->x, ir, t, wcycle); -+ -+ /* kludge -- virial is lost with restart for NPT control. Must restart */ -+ if (bStartingFromCpt && bVV) -+ { -+ copy_mat(state->svir_prev, shake_vir); -+ copy_mat(state->fvir_prev, force_vir); -+ } -+ -+ elapsed_time = walltime_accounting_get_current_elapsed_time(walltime_accounting); -+ -+ /* Check whether everything is still allright */ -+ if (((int)gmx_get_stop_condition() > handled_stop_condition) -+#ifdef GMX_THREAD_MPI -+ && MASTER(cr) -+#endif -+ ) -+ { -+ /* this is just make gs.sig compatible with the hack -+ of sending signals around by MPI_Reduce with together with -+ other floats */ -+ if (gmx_get_stop_condition() == gmx_stop_cond_next_ns) -+ { -+ gs.sig[eglsSTOPCOND] = 1; -+ } -+ if (gmx_get_stop_condition() == gmx_stop_cond_next) -+ { -+ gs.sig[eglsSTOPCOND] = -1; -+ } -+ /* < 0 means stop at next step, > 0 means stop at next NS step */ -+ if (fplog) -+ { -+ fprintf(fplog, -+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n", -+ gmx_get_signal_name(), -+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : ""); -+ fflush(fplog); -+ } -+ fprintf(stderr, -+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n", -+ gmx_get_signal_name(), -+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : ""); -+ fflush(stderr); -+ handled_stop_condition = (int)gmx_get_stop_condition(); -+ } -+ else if (MASTER(cr) && (bNS || ir->nstlist <= 0) && -+ (max_hours > 0 && elapsed_time > max_hours*60.0*60.0*0.99) && -+ gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0) -+ { -+ /* Signal to terminate the run */ -+ gs.sig[eglsSTOPCOND] = 1; -+ if (fplog) -+ { -+ fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); -+ } -+ fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); -+ } -+ -+ if (bResetCountersHalfMaxH && MASTER(cr) && -+ elapsed_time > max_hours*60.0*60.0*0.495) -+ { -+ gs.sig[eglsRESETCOUNTERS] = 1; -+ } -+ -+ if (ir->nstlist == -1 && !bRerunMD) -+ { -+ /* When bGStatEveryStep=FALSE, global_stat is only called -+ * when we check the atom displacements, not at NS steps. -+ * This means that also the bonded interaction count check is not -+ * performed immediately after NS. Therefore a few MD steps could -+ * be performed with missing interactions. -+ * But wrong energies are never written to file, -+ * since energies are only written after global_stat -+ * has been called. -+ */ -+ if (step >= nlh.step_nscheck) -+ { -+ nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs, -+ nlh.scale_tot, state->x); -+ } -+ else -+ { -+ /* This is not necessarily true, -+ * but step_nscheck is determined quite conservatively. 
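-+ * Until step_nscheck is reached, the number of atoms beyond the buffer is
-+ * simply taken to be zero.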
-+ */ -+ nlh.nabnsb = 0; -+ } -+ } -+ -+ /* In parallel we only have to check for checkpointing in steps -+ * where we do global communication, -+ * otherwise the other nodes don't know. -+ */ -+ if (MASTER(cr) && ((bGStat || !PAR(cr)) && -+ cpt_period >= 0 && -+ (cpt_period == 0 || -+ elapsed_time >= nchkpt*cpt_period*60.0)) && -+ gs.set[eglsCHKPT] == 0) -+ { -+ gs.sig[eglsCHKPT] = 1; -+ } -+ -+ /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */ -+ if (EI_VV(ir->eI)) -+ { -+ if (!bInitStep) -+ { -+ update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); -+ } -+ if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */ -+ { -+ gmx_bool bIfRandomize; -+ bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state, upd, constr); -+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ -+ if (constr && bIfRandomize) -+ { -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, tmp_vir, -+ cr, nrnb, wcycle, upd, constr, -+ TRUE, bCalcVir, vetanew); -+ } -+ } -+ } -+ -+ if (bIterativeCase && do_per_step(step, ir->nstpcouple)) -+ { -+ gmx_iterate_init(&iterate, TRUE); -+ /* for iterations, we save these vectors, as we will be redoing the calculations */ -+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); -+ } -+ -+ bFirstIterate = TRUE; -+ while (bFirstIterate || iterate.bIterationActive) -+ { -+ /* We now restore these vectors to redo the calculation with improved extended variables */ -+ if (iterate.bIterationActive) -+ { -+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); -+ } -+ -+ /* We make the decision to break or not -after- the calculation of Ekin and Pressure, -+ so scroll down for that logic */ -+ -+ /* ######### START SECOND UPDATE STEP ################# */ -+ /* Box is changed in update() when we do pressure coupling, -+ * but we should still use the old box for energy corrections and when -+ * writing it to the energy file, so it matches the trajectory files for -+ * the same timestep above. Make a copy in a separate array. -+ */ -+ copy_mat(state->box, lastbox); -+ -+ bOK = TRUE; -+ dvdl_constr = 0; -+ -+ if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate)) -+ { -+ wallcycle_start(wcycle, ewcUPDATE); -+ /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ -+ if (bTrotter) -+ { -+ if (iterate.bIterationActive) -+ { -+ if (bFirstIterate) -+ { -+ scalevir = 1; -+ } -+ else -+ { -+ /* we use a new value of scalevir to converge the iterations faster */ -+ scalevir = tracevir/trace(shake_vir); -+ } -+ msmul(shake_vir, scalevir, shake_vir); -+ m_add(force_vir, shake_vir, total_vir); -+ clear_mat(shake_vir); -+ } -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3); -+ /* We can only do Berendsen coupling after we have summed -+ * the kinetic energy or virial. Since the happens -+ * in global_state after update, we should only do it at -+ * step % nstlist = 1 with bGStatEveryStep=FALSE. -+ */ -+ } -+ else -+ { -+ update_tcouple(step, ir, state, ekind, &MassQ, mdatoms); -+ update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep); -+ } -+ -+ if (bVV) -+ { -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ /* velocity half-step update */ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? 
&fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, FALSE, etrtVELOCITY2, -+ cr, nrnb, constr, &top->idef); -+ } -+ -+ /* Above, initialize just copies ekinh into ekin, -+ * it doesn't copy position (for VV), -+ * and entire integrator for MD. -+ */ -+ -+ if (ir->eI == eiVVAK) -+ { -+ copy_rvecn(state->x, cbuf, 0, state->natoms); -+ } -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); -+ wallcycle_stop(wcycle, ewcUPDATE); -+ -+ update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state, -+ fr->bMolPBC, graph, f, -+ &top->idef, shake_vir, -+ cr, nrnb, wcycle, upd, constr, -+ FALSE, bCalcVir, state->veta); -+ -+ if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1) -+ { -+ /* Correct the virial for multiple time stepping */ -+ m_sub(shake_vir, fr->vir_twin_constr, shake_vir); -+ } -+ -+ if (ir->eI == eiVVAK) -+ { -+ /* erase F_EKIN and F_TEMP here? */ -+ /* just compute the kinetic energy at the half step to perform a trotter step */ -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, NULL, FALSE, lastbox, -+ top_global, &bSumEkinhOld, -+ cglo_flags | CGLO_TEMPERATURE -+ ); -+ wallcycle_start(wcycle, ewcUPDATE); -+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4); -+ /* now we know the scaling, we can compute the positions again again */ -+ copy_rvecn(cbuf, state->x, 0, state->natoms); -+ -+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); -+ -+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, -+ bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd, -+ ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); -+ wallcycle_stop(wcycle, ewcUPDATE); -+ -+ /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */ -+ /* are the small terms in the shake_vir here due -+ * to numerical errors, or are they important -+ * physically? I'm thinking they are just errors, but not completely sure. -+ * For now, will call without actually constraining, constr=NULL*/ -+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms, -+ state, fr->bMolPBC, graph, f, -+ &top->idef, tmp_vir, -+ cr, nrnb, wcycle, upd, NULL, -+ FALSE, bCalcVir, -+ state->veta); -+ } -+ if (!bOK) -+ { -+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); -+ } -+ -+ if (fr->bSepDVDL && fplog && do_log) -+ { -+ gmx_print_sepdvdl(fplog, "Constraint dV/dl", 0.0, dvdl_constr); -+ } -+ if (bVV) -+ { -+ /* this factor or 2 correction is necessary -+ because half of the constraint force is removed -+ in the vv step, so we have to double it. See -+ the Redmine issue #1255. It is not yet clear -+ if the factor of 2 is exact, or just a very -+ good approximation, and this will be -+ investigated. The next step is to see if this -+ can be done adding a dhdl contribution from the -+ rattle step, but this is somewhat more -+ complicated with the current code. Will be -+ investigated, hopefully for 4.6.3. However, -+ this current solution is much better than -+ having it completely wrong. 
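-+ In short: the constraint contribution to dH/dl is counted as 2*dvdl_constr
-+ under VV and as dvdl_constr for the other integrators.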
-+ */ -+ enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr; -+ } -+ else -+ { -+ enerd->term[F_DVDL_CONSTR] += dvdl_constr; -+ } -+ } -+ else if (graph) -+ { -+ /* Need to unshift here */ -+ unshift_self(graph, state->box, state->x); -+ } -+ -+ if (vsite != NULL) -+ { -+ wallcycle_start(wcycle, ewcVSITECONSTR); -+ if (graph != NULL) -+ { -+ shift_self(graph, state->box, state->x); -+ } -+ construct_vsites(vsite, state->x, ir->delta_t, state->v, -+ top->idef.iparams, top->idef.il, -+ fr->ePBC, fr->bMolPBC, cr, state->box); -+ -+ if (graph != NULL) -+ { -+ unshift_self(graph, state->box, state->x); -+ } -+ wallcycle_stop(wcycle, ewcVSITECONSTR); -+ } -+ -+ /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */ -+ /* With Leap-Frog we can skip compute_globals at -+ * non-communication steps, but we need to calculate -+ * the kinetic energy one step before communication. -+ */ -+ if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm))) -+ { -+ if (ir->nstlist == -1 && bFirstIterate) -+ { -+ gs.sig[eglsNABNSB] = nlh.nabnsb; -+ } -+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, -+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, -+ constr, -+ bFirstIterate ? &gs : NULL, -+ (step_rel % gs.nstms == 0) && -+ (multisim_nsteps < 0 || (step_rel < multisim_nsteps)), -+ lastbox, -+ top_global, &bSumEkinhOld, -+ cglo_flags -+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0) -+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) -+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) -+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) -+ | (iterate.bIterationActive ? CGLO_ITERATE : 0) -+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0) -+ | CGLO_CONSTRAINT -+ ); -+ if (ir->nstlist == -1 && bFirstIterate) -+ { -+ nlh.nabnsb = gs.set[eglsNABNSB]; -+ gs.set[eglsNABNSB] = 0; -+ } -+ } -+ /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */ -+ /* ############# END CALC EKIN AND PRESSURE ################# */ -+ -+ /* Note: this is OK, but there are some numerical precision issues with using the convergence of -+ the virial that should probably be addressed eventually. state->veta has better properies, -+ but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could -+ generate the new shake_vir, but test the veta value for convergence. This will take some thought. */ -+ -+ if (iterate.bIterationActive && -+ done_iterating(cr, fplog, step, &iterate, bFirstIterate, -+ trace(shake_vir), &tracevir)) -+ { -+ break; -+ } -+ bFirstIterate = FALSE; -+ } -+ -+ if (!bVV || bRerunMD) -+ { -+ /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */ -+ sum_dhdl(enerd, state->lambda, ir->fepvals); -+ } -+ update_box(fplog, step, ir, mdatoms, state, f, -+ ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, upd); -+ -+ /* ################# END UPDATE STEP 2 ################# */ -+ /* #### We now have r(t+dt) and v(t+dt/2) ############# */ -+ -+ /* The coordinates (x) were unshifted in update */ -+ if (!bGStat) -+ { -+ /* We will not sum ekinh_old, -+ * so signal that we still have to do it. 
-+ */ -+ bSumEkinhOld = TRUE; -+ } -+ -+ /* ######### BEGIN PREPARING EDR OUTPUT ########### */ -+ -+ /* use the directly determined last velocity, not actually the averaged half steps */ -+ if (bTrotter && ir->eI == eiVV) -+ { -+ enerd->term[F_EKIN] = last_ekin; -+ } -+ enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN]; -+ -+ if (bVV) -+ { -+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity; -+ } -+ else -+ { -+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ); -+ } -+ /* ######### END PREPARING EDR OUTPUT ########### */ -+ -+ /* Output stuff */ -+ if (MASTER(cr)) -+ { -+ gmx_bool do_dr, do_or; -+ -+ if (fplog && do_log && bDoExpanded) -+ { -+ /* only needed if doing expanded ensemble */ -+ PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL, -+ &state_global->dfhist, state->fep_state, ir->nstlog, step); -+ } -+ if (!(bStartingFromCpt && (EI_VV(ir->eI)))) -+ { -+ if (bCalcEner) -+ { -+ upd_mdebin(mdebin, bDoDHDL, TRUE, -+ t, mdatoms->tmass, enerd, state, -+ ir->fepvals, ir->expandedvals, lastbox, -+ shake_vir, force_vir, total_vir, pres, -+ ekind, mu_tot, constr); -+ } -+ else -+ { -+ upd_mdebin_step(mdebin); -+ } -+ -+ do_dr = do_per_step(step, ir->nstdisreout); -+ do_or = do_per_step(step, ir->nstorireout); -+ -+ print_ebin(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or, do_log ? fplog : NULL, -+ step, t, -+ eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts)); -+ } -+ if (ir->ePull != epullNO) -+ { -+ pull_print_output(ir->pull, step, t); -+ } -+ -+ if (do_per_step(step, ir->nstlog)) -+ { -+ if (fflush(fplog) != 0) -+ { -+ gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?"); -+ } -+ } -+ } -+ if (bDoExpanded) -+ { -+ /* Have to do this part _after_ outputting the logfile and the edr file */ -+ /* Gets written into the state at the beginning of next loop*/ -+ state->fep_state = lamnew; -+ } -+ /* Print the remaining wall clock time for the run */ -+ if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning) -+ { -+ if (shellfc) -+ { -+ fprintf(stderr, "\n"); -+ } -+ print_time(stderr, walltime_accounting, step, ir, cr); -+ } -+ -+ /* Ion/water position swapping. -+ * Not done in last step since trajectory writing happens before this call -+ * in the MD loop and exchanges would be lost anyway. */ -+ bNeedRepartition = FALSE; -+ if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && -+ do_per_step(step, ir->swap->nstswap)) -+ { -+ bNeedRepartition = do_swapcoords(cr, step, t, ir, wcycle, -+ bRerunMD ? rerun_fr.x : state->x, -+ bRerunMD ? 
rerun_fr.box : state->box, -+ top_global, MASTER(cr) && bVerbose, bRerunMD); -+ -+ if (bNeedRepartition && DOMAINDECOMP(cr)) -+ { -+ dd_collect_state(cr->dd, state, state_global); -+ } -+ } -+ -+ /* Replica exchange */ -+ bExchanged = FALSE; -+ if (bDoReplEx) -+ { -+ bExchanged = replica_exchange(fplog, cr, repl_ex, -+ state_global, enerd, -+ state, step, t); -+ } -+ -+ if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) ) -+ { -+ dd_partition_system(fplog, step, cr, TRUE, 1, -+ state_global, top_global, ir, -+ state, &f, mdatoms, top, fr, -+ vsite, shellfc, constr, -+ nrnb, wcycle, FALSE); -+ } -+ -+ bFirstStep = FALSE; -+ bInitStep = FALSE; -+ bStartingFromCpt = FALSE; -+ -+ /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */ -+ /* With all integrators, except VV, we need to retain the pressure -+ * at the current step for coupling at the next step. -+ */ -+ if ((state->flags & (1<nstpcouple > 0 && step % ir->nstpcouple == 0))) -+ { -+ /* Store the pressure in t_state for pressure coupling -+ * at the next MD step. -+ */ -+ copy_mat(pres, state->pres_prev); -+ } -+ -+ /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */ -+ -+ if ( (membed != NULL) && (!bLastStep) ) -+ { -+ rescale_membed(step_rel, membed, state_global->x); -+ } -+ -+ if (bRerunMD) -+ { -+ if (MASTER(cr)) -+ { -+ /* read next frame from input trajectory */ -+ bNotLastFrame = read_next_frame(oenv, status, &rerun_fr); -+ } -+ -+ if (PAR(cr)) -+ { -+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); -+ } -+ } -+ -+ if (!bRerunMD || !rerun_fr.bStep) -+ { -+ /* increase the MD step number */ -+ step++; -+ step_rel++; -+ } -+ -+ cycles = wallcycle_stop(wcycle, ewcSTEP); -+ if (DOMAINDECOMP(cr) && wcycle) -+ { -+ dd_cycles_add(cr->dd, cycles, ddCyclStep); -+ } -+ -+ if (bPMETuneRunning || bPMETuneTry) -+ { -+ /* PME grid + cut-off optimization with GPUs or PME nodes */ -+ -+ /* Count the total cycles over the last steps */ -+ cycles_pmes += cycles; -+ -+ /* We can only switch cut-off at NS steps */ -+ if (step % ir->nstlist == 0) -+ { -+ /* PME grid + cut-off optimization with GPUs or PME nodes */ -+ if (bPMETuneTry) -+ { -+ if (DDMASTER(cr->dd)) -+ { -+ /* PME node load is too high, start tuning */ -+ bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05); -+ } -+ dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning); -+ -+ if (bPMETuneRunning && -+ fr->nbv->bUseGPU && DOMAINDECOMP(cr) && -+ !(cr->duty & DUTY_PME)) -+ { -+ /* Lock DLB=auto to off (does nothing when DLB=yes/no). -+ * With GPUs + separate PME ranks, we don't want DLB. -+ * This could happen when we scan coarse grids and -+ * it would then never be turned off again. -+ * This would hurt performance at the final, optimal -+ * grid spacing, where DLB almost never helps. -+ * Also, DLB can limit the cut-off for PME tuning. -+ */ -+ dd_dlb_set_lock(cr->dd, TRUE); -+ } -+ -+ if (bPMETuneRunning || step_rel > ir->nstlist*50) -+ { -+ bPMETuneTry = FALSE; -+ } -+ } -+ if (bPMETuneRunning) -+ { -+ /* init_step might not be a multiple of nstlist, -+ * but the first cycle is always skipped anyhow. -+ */ -+ bPMETuneRunning = -+ pme_load_balance(pme_loadbal, cr, -+ (bVerbose && MASTER(cr)) ? 
stderr : NULL, -+ fplog, -+ ir, state, cycles_pmes, -+ fr->ic, fr->nbv, &fr->pmedata, -+ step); -+ -+ /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ -+ fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q; -+ fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; -+ fr->rlist = fr->ic->rlist; -+ fr->rlistlong = fr->ic->rlistlong; -+ fr->rcoulomb = fr->ic->rcoulomb; -+ fr->rvdw = fr->ic->rvdw; -+ -+ if (ir->eDispCorr != edispcNO) -+ { -+ calc_enervirdiff(NULL, ir->eDispCorr, fr); -+ } -+ -+ if (!bPMETuneRunning && -+ DOMAINDECOMP(cr) && -+ dd_dlb_is_locked(cr->dd)) -+ { -+ /* Unlock the DLB=auto, DLB is allowed to activate -+ * (but we don't expect it to activate in most cases). -+ */ -+ dd_dlb_set_lock(cr->dd, FALSE); -+ } -+ } -+ cycles_pmes = 0; -+ } -+ } -+ -+ if (step_rel == wcycle_get_reset_counters(wcycle) || -+ gs.set[eglsRESETCOUNTERS] != 0) -+ { -+ /* Reset all the counters related to performance over the run */ -+ reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting, -+ fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL); -+ wcycle_set_reset_counters(wcycle, -1); -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell our PME node to reset its counters */ -+ gmx_pme_send_resetcounters(cr, step); -+ } -+ /* Correct max_hours for the elapsed time */ -+ max_hours -= elapsed_time/(60.0*60.0); -+ bResetCountersHalfMaxH = FALSE; -+ gs.set[eglsRESETCOUNTERS] = 0; -+ } -+ -+ /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */ -+ IMD_prep_energies_send_positions(ir->bIMD && MASTER(cr), bIMDstep, ir->imd, enerd, step, bCalcEner, wcycle); -+ -+ } -+ /* End of main MD loop */ -+ debug_gmx(); -+ -+ /* Closing TNG files can include compressing data. Therefore it is good to do that -+ * before stopping the time measurements. */ -+ mdoutf_tng_close(outf); -+ -+ /* Stop measuring walltime */ -+ walltime_accounting_end(walltime_accounting); -+ -+ if (bRerunMD && MASTER(cr)) -+ { -+ close_trj(status); -+ } -+ -+ if (!(cr->duty & DUTY_PME)) -+ { -+ /* Tell the PME only node to finish */ -+ gmx_pme_send_finish(cr); -+ } -+ -+ if (MASTER(cr)) -+ { -+ if (ir->nstcalcenergy > 0 && !bRerunMD) -+ { -+ print_ebin(mdoutf_get_fp_ene(outf), FALSE, FALSE, FALSE, fplog, step, t, -+ eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts)); -+ } -+ } -+ -+ done_mdoutf(outf); -+ debug_gmx(); -+ -+ if (ir->nstlist == -1 && nlh.nns > 0 && fplog) -+ { -+ fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns))); -+ fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns); -+ } -+ -+ if (pme_loadbal != NULL) -+ { -+ pme_loadbal_done(pme_loadbal, cr, fplog, -+ fr->nbv != NULL && fr->nbv->bUseGPU); -+ } -+ -+ if (shellfc && fplog) -+ { -+ fprintf(fplog, "Fraction of iterations that converged: %.2f %%\n", -+ (nconverged*100.0)/step_rel); -+ fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n", -+ tcount/step_rel); -+ } -+ -+ if (repl_ex_nst > 0 && MASTER(cr)) -+ { -+ print_replica_exchange_statistics(fplog, repl_ex); -+ } -+ -+ /* IMD cleanup, if bIMD is TRUE. 
*/ -+ IMD_finalize(ir->bIMD, ir->imd); -+ -+ walltime_accounting_set_nsteps_done(walltime_accounting, step_rel); -+ -+ return 0; -+} -diff --git a/src/programs/mdrun/mdrun.cpp b/src/programs/mdrun/mdrun.cpp -index 6bac3f0..e9fbf48 100644 ---- a/src/programs/mdrun/mdrun.cpp -+++ b/src/programs/mdrun/mdrun.cpp -@@ -55,6 +55,12 @@ - - #include "gromacs/commandline/pargs.h" - #include "gromacs/fileio/filenm.h" -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+extern void(*plumedcmd)(plumed,const char*,const void*); -+/* END PLUMED */ - - int gmx_mdrun(int argc, char *argv[]) - { -@@ -428,6 +434,7 @@ int gmx_mdrun(int argc, char *argv[]) - { efMTX, "-mtx", "nm", ffOPTWR }, - { efNDX, "-dn", "dipole", ffOPTWR }, - { efRND, "-multidir", NULL, ffOPTRDMULT}, -+ { efDAT, "-plumed", "plumed", ffOPTRD }, /* PLUMED */ - { efDAT, "-membed", "membed", ffOPTRD }, - { efTOP, "-mp", "membed", ffOPTRD }, - { efNDX, "-mn", "membed", ffOPTRD }, -@@ -780,6 +787,32 @@ int gmx_mdrun(int argc, char *argv[]) - ddxyz[YY] = (int)(realddxyz[YY] + 0.5); - ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); - -+ /* PLUMED */ -+ plumedswitch=0; -+ if (opt2bSet("-plumed",NFILE,fnm)) plumedswitch=1; -+ if(plumedswitch){ -+ plumedcmd=plumed_cmd; -+ int plumed_is_there=0; -+ int real_precision=sizeof(real); -+ real energyUnits=1.0; -+ real lengthUnits=1.0; -+ real timeUnits=1.0; -+ -+ if(!plumed_installed()){ -+ gmx_fatal(FARGS,"Plumed is not available. Check your PLUMED_KERNEL variable."); -+ } -+ plumedmain=plumed_create(); -+ plumed_cmd(plumedmain,"setRealPrecision",&real_precision); -+ // this is not necessary for gromacs units: -+ plumed_cmd(plumedmain,"setMDEnergyUnits",&energyUnits); -+ plumed_cmd(plumedmain,"setMDLengthUnits",&lengthUnits); -+ plumed_cmd(plumedmain,"setMDTimeUnits",&timeUnits); -+ // -+ plumed_cmd(plumedmain,"setPlumedDat",ftp2fn(efDAT,NFILE,fnm)); -+ plumedswitch=1; -+ } -+ /* END PLUMED */ -+ - rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact, - nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr, - dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz, -@@ -788,6 +821,12 @@ int gmx_mdrun(int argc, char *argv[]) - nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed, - pforce, cpt_period, max_hours, deviceOptions, imdport, Flags); - -+ /* PLUMED */ -+ if(plumedswitch){ -+ plumed_finalize(plumedmain); -+ } -+ /* END PLUMED */ -+ - /* Log file has to be closed in mdrunner if we are appending to it - (fplog not set here) */ - if (MASTER(cr) && !bAppendFiles) -diff --git a/src/programs/mdrun/mdrun.cpp.preplumed b/src/programs/mdrun/mdrun.cpp.preplumed -new file mode 100644 -index 0000000..6bac3f0 ---- /dev/null -+++ b/src/programs/mdrun/mdrun.cpp.preplumed -@@ -0,0 +1,799 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. -+ * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. 
-+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#include "mdrun_main.h" -+ -+#ifdef HAVE_CONFIG_H -+#include "config.h" -+#endif -+ -+#include -+ -+#include "gromacs/legacyheaders/checkpoint.h" -+#include "gromacs/legacyheaders/copyrite.h" -+#include "gromacs/legacyheaders/gmx_fatal.h" -+#include "gromacs/legacyheaders/macros.h" -+#include "gromacs/legacyheaders/main.h" -+#include "gromacs/legacyheaders/mdrun.h" -+#include "gromacs/legacyheaders/network.h" -+#include "gromacs/legacyheaders/readinp.h" -+#include "gromacs/legacyheaders/typedefs.h" -+#include "gromacs/legacyheaders/types/commrec.h" -+ -+#include "gromacs/commandline/pargs.h" -+#include "gromacs/fileio/filenm.h" -+ -+int gmx_mdrun(int argc, char *argv[]) -+{ -+ const char *desc[] = { -+ "[THISMODULE] is the main computational chemistry engine", -+ "within GROMACS. Obviously, it performs Molecular Dynamics simulations,", -+ "but it can also perform Stochastic Dynamics, Energy Minimization,", -+ "test particle insertion or (re)calculation of energies.", -+ "Normal mode analysis is another option. 
In this case [TT]mdrun[tt]", -+ "builds a Hessian matrix from single conformation.", -+ "For usual Normal Modes-like calculations, make sure that", -+ "the structure provided is properly energy-minimized.", -+ "The generated matrix can be diagonalized by [gmx-nmeig].[PAR]", -+ "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])", -+ "and distributes the topology over ranks if needed.", -+ "[TT]mdrun[tt] produces at least four output files.", -+ "A single log file ([TT]-g[tt]) is written, unless the option", -+ "[TT]-seppot[tt] is used, in which case each rank writes a log file.", -+ "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and", -+ "optionally forces.", -+ "The structure file ([TT]-c[tt]) contains the coordinates and", -+ "velocities of the last step.", -+ "The energy file ([TT]-e[tt]) contains energies, the temperature,", -+ "pressure, etc, a lot of these things are also printed in the log file.", -+ "Optionally coordinates can be written to a compressed trajectory file", -+ "([TT]-x[tt]).[PAR]", -+ "The option [TT]-dhdl[tt] is only used when free energy calculation is", -+ "turned on.[PAR]", -+ "A simulation can be run in parallel using two different parallelization", -+ "schemes: MPI parallelization and/or OpenMP thread parallelization.", -+ "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is", -+ "compiled with a normal MPI library or threads when [TT]mdrun[tt] is", -+ "compiled with the GROMACS built-in thread-MPI library. OpenMP threads", -+ "are supported when [TT]mdrun[tt] is compiled with OpenMP. Full OpenMP support", -+ "is only available with the Verlet cut-off scheme, with the (older)", -+ "group scheme only PME-only ranks can use OpenMP parallelization.", -+ "In all cases [TT]mdrun[tt] will by default try to use all the available", -+ "hardware resources. With a normal MPI library only the options", -+ "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],", -+ "for PME-only ranks, can be used to control the number of threads.", -+ "With thread-MPI there are additional options [TT]-nt[tt], which sets", -+ "the total number of threads, and [TT]-ntmpi[tt], which sets the number", -+ "of thread-MPI threads.", -+ "The number of OpenMP threads used by [TT]mdrun[tt] can also be set with", -+ "the standard environment variable, [TT]OMP_NUM_THREADS[tt].", -+ "The [TT]GMX_PME_NUM_THREADS[tt] environment variable can be used to specify", -+ "the number of threads used by the PME-only ranks.[PAR]", -+ "Note that combined MPI+OpenMP parallelization is in many cases", -+ "slower than either on its own. However, at high parallelization, using the", -+ "combination is often beneficial as it reduces the number of domains and/or", -+ "the number of MPI ranks. (Less and larger domains can improve scaling,", -+ "with separate PME ranks, using fewer MPI ranks reduces communication costs.)", -+ "OpenMP-only parallelization is typically faster than MPI-only parallelization", -+ "on a single CPU(-die). Since we currently don't have proper hardware", -+ "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only", -+ "automatically use OpenMP-only parallelization when you use up to 4", -+ "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16", -+ "threads with Intel Sandy Bridge or newer CPUs. 
Otherwise MPI-only", -+ "parallelization is used (except with GPUs, see below).", -+ "[PAR]", -+ "To quickly test the performance of the new Verlet cut-off scheme", -+ "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use", -+ "the [TT]-testverlet[tt] option. This should not be used for production,", -+ "since it can slightly modify potentials and it will remove charge groups", -+ "making analysis difficult, as the [TT].tpr[tt] file will still contain", -+ "charge groups. For production simulations it is highly recommended", -+ "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.", -+ "[PAR]", -+ "With GPUs (only supported with the Verlet cut-off scheme), the number", -+ "of GPUs should match the number of particle-particle ranks, i.e.", -+ "excluding PME-only ranks. With thread-MPI, unless set on the command line, the number", -+ "of MPI threads will automatically be set to the number of GPUs detected.", -+ "To use a subset of the available GPUs, or to manually provide a mapping of", -+ "GPUs to PP ranks, you can use the [TT]-gpu_id[tt] option. The argument of [TT]-gpu_id[tt] is", -+ "a string of digits (without delimiter) representing device id-s of the GPUs to be used.", -+ "For example, \"[TT]02[tt]\" specifies using GPUs 0 and 2 in the first and second PP ranks per compute node", -+ "respectively. To select different sets of GPU-s", -+ "on different nodes of a compute cluster, use the [TT]GMX_GPU_ID[tt] environment", -+ "variable instead. The format for [TT]GMX_GPU_ID[tt] is identical to ", -+ "[TT]-gpu_id[tt], with the difference that an environment variable can have", -+ "different values on different compute nodes. Multiple MPI ranks on each node", -+ "can share GPUs. This is accomplished by specifying the id(s) of the GPU(s)", -+ "multiple times, e.g. \"[TT]0011[tt]\" for four ranks sharing two GPUs in this node.", -+ "This works within a single simulation, or a multi-simulation, with any form of MPI.", -+ "[PAR]", -+ "With the Verlet cut-off scheme and verlet-buffer-tolerance set,", -+ "the pair-list update interval nstlist can be chosen freely with", -+ "the option [TT]-nstlist[tt]. [TT]mdrun[tt] will then adjust", -+ "the pair-list cut-off to maintain accuracy, and not adjust nstlist.", -+ "Otherwise, by default, [TT]mdrun[tt] will try to increase the", -+ "value of nstlist set in the [TT].mdp[tt] file to improve the", -+ "performance. For CPU-only runs, nstlist might increase to 20, for", -+ "GPU runs up to 40. For medium to high parallelization or with", -+ "fast GPUs, a (user-supplied) larger nstlist value can give much", -+ "better performance.", -+ "[PAR]", -+ "When using PME with separate PME ranks or with a GPU, the two major", -+ "compute tasks, the non-bonded force calculation and the PME calculation", -+ "run on different compute resources. If this load is not balanced,", -+ "some of the resources will be idle part of time. With the Verlet", -+ "cut-off scheme this load is automatically balanced when the PME load", -+ "is too high (but not when it is too low). This is done by scaling", -+ "the Coulomb cut-off and PME grid spacing by the same amount. In the first", -+ "few hundred steps different settings are tried and the fastest is chosen", -+ "for the rest of the simulation. This does not affect the accuracy of", -+ "the results, but it does affect the decomposition of the Coulomb energy", -+ "into particle and mesh contributions. 
The auto-tuning can be turned off", -+ "with the option [TT]-notunepme[tt].", -+ "[PAR]", -+ "[TT]mdrun[tt] pins (sets affinity of) threads to specific cores,", -+ "when all (logical) cores on a compute node are used by [TT]mdrun[tt],", -+ "even when no multi-threading is used,", -+ "as this usually results in significantly better performance.", -+ "If the queuing systems or the OpenMP library pinned threads, we honor", -+ "this and don't pin again, even though the layout may be sub-optimal.", -+ "If you want to have [TT]mdrun[tt] override an already set thread affinity", -+ "or pin threads when using less cores, use [TT]-pin on[tt].", -+ "With SMT (simultaneous multithreading), e.g. Intel Hyper-Threading,", -+ "there are multiple logical cores per physical core.", -+ "The option [TT]-pinstride[tt] sets the stride in logical cores for", -+ "pinning consecutive threads. Without SMT, 1 is usually the best choice.", -+ "With Intel Hyper-Threading 2 is best when using half or less of the", -+ "logical cores, 1 otherwise. The default value of 0 do exactly that:", -+ "it minimizes the threads per logical core, to optimize performance.", -+ "If you want to run multiple [TT]mdrun[tt] jobs on the same physical node," -+ "you should set [TT]-pinstride[tt] to 1 when using all logical cores.", -+ "When running multiple [TT]mdrun[tt] (or other) simulations on the same physical", -+ "node, some simulations need to start pinning from a non-zero core", -+ "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify", -+ "the offset in logical cores for pinning.", -+ "[PAR]", -+ "When [TT]mdrun[tt] is started with more than 1 rank,", -+ "parallelization with domain decomposition is used.", -+ "[PAR]", -+ "With domain decomposition, the spatial decomposition can be set", -+ "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.", -+ "The user only needs to change this when the system is very inhomogeneous.", -+ "Dynamic load balancing is set with the option [TT]-dlb[tt],", -+ "which can give a significant performance improvement,", -+ "especially for inhomogeneous systems. The only disadvantage of", -+ "dynamic load balancing is that runs are no longer binary reproducible,", -+ "but in most cases this is not important.", -+ "By default the dynamic load balancing is automatically turned on", -+ "when the measured performance loss due to load imbalance is 5% or more.", -+ "At low parallelization these are the only important options", -+ "for domain decomposition.", -+ "At high parallelization the options in the next two sections", -+ "could be important for increasing the performace.", -+ "[PAR]", -+ "When PME is used with domain decomposition, separate ranks can", -+ "be assigned to do only the PME mesh calculation;", -+ "this is computationally more efficient starting at about 12 ranks,", -+ "or even fewer when OpenMP parallelization is used.", -+ "The number of PME ranks is set with option [TT]-npme[tt],", -+ "but this cannot be more than half of the ranks.", -+ "By default [TT]mdrun[tt] makes a guess for the number of PME", -+ "ranks when the number of ranks is larger than 16. With GPUs,", -+ "using separate PME ranks is not selected automatically,", -+ "since the optimal setup depends very much on the details", -+ "of the hardware. In all cases, you might gain performance", -+ "by optimizing [TT]-npme[tt]. 
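/*
 * A minimal sketch (not from the GROMACS sources) of the core selection
 * implied by -pinoffset and -pinstride: thread t is pinned to logical core
 * offset + t*stride, and a stride of 0 is read as "spread the threads as
 * widely as possible over the logical cores", which minimizes threads per
 * physical core.  pin_core_for_thread() is a hypothetical helper name.
 */
#include <stdio.h>

static int pin_core_for_thread(int thread, int offset, int stride,
                               int nlogical, int nthreads)
{
    if (stride == 0)
    {
        stride = nlogical / nthreads;   /* default: spread over all logical cores */
        if (stride < 1)
        {
            stride = 1;
        }
    }
    return (offset + thread*stride) % nlogical;
}

int main(void)
{
    int t;

    /* e.g. 8 threads on a node with 16 logical cores, default stride */
    for (t = 0; t < 8; t++)
    {
        printf("thread %d -> logical core %d\n",
               t, pin_core_for_thread(t, 0, 0, 16, 8));
    }
    return 0;
}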
Performance statistics on this issue", -+ "are written at the end of the log file.", -+ "For good load balancing at high parallelization, the PME grid x and y", -+ "dimensions should be divisible by the number of PME ranks", -+ "(the simulation will run correctly also when this is not the case).", -+ "[PAR]", -+ "This section lists all options that affect the domain decomposition.", -+ "[PAR]", -+ "Option [TT]-rdd[tt] can be used to set the required maximum distance", -+ "for inter charge-group bonded interactions.", -+ "Communication for two-body bonded interactions below the non-bonded", -+ "cut-off distance always comes for free with the non-bonded communication.", -+ "Atoms beyond the non-bonded cut-off are only communicated when they have", -+ "missing bonded interactions; this means that the extra cost is minor", -+ "and nearly indepedent of the value of [TT]-rdd[tt].", -+ "With dynamic load balancing option [TT]-rdd[tt] also sets", -+ "the lower limit for the domain decomposition cell sizes.", -+ "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on", -+ "the initial coordinates. The chosen value will be a balance", -+ "between interaction range and communication cost.", -+ "[PAR]", -+ "When inter charge-group bonded interactions are beyond", -+ "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.", -+ "For pair interactions and tabulated bonds", -+ "that do not generate exclusions, this check can be turned off", -+ "with the option [TT]-noddcheck[tt].", -+ "[PAR]", -+ "When constraints are present, option [TT]-rcon[tt] influences", -+ "the cell size limit as well.", -+ "Atoms connected by NC constraints, where NC is the LINCS order plus 1,", -+ "should not be beyond the smallest cell size. A error message is", -+ "generated when this happens and the user should change the decomposition", -+ "or decrease the LINCS order and increase the number of LINCS iterations.", -+ "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS", -+ "in a conservative fashion. For high parallelization it can be useful", -+ "to set the distance required for P-LINCS with the option [TT]-rcon[tt].", -+ "[PAR]", -+ "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling", -+ "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that", -+ "the cells can scale down by at least this factor. This option is used", -+ "for the automated spatial decomposition (when not using [TT]-dd[tt])", -+ "as well as for determining the number of grid pulses, which in turn", -+ "sets the minimum allowed cell size. Under certain circumstances", -+ "the value of [TT]-dds[tt] might need to be adjusted to account for", -+ "high or low spatial inhomogeneity of the system.", -+ "[PAR]", -+ "The option [TT]-gcom[tt] can be used to only do global communication", -+ "every n steps.", -+ "This can improve performance for highly parallel simulations", -+ "where this global communication step becomes the bottleneck.", -+ "For a global thermostat and/or barostat the temperature", -+ "and/or pressure will also only be updated every [TT]-gcom[tt] steps.", -+ "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]", -+ "With [TT]-rerun[tt] an input trajectory can be given for which ", -+ "forces and energies will be (re)calculated. 
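/*
 * A minimal sketch (not from the GROMACS sources) of the default choice
 * for the -gcom interval described above: when the user does not set it,
 * it falls back to the smaller of nstcalcenergy and nstlist.
 * default_nstglobalcomm() is a hypothetical helper name and omits the
 * extra consistency checks the real code performs.
 */
#include <stdio.h>

static int default_nstglobalcomm(int nstglobalcomm, int nstcalcenergy, int nstlist)
{
    if (nstglobalcomm > 0)
    {
        return nstglobalcomm;      /* user-supplied value wins */
    }
    return (nstcalcenergy < nstlist) ? nstcalcenergy : nstlist;
}

int main(void)
{
    /* e.g. nstcalcenergy=100, nstlist=10: global communication every 10 steps */
    printf("nstglobalcomm = %d\n", default_nstglobalcomm(-1, 100, 10));
    return 0;
}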
Neighbor searching will be", -+ "performed for every frame, unless [TT]nstlist[tt] is zero", -+ "(see the [TT].mdp[tt] file).[PAR]", -+ "ED (essential dynamics) sampling and/or additional flooding potentials", -+ "are switched on by using the [TT]-ei[tt] flag followed by an [TT].edi[tt]", -+ "file. The [TT].edi[tt] file can be produced with the [TT]make_edi[tt] tool", -+ "or by using options in the essdyn menu of the WHAT IF program.", -+ "[TT]mdrun[tt] produces a [TT].xvg[tt] output file that", -+ "contains projections of positions, velocities and forces onto selected", -+ "eigenvectors.[PAR]", -+ "When user-defined potential functions have been selected in the", -+ "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]", -+ "a formatted table with potential functions. The file is read from", -+ "either the current directory or from the [TT]GMXLIB[tt] directory.", -+ "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,", -+ "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with", -+ "normal Coulomb.", -+ "When pair interactions are present, a separate table for pair interaction", -+ "functions is read using the [TT]-tablep[tt] option.[PAR]", -+ "When tabulated bonded functions are present in the topology,", -+ "interaction functions are read using the [TT]-tableb[tt] option.", -+ "For each different tabulated interaction type the table file name is", -+ "modified in a different way: before the file extension an underscore is", -+ "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals", -+ "and finally the table number of the interaction type.[PAR]", -+ "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM", -+ "coordinates and forces when pulling is selected", -+ "in the [TT].mdp[tt] file.[PAR]", -+ "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ", -+ "simulated in parallel.", -+ "As many input files/directories are required as the number of systems. ", -+ "The [TT]-multidir[tt] option takes a list of directories (one for each ", -+ "system) and runs in each of them, using the input/output file names, ", -+ "such as specified by e.g. the [TT]-s[tt] option, relative to these ", -+ "directories.", -+ "With [TT]-multi[tt], the system number is appended to the run input ", -+ "and each output filename, for instance [TT]topol.tpr[tt] becomes", -+ "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.", -+ "The number of ranks per system is the total number of ranks", -+ "divided by the number of systems.", -+ "One use of this option is for NMR refinement: when distance", -+ "or orientation restraints are present these can be ensemble averaged", -+ "over all the systems.[PAR]", -+ "With [TT]-replex[tt] replica exchange is attempted every given number", -+ "of steps. The number of replicas is set with the [TT]-multi[tt] or ", -+ "[TT]-multidir[tt] option, described above.", -+ "All run input files should use a different coupling temperature,", -+ "the order of the files is not important. The random seed is set with", -+ "[TT]-reseed[tt]. The velocities are scaled and neighbor searching", -+ "is performed after every exchange.[PAR]", -+ "Finally some experimental algorithms can be tested when the", -+ "appropriate options have been given. Currently under", -+ "investigation are: polarizability.", -+ "[PAR]", -+ "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed", -+ "a protein into a membrane. The data file should contain the options", -+ "that where passed to g_membed before. 
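/*
 * A minimal sketch (not from the GROMACS sources) of the file-name
 * convention described above for tabulated bonded interactions: an
 * underscore, a type letter ('b' for bonds, 'a' for angles, 'd' for
 * dihedrals) and the table number are inserted before the extension.
 * make_table_name() is a hypothetical helper name.
 */
#include <stdio.h>
#include <string.h>

static void make_table_name(const char *base, char type, int number,
                            char *out, size_t outlen)
{
    const char *dot  = strrchr(base, '.');
    size_t      stem = (dot != NULL) ? (size_t)(dot - base) : strlen(base);

    /* copy the stem, append _<type><number>, then re-attach the extension */
    snprintf(out, outlen, "%.*s_%c%d%s",
             (int)stem, base, type, number, (dot != NULL) ? dot : "");
}

int main(void)
{
    char name[128];

    make_table_name("table.xvg", 'd', 3, name, sizeof(name));
    printf("%s\n", name);   /* -> table_d3.xvg */
    return 0;
}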
The [TT]-mn[tt] and [TT]-mp[tt]", -+ "both apply to this as well.", -+ "[PAR]", -+ "The option [TT]-pforce[tt] is useful when you suspect a simulation", -+ "crashes due to too large forces. With this option coordinates and", -+ "forces of atoms with a force larger than a certain value will", -+ "be printed to stderr.", -+ "[PAR]", -+ "Checkpoints containing the complete state of the system are written", -+ "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],", -+ "unless option [TT]-cpt[tt] is set to -1.", -+ "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to", -+ "make sure that a recent state of the system is always available,", -+ "even when the simulation is terminated while writing a checkpoint.", -+ "With [TT]-cpnum[tt] all checkpoint files are kept and appended", -+ "with the step number.", -+ "A simulation can be continued by reading the full state from file", -+ "with option [TT]-cpi[tt]. This option is intelligent in the way that", -+ "if no checkpoint file is found, Gromacs just assumes a normal run and", -+ "starts from the first step of the [TT].tpr[tt] file. By default the output", -+ "will be appending to the existing output files. The checkpoint file", -+ "contains checksums of all output files, such that you will never", -+ "loose data when some output files are modified, corrupt or removed.", -+ "There are three scenarios with [TT]-cpi[tt]:[PAR]", -+ "[TT]*[tt] no files with matching names are present: new output files are written[PAR]", -+ "[TT]*[tt] all files are present with names and checksums matching those stored", -+ "in the checkpoint file: files are appended[PAR]", -+ "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]", -+ "With [TT]-noappend[tt] new output files are opened and the simulation", -+ "part number is added to all output file names.", -+ "Note that in all cases the checkpoint file itself is not renamed", -+ "and will be overwritten, unless its name does not match", -+ "the [TT]-cpo[tt] option.", -+ "[PAR]", -+ "With checkpointing the output is appended to previously written", -+ "output files, unless [TT]-noappend[tt] is used or none of the previous", -+ "output files are present (except for the checkpoint file).", -+ "The integrity of the files to be appended is verified using checksums", -+ "which are stored in the checkpoint file. This ensures that output can", -+ "not be mixed up or corrupted due to file appending. When only some", -+ "of the previous output files are present, a fatal error is generated", -+ "and no old output files are modified and no new output files are opened.", -+ "The result with appending will be the same as from a single run.", -+ "The contents will be binary identical, unless you use a different number", -+ "of ranks or dynamic load balancing or the FFT library uses optimizations", -+ "through timing.", -+ "[PAR]", -+ "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint", -+ "file is written at the first neighbor search step where the run time", -+ "exceeds [TT]-maxh[tt]*0.99 hours.", -+ "[PAR]", -+ "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current", -+ "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. 
when ctrl+C is", -+ "pressed), it will stop after the next neighbor search step ", -+ "(with nstlist=0 at the next step).", -+ "In both cases all the usual output will be written to file.", -+ "When running with MPI, a signal to one of the [TT]mdrun[tt] ranks", -+ "is sufficient, this signal should not be sent to mpirun or", -+ "the [TT]mdrun[tt] process that is the parent of the others.", -+ "[PAR]", -+ "Interactive molecular dynamics (IMD) can be activated by using at least one", -+ "of the three IMD switches: The [TT]-imdterm[tt] switch allows to terminate the", -+ "simulation from the molecular viewer (e.g. VMD). With [TT]-imdwait[tt],", -+ "[TT]mdrun[tt] pauses whenever no IMD client is connected. Pulling from the", -+ "IMD remote can be turned on by [TT]-imdpull[tt].", -+ "The port [TT]mdrun[tt] listens to can be altered by [TT]-imdport[tt].The", -+ "file pointed to by [TT]-if[tt] contains atom indices and forces if IMD", -+ "pulling is used." -+ "[PAR]", -+ "When [TT]mdrun[tt] is started with MPI, it does not run niced by default." -+ }; -+ t_commrec *cr; -+ t_filenm fnm[] = { -+ { efTPX, NULL, NULL, ffREAD }, -+ { efTRN, "-o", NULL, ffWRITE }, -+ { efCOMPRESSED, "-x", NULL, ffOPTWR }, -+ { efCPT, "-cpi", NULL, ffOPTRD }, -+ { efCPT, "-cpo", NULL, ffOPTWR }, -+ { efSTO, "-c", "confout", ffWRITE }, -+ { efEDR, "-e", "ener", ffWRITE }, -+ { efLOG, "-g", "md", ffWRITE }, -+ { efXVG, "-dhdl", "dhdl", ffOPTWR }, -+ { efXVG, "-field", "field", ffOPTWR }, -+ { efXVG, "-table", "table", ffOPTRD }, -+ { efXVG, "-tabletf", "tabletf", ffOPTRD }, -+ { efXVG, "-tablep", "tablep", ffOPTRD }, -+ { efXVG, "-tableb", "table", ffOPTRD }, -+ { efTRX, "-rerun", "rerun", ffOPTRD }, -+ { efXVG, "-tpi", "tpi", ffOPTWR }, -+ { efXVG, "-tpid", "tpidist", ffOPTWR }, -+ { efEDI, "-ei", "sam", ffOPTRD }, -+ { efXVG, "-eo", "edsam", ffOPTWR }, -+ { efXVG, "-devout", "deviatie", ffOPTWR }, -+ { efXVG, "-runav", "runaver", ffOPTWR }, -+ { efXVG, "-px", "pullx", ffOPTWR }, -+ { efXVG, "-pf", "pullf", ffOPTWR }, -+ { efXVG, "-ro", "rotation", ffOPTWR }, -+ { efLOG, "-ra", "rotangles", ffOPTWR }, -+ { efLOG, "-rs", "rotslabs", ffOPTWR }, -+ { efLOG, "-rt", "rottorque", ffOPTWR }, -+ { efMTX, "-mtx", "nm", ffOPTWR }, -+ { efNDX, "-dn", "dipole", ffOPTWR }, -+ { efRND, "-multidir", NULL, ffOPTRDMULT}, -+ { efDAT, "-membed", "membed", ffOPTRD }, -+ { efTOP, "-mp", "membed", ffOPTRD }, -+ { efNDX, "-mn", "membed", ffOPTRD }, -+ { efXVG, "-if", "imdforces", ffOPTWR }, -+ { efXVG, "-swap", "swapions", ffOPTWR } -+ }; -+#define NFILE asize(fnm) -+ -+ /* Command line options ! 
*/ -+ gmx_bool bDDBondCheck = TRUE; -+ gmx_bool bDDBondComm = TRUE; -+ gmx_bool bTunePME = TRUE; -+ gmx_bool bTestVerlet = FALSE; -+ gmx_bool bVerbose = FALSE; -+ gmx_bool bCompact = TRUE; -+ gmx_bool bSepPot = FALSE; -+ gmx_bool bRerunVSite = FALSE; -+ gmx_bool bConfout = TRUE; -+ gmx_bool bReproducible = FALSE; -+ gmx_bool bIMDwait = FALSE; -+ gmx_bool bIMDterm = FALSE; -+ gmx_bool bIMDpull = FALSE; -+ -+ int npme = -1; -+ int nstlist = 0; -+ int nmultisim = 0; -+ int nstglobalcomm = -1; -+ int repl_ex_nst = 0; -+ int repl_ex_seed = -1; -+ int repl_ex_nex = 0; -+ int nstepout = 100; -+ int resetstep = -1; -+ gmx_int64_t nsteps = -2; /* the value -2 means that the mdp option will be used */ -+ int imdport = 8888; /* can be almost anything, 8888 is easy to remember */ -+ -+ rvec realddxyz = {0, 0, 0}; -+ const char *ddno_opt[ddnoNR+1] = -+ { NULL, "interleave", "pp_pme", "cartesian", NULL }; -+ const char *dddlb_opt[] = -+ { NULL, "auto", "no", "yes", NULL }; -+ const char *thread_aff_opt[threadaffNR+1] = -+ { NULL, "auto", "on", "off", NULL }; -+ const char *nbpu_opt[] = -+ { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL }; -+ real rdd = 0.0, rconstr = 0.0, dlb_scale = 0.8, pforce = -1; -+ char *ddcsx = NULL, *ddcsy = NULL, *ddcsz = NULL; -+ real cpt_period = 15.0, max_hours = -1; -+ gmx_bool bAppendFiles = TRUE; -+ gmx_bool bKeepAndNumCPT = FALSE; -+ gmx_bool bResetCountersHalfWay = FALSE; -+ output_env_t oenv = NULL; -+ const char *deviceOptions = ""; -+ -+ /* Non transparent initialization of a complex gmx_hw_opt_t struct. -+ * But unfortunately we are not allowed to call a function here, -+ * since declarations follow below. -+ */ -+ gmx_hw_opt_t hw_opt = { -+ 0, 0, 0, 0, threadaffSEL, 0, 0, -+ { NULL, FALSE, 0, NULL } -+ }; -+ -+ t_pargs pa[] = { -+ -+ { "-dd", FALSE, etRVEC, {&realddxyz}, -+ "Domain decomposition grid, 0 is optimize" }, -+ { "-ddorder", FALSE, etENUM, {ddno_opt}, -+ "DD rank order" }, -+ { "-npme", FALSE, etINT, {&npme}, -+ "Number of separate ranks to be used for PME, -1 is guess" }, -+ { "-nt", FALSE, etINT, {&hw_opt.nthreads_tot}, -+ "Total number of threads to start (0 is guess)" }, -+ { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, -+ "Number of thread-MPI threads to start (0 is guess)" }, -+ { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, -+ "Number of OpenMP threads per MPI rank to start (0 is guess)" }, -+ { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, -+ "Number of OpenMP threads per MPI rank to start (0 is -ntomp)" }, -+ { "-pin", FALSE, etENUM, {thread_aff_opt}, -+ "Set thread affinities" }, -+ { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset}, -+ "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" }, -+ { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, -+ "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, -+ { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id}, -+ "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" }, -+ { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck}, -+ "Check for all bonded interactions with DD" }, -+ { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm}, -+ "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" }, -+ { "-rdd", FALSE, etREAL, {&rdd}, -+ "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" }, -+ { "-rcon", FALSE, etREAL, {&rconstr}, -+ "Maximum distance for 
P-LINCS (nm), 0 is estimate" }, -+ { "-dlb", FALSE, etENUM, {dddlb_opt}, -+ "Dynamic load balancing (with DD)" }, -+ { "-dds", FALSE, etREAL, {&dlb_scale}, -+ "Fraction in (0,1) by whose reciprocal the initial DD cell size will be increased in order to " -+ "provide a margin in which dynamic load balancing can act while preserving the minimum cell size." }, -+ { "-ddcsx", FALSE, etSTR, {&ddcsx}, -+ "HIDDENA string containing a vector of the relative sizes in the x " -+ "direction of the corresponding DD cells. Only effective with static " -+ "load balancing." }, -+ { "-ddcsy", FALSE, etSTR, {&ddcsy}, -+ "HIDDENA string containing a vector of the relative sizes in the y " -+ "direction of the corresponding DD cells. Only effective with static " -+ "load balancing." }, -+ { "-ddcsz", FALSE, etSTR, {&ddcsz}, -+ "HIDDENA string containing a vector of the relative sizes in the z " -+ "direction of the corresponding DD cells. Only effective with static " -+ "load balancing." }, -+ { "-gcom", FALSE, etINT, {&nstglobalcomm}, -+ "Global communication frequency" }, -+ { "-nb", FALSE, etENUM, {&nbpu_opt}, -+ "Calculate non-bonded interactions on" }, -+ { "-nstlist", FALSE, etINT, {&nstlist}, -+ "Set nstlist when using a Verlet buffer tolerance (0 is guess)" }, -+ { "-tunepme", FALSE, etBOOL, {&bTunePME}, -+ "Optimize PME load between PP/PME ranks or GPU/CPU" }, -+ { "-testverlet", FALSE, etBOOL, {&bTestVerlet}, -+ "Test the Verlet non-bonded scheme" }, -+ { "-v", FALSE, etBOOL, {&bVerbose}, -+ "Be loud and noisy" }, -+ { "-compact", FALSE, etBOOL, {&bCompact}, -+ "Write a compact log file" }, -+ { "-seppot", FALSE, etBOOL, {&bSepPot}, -+ "Write separate V and dVdl terms for each interaction type and rank to the log file(s)" }, -+ { "-pforce", FALSE, etREAL, {&pforce}, -+ "Print all forces larger than this (kJ/mol nm)" }, -+ { "-reprod", FALSE, etBOOL, {&bReproducible}, -+ "Try to avoid optimizations that affect binary reproducibility" }, -+ { "-cpt", FALSE, etREAL, {&cpt_period}, -+ "Checkpoint interval (minutes)" }, -+ { "-cpnum", FALSE, etBOOL, {&bKeepAndNumCPT}, -+ "Keep and number checkpoint files" }, -+ { "-append", FALSE, etBOOL, {&bAppendFiles}, -+ "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" }, -+ { "-nsteps", FALSE, etINT64, {&nsteps}, -+ "Run this number of steps, overrides .mdp file option" }, -+ { "-maxh", FALSE, etREAL, {&max_hours}, -+ "Terminate after 0.99 times this time (hours)" }, -+ { "-multi", FALSE, etINT, {&nmultisim}, -+ "Do multiple simulations in parallel" }, -+ { "-replex", FALSE, etINT, {&repl_ex_nst}, -+ "Attempt replica exchange periodically with this period (steps)" }, -+ { "-nex", FALSE, etINT, {&repl_ex_nex}, -+ "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). -nex zero or not specified gives neighbor replica exchange." 
}, -+ { "-reseed", FALSE, etINT, {&repl_ex_seed}, -+ "Seed for replica exchange, -1 is generate a seed" }, -+ { "-imdport", FALSE, etINT, {&imdport}, -+ "HIDDENIMD listening port" }, -+ { "-imdwait", FALSE, etBOOL, {&bIMDwait}, -+ "HIDDENPause the simulation while no IMD client is connected" }, -+ { "-imdterm", FALSE, etBOOL, {&bIMDterm}, -+ "HIDDENAllow termination of the simulation from IMD client" }, -+ { "-imdpull", FALSE, etBOOL, {&bIMDpull}, -+ "HIDDENAllow pulling in the simulation from IMD client" }, -+ { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite}, -+ "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" }, -+ { "-confout", FALSE, etBOOL, {&bConfout}, -+ "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" }, -+ { "-stepout", FALSE, etINT, {&nstepout}, -+ "HIDDENFrequency of writing the remaining wall clock time for the run" }, -+ { "-resetstep", FALSE, etINT, {&resetstep}, -+ "HIDDENReset cycle counters after these many time steps" }, -+ { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay}, -+ "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" } -+ }; -+ unsigned long Flags, PCA_Flags; -+ ivec ddxyz; -+ int dd_node_order; -+ gmx_bool bAddPart; -+ FILE *fplog, *fpmulti; -+ int sim_part, sim_part_fn; -+ const char *part_suffix = ".part"; -+ char suffix[STRLEN]; -+ int rc; -+ char **multidir = NULL; -+ -+ -+ cr = init_commrec(); -+ -+ PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET)); -+ -+ /* Comment this in to do fexist calls only on master -+ * works not with rerun or tables at the moment -+ * also comment out the version of init_forcerec in md.c -+ * with NULL instead of opt2fn -+ */ -+ /* -+ if (!MASTER(cr)) -+ { -+ PCA_Flags |= PCA_NOT_READ_NODE; -+ } -+ */ -+ -+ if (!parse_common_args(&argc, argv, PCA_Flags, NFILE, fnm, asize(pa), pa, -+ asize(desc), desc, 0, NULL, &oenv)) -+ { -+ return 0; -+ } -+ -+ -+ /* we set these early because they might be used in init_multisystem() -+ Note that there is the potential for npme>nnodes until the number of -+ threads is set later on, if there's thread parallelization. That shouldn't -+ lead to problems. */ -+ dd_node_order = nenum(ddno_opt); -+ cr->npmenodes = npme; -+ -+ hw_opt.thread_affinity = nenum(thread_aff_opt); -+ -+ /* now check the -multi and -multidir option */ -+ if (opt2bSet("-multidir", NFILE, fnm)) -+ { -+ if (nmultisim > 0) -+ { -+ gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive."); -+ } -+ nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm); -+ } -+ -+ -+ if (repl_ex_nst != 0 && nmultisim < 2) -+ { -+ gmx_fatal(FARGS, "Need at least two replicas for replica exchange (option -multi)"); -+ } -+ -+ if (repl_ex_nex < 0) -+ { -+ gmx_fatal(FARGS, "Replica exchange number of exchanges needs to be positive"); -+ } -+ -+ if (nmultisim > 1) -+ { -+#ifndef GMX_THREAD_MPI -+ gmx_bool bParFn = (multidir == NULL); -+ init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn); -+#else -+ gmx_fatal(FARGS, "mdrun -multi is not supported with the thread library. 
" -+ "Please compile GROMACS with MPI support"); -+#endif -+ } -+ -+ bAddPart = !bAppendFiles; -+ -+ /* Check if there is ANY checkpoint file available */ -+ sim_part = 1; -+ sim_part_fn = sim_part; -+ if (opt2bSet("-cpi", NFILE, fnm)) -+ { -+ if (bSepPot && bAppendFiles) -+ { -+ gmx_fatal(FARGS, "Output file appending is not supported with -seppot"); -+ } -+ -+ bAppendFiles = -+ read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE, -+ fnm, cr), -+ &sim_part_fn, NULL, cr, -+ bAppendFiles, NFILE, fnm, -+ part_suffix, &bAddPart); -+ if (sim_part_fn == 0 && MULTIMASTER(cr)) -+ { -+ fprintf(stdout, "No previous checkpoint file present, assuming this is a new run.\n"); -+ } -+ else -+ { -+ sim_part = sim_part_fn + 1; -+ } -+ -+ if (MULTISIM(cr) && MASTER(cr)) -+ { -+ if (MULTIMASTER(cr)) -+ { -+ /* Log file is not yet available, so if there's a -+ * problem we can only write to stderr. */ -+ fpmulti = stderr; -+ } -+ else -+ { -+ fpmulti = NULL; -+ } -+ check_multi_int(fpmulti, cr->ms, sim_part, "simulation part", TRUE); -+ } -+ } -+ else -+ { -+ bAppendFiles = FALSE; -+ } -+ -+ if (!bAppendFiles) -+ { -+ sim_part_fn = sim_part; -+ } -+ -+ if (bAddPart) -+ { -+ /* Rename all output files (except checkpoint files) */ -+ /* create new part name first (zero-filled) */ -+ sprintf(suffix, "%s%04d", part_suffix, sim_part_fn); -+ -+ add_suffix_to_output_names(fnm, NFILE, suffix); -+ if (MULTIMASTER(cr)) -+ { -+ fprintf(stdout, "Checkpoint file is from part %d, new output files will be suffixed '%s'.\n", sim_part-1, suffix); -+ } -+ } -+ -+ Flags = opt2bSet("-rerun", NFILE, fnm) ? MD_RERUN : 0; -+ Flags = Flags | (bSepPot ? MD_SEPPOT : 0); -+ Flags = Flags | (bDDBondCheck ? MD_DDBONDCHECK : 0); -+ Flags = Flags | (bDDBondComm ? MD_DDBONDCOMM : 0); -+ Flags = Flags | (bTunePME ? MD_TUNEPME : 0); -+ Flags = Flags | (bTestVerlet ? MD_TESTVERLET : 0); -+ Flags = Flags | (bConfout ? MD_CONFOUT : 0); -+ Flags = Flags | (bRerunVSite ? MD_RERUN_VSITE : 0); -+ Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0); -+ Flags = Flags | (bAppendFiles ? MD_APPENDFILES : 0); -+ Flags = Flags | (opt2parg_bSet("-append", asize(pa), pa) ? MD_APPENDFILESSET : 0); -+ Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0); -+ Flags = Flags | (sim_part > 1 ? MD_STARTFROMCPT : 0); -+ Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0); -+ Flags = Flags | (bIMDwait ? MD_IMDWAIT : 0); -+ Flags = Flags | (bIMDterm ? MD_IMDTERM : 0); -+ Flags = Flags | (bIMDpull ? MD_IMDPULL : 0); -+ -+ /* We postpone opening the log file if we are appending, so we can -+ first truncate the old log file and append to the correct position -+ there instead. 
*/ -+ if ((MASTER(cr) || bSepPot) && !bAppendFiles) -+ { -+ gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, -+ !bSepPot, Flags & MD_APPENDFILES, &fplog); -+ please_cite(fplog, "Hess2008b"); -+ please_cite(fplog, "Spoel2005a"); -+ please_cite(fplog, "Lindahl2001a"); -+ please_cite(fplog, "Berendsen95a"); -+ } -+ else if (!MASTER(cr) && bSepPot) -+ { -+ gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr, !bSepPot, Flags, &fplog); -+ } -+ else -+ { -+ fplog = NULL; -+ } -+ -+ ddxyz[XX] = (int)(realddxyz[XX] + 0.5); -+ ddxyz[YY] = (int)(realddxyz[YY] + 0.5); -+ ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); -+ -+ rc = mdrunner(&hw_opt, fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact, -+ nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr, -+ dddlb_opt[0], dlb_scale, ddcsx, ddcsy, ddcsz, -+ nbpu_opt[0], nstlist, -+ nsteps, nstepout, resetstep, -+ nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed, -+ pforce, cpt_period, max_hours, deviceOptions, imdport, Flags); -+ -+ /* Log file has to be closed in mdrunner if we are appending to it -+ (fplog not set here) */ -+ if (MASTER(cr) && !bAppendFiles) -+ { -+ gmx_log_close(fplog); -+ } -+ -+ return rc; -+} -diff --git a/src/programs/mdrun/repl_ex.c b/src/programs/mdrun/repl_ex.c -index 46a9bc0..cfb0b7f 100644 ---- a/src/programs/mdrun/repl_ex.c -+++ b/src/programs/mdrun/repl_ex.c -@@ -51,6 +51,12 @@ - #include "domdec.h" - #include "gromacs/random/random.h" - -+/* PLUMED */ -+#include "../../../Plumed.h" -+extern int plumedswitch; -+extern plumed plumedmain; -+/* END PLUMED */ -+ - #define PROBABILITYCUTOFF 100 - /* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ - -@@ -112,14 +118,16 @@ static gmx_bool repl_quantity(const gmx_multisim_t *ms, - qall[re->repl] = q; - gmx_sum_sim(ms->nsim, qall, ms); - -- bDiff = FALSE; -- for (s = 1; s < ms->nsim; s++) -- { -- if (qall[s] != qall[0]) -- { -+ /* PLUMED */ -+ //bDiff = FALSE; -+ //for (s = 1; s < ms->nsim; s++) -+ //{ -+ // if (qall[s] != qall[0]) -+ // { - bDiff = TRUE; -- } -- } -+ // } -+ //} -+ /* END PLUMED */ - - if (bDiff) - { -@@ -269,6 +277,10 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - re->ind[i] = i; - } - -+ /* PLUMED */ -+ // plumed2: check if we want alternative patterns (i.e. for bias-exchange metaD) -+ // in those cases replicas can share the same temperature. -+ /* - if (re->type < ereENDSINGLE) - { - -@@ -277,11 +289,12 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - for (j = i+1; j < re->nrepl; j++) - { - if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) -- { -+ {*/ - /* Unordered replicas are supposed to work, but there - * is still an issues somewhere. - * Note that at this point still re->ind[i]=i. - */ -+ /* - gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", - i, j, - erename[re->type], -@@ -299,6 +312,8 @@ gmx_repl_ex_t init_replica_exchange(FILE *fplog, - } - } - } -+ */ -+ /* END PLUMED */ - - /* keep track of all the swaps, starting with the initial placement. 
*/ - snew(re->allswaps, re->nrepl); -@@ -982,6 +997,10 @@ test_for_replica_exchange(FILE *fplog, - pind[i] = re->ind[i]; - } - -+ /* PLUMED */ -+ int plumed_test_exchange_pattern=0; -+ /* END PLUMED */ -+ - if (bMultiEx) - { - /* multiple random switch exchange */ -@@ -1057,6 +1076,31 @@ test_for_replica_exchange(FILE *fplog, - /* standard nearest neighbor replica exchange */ - - m = (step / re->nst) % 2; -+ /* PLUMED */ -+ if(plumedswitch){ -+ int partner=re->repl; -+ plumed_cmd(plumedmain,"getExchangesFlag",&plumed_test_exchange_pattern); -+ if(plumed_test_exchange_pattern>0){ -+ int *list; -+ snew(list,re->nrepl); -+ plumed_cmd(plumedmain,"setNumberOfReplicas",&(re->nrepl)); -+ plumed_cmd(plumedmain,"getExchangesList",list); -+ for(i=0; inrepl; i++) re->ind[i]=list[i]; -+ sfree(list); -+ } -+ -+ for(i=1; inrepl; i++) { -+ if (i % 2 != m) continue; -+ a = re->ind[i-1]; -+ b = re->ind[i]; -+ if(re->repl==a) partner=b; -+ if(re->repl==b) partner=a; -+ } -+ plumed_cmd(plumedmain,"GREX setPartner",&partner); -+ plumed_cmd(plumedmain,"GREX calculate",NULL); -+ plumed_cmd(plumedmain,"GREX shareAllDeltaBias",NULL); -+ } -+ /* END PLUMED */ - for (i = 1; i < re->nrepl; i++) - { - a = re->ind[i-1]; -@@ -1066,6 +1110,18 @@ test_for_replica_exchange(FILE *fplog, - if (i % 2 == m) - { - delta = calc_delta(fplog, bPrint, re, a, b, a, b); -+ /* PLUMED */ -+ if(plumedswitch){ -+ real adb,bdb,dplumed; -+ char buf[300]; -+ sprintf(buf,"GREX getDeltaBias %d",a); plumed_cmd(plumedmain,buf,&adb); -+ sprintf(buf,"GREX getDeltaBias %d",b); plumed_cmd(plumedmain,buf,&bdb); -+ dplumed=adb*re->beta[a]+bdb*re->beta[b]; -+ delta+=dplumed; -+ if (bPrint) -+ fprintf(fplog,"dplumed = %10.3e dE_Term = %10.3e (kT)\n",dplumed,delta); -+ } -+ /* END PLUMED */ - if (delta <= 0) - { - /* accepted */ -@@ -1092,11 +1148,22 @@ test_for_replica_exchange(FILE *fplog, - - if (bEx[i]) - { -+ /* PLUMED */ -+ if(!plumed_test_exchange_pattern) { -+ /* standard neighbour swapping */ - /* swap these two */ - tmp = pind[i-1]; - pind[i-1] = pind[i]; - pind[i] = tmp; - re->nexchange[i]++; /* statistics for back compatibility */ -+ } else { -+ /* alternative swapping patterns */ -+ tmp = pind[a]; -+ pind[a] = pind[b]; -+ pind[b] = tmp; -+ re->nexchange[i]++; /* statistics for back compatibility */ -+ } -+ /* END PLUMED */ - } - } - else -@@ -1112,6 +1179,15 @@ test_for_replica_exchange(FILE *fplog, - re->nattempt[m]++; - } - -+ /* PLUMED */ -+ if(plumed_test_exchange_pattern>0) { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->ind[i] = i; -+ } -+ } -+ /* END PLUMED */ -+ - /* record which moves were made and accepted */ - for (i = 0; i < re->nrepl; i++) - { -@@ -1316,6 +1392,10 @@ gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex * - /* The order in which multiple exchanges will occur. */ - gmx_bool bThisReplicaExchanged = FALSE; - -+ /* PLUMED */ -+ if(plumedswitch)plumed_cmd(plumedmain,"GREX prepare",NULL); -+ /* END PLUMED */ -+ - if (MASTER(cr)) - { - replica_id = re->repl; -diff --git a/src/programs/mdrun/repl_ex.c.preplumed b/src/programs/mdrun/repl_ex.c.preplumed -new file mode 100644 -index 0000000..46a9bc0 ---- /dev/null -+++ b/src/programs/mdrun/repl_ex.c.preplumed -@@ -0,0 +1,1439 @@ -+/* -+ * This file is part of the GROMACS molecular simulation package. -+ * -+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. -+ * Copyright (c) 2001-2004, The GROMACS development team. 
-+ * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by -+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, -+ * and including many others, as listed in the AUTHORS file in the -+ * top-level source directory and at http://www.gromacs.org. -+ * -+ * GROMACS is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public License -+ * as published by the Free Software Foundation; either version 2.1 -+ * of the License, or (at your option) any later version. -+ * -+ * GROMACS is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with GROMACS; if not, see -+ * http://www.gnu.org/licenses, or write to the Free Software Foundation, -+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ * -+ * If you want to redistribute modifications to GROMACS, please -+ * consider that scientific software is very special. Version -+ * control is crucial - bugs must be traceable. We will be happy to -+ * consider code for inclusion in the official distribution, but -+ * derived work must not be called official GROMACS. Details are found -+ * in the README & COPYING files - if they are missing, get the -+ * official version at http://www.gromacs.org. -+ * -+ * To help us fund GROMACS development, we humbly ask that you cite -+ * the research papers on the package. Check out http://www.gromacs.org. -+ */ -+#ifdef HAVE_CONFIG_H -+#include -+#endif -+ -+#include -+#include "repl_ex.h" -+#include "network.h" -+#include "gromacs/random/random.h" -+#include "gromacs/utility/smalloc.h" -+#include "physics.h" -+#include "copyrite.h" -+#include "macros.h" -+#include "vec.h" -+#include "names.h" -+#include "domdec.h" -+#include "gromacs/random/random.h" -+ -+#define PROBABILITYCUTOFF 100 -+/* we don't bother evaluating if events are more rare than exp(-100) = 3.7x10^-44 */ -+ -+enum { -+ ereTEMP, ereLAMBDA, ereENDSINGLE, ereTL, ereNR -+}; -+const char *erename[ereNR] = { "temperature", "lambda", "end_single_marker", "temperature and lambda"}; -+/* end_single_marker merely notes the end of single variable replica exchange. All types higher than -+ it are multiple replica exchange methods */ -+/* Eventually, should add 'pressure', 'temperature and pressure', 'lambda_and_pressure', 'temperature_lambda_pressure'?; -+ Let's wait until we feel better about the pressure control methods giving exact ensembles. 
Right now, we assume constant pressure */ -+ -+typedef struct gmx_repl_ex -+{ -+ int repl; -+ int nrepl; -+ real temp; -+ int type; -+ real **q; -+ gmx_bool bNPT; -+ real *pres; -+ int *ind; -+ int *allswaps; -+ int nst; -+ int nex; -+ int seed; -+ int nattempt[2]; -+ real *prob_sum; -+ int **nmoves; -+ int *nexchange; -+ gmx_rng_t rng; -+ -+ /* these are helper arrays for replica exchange; allocated here so they -+ don't have to be allocated each time */ -+ int *destinations; -+ int **cyclic; -+ int **order; -+ int *tmpswap; -+ gmx_bool *incycle; -+ gmx_bool *bEx; -+ -+ /* helper arrays to hold the quantities that are exchanged */ -+ real *prob; -+ real *Epot; -+ real *beta; -+ real *Vol; -+ real **de; -+ -+} t_gmx_repl_ex; -+ -+static gmx_bool repl_quantity(const gmx_multisim_t *ms, -+ struct gmx_repl_ex *re, int ere, real q) -+{ -+ real *qall; -+ gmx_bool bDiff; -+ int i, s; -+ -+ snew(qall, ms->nsim); -+ qall[re->repl] = q; -+ gmx_sum_sim(ms->nsim, qall, ms); -+ -+ bDiff = FALSE; -+ for (s = 1; s < ms->nsim; s++) -+ { -+ if (qall[s] != qall[0]) -+ { -+ bDiff = TRUE; -+ } -+ } -+ -+ if (bDiff) -+ { -+ /* Set the replica exchange type and quantities */ -+ re->type = ere; -+ -+ snew(re->q[ere], re->nrepl); -+ for (s = 0; s < ms->nsim; s++) -+ { -+ re->q[ere][s] = qall[s]; -+ } -+ } -+ sfree(qall); -+ return bDiff; -+} -+ -+gmx_repl_ex_t init_replica_exchange(FILE *fplog, -+ const gmx_multisim_t *ms, -+ const t_state *state, -+ const t_inputrec *ir, -+ int nst, int nex, int init_seed) -+{ -+ real temp, pres; -+ int i, j, k; -+ struct gmx_repl_ex *re; -+ gmx_bool bTemp; -+ gmx_bool bLambda = FALSE; -+ -+ fprintf(fplog, "\nInitializing Replica Exchange\n"); -+ -+ if (ms == NULL || ms->nsim == 1) -+ { -+ gmx_fatal(FARGS, "Nothing to exchange with only one replica, maybe you forgot to set the -multi option of mdrun?"); -+ } -+ if (!EI_DYNAMICS(ir->eI)) -+ { -+ gmx_fatal(FARGS, "Replica exchange is only supported by dynamical simulations"); -+ /* Note that PAR(cr) is defined by cr->nnodes > 1, which is -+ * distinct from MULTISIM(cr). A multi-simulation only runs -+ * with real MPI parallelism, but this does not imply PAR(cr) -+ * is true! -+ * -+ * Since we are using a dynamical integrator, the only -+ * decomposition is DD, so PAR(cr) and DOMAINDECOMP(cr) are -+ * synonymous. The only way for cr->nnodes > 1 to be true is -+ * if we are using DD. 
*/ -+ } -+ -+ snew(re, 1); -+ -+ re->repl = ms->sim; -+ re->nrepl = ms->nsim; -+ snew(re->q, ereENDSINGLE); -+ -+ fprintf(fplog, "Repl There are %d replicas:\n", re->nrepl); -+ -+ check_multi_int(fplog, ms, state->natoms, "the number of atoms", FALSE); -+ check_multi_int(fplog, ms, ir->eI, "the integrator", FALSE); -+ check_multi_int64(fplog, ms, ir->init_step+ir->nsteps, "init_step+nsteps", FALSE); -+ check_multi_int64(fplog, ms, (ir->init_step+nst-1)/nst, -+ "first exchange step: init_step/-replex", FALSE); -+ check_multi_int(fplog, ms, ir->etc, "the temperature coupling", FALSE); -+ check_multi_int(fplog, ms, ir->opts.ngtc, -+ "the number of temperature coupling groups", FALSE); -+ check_multi_int(fplog, ms, ir->epc, "the pressure coupling", FALSE); -+ check_multi_int(fplog, ms, ir->efep, "free energy", FALSE); -+ check_multi_int(fplog, ms, ir->fepvals->n_lambda, "number of lambda states", FALSE); -+ -+ re->temp = ir->opts.ref_t[0]; -+ for (i = 1; (i < ir->opts.ngtc); i++) -+ { -+ if (ir->opts.ref_t[i] != re->temp) -+ { -+ fprintf(fplog, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); -+ fprintf(stderr, "\nWARNING: The temperatures of the different temperature coupling groups are not identical\n\n"); -+ } -+ } -+ -+ re->type = -1; -+ bTemp = repl_quantity(ms, re, ereTEMP, re->temp); -+ if (ir->efep != efepNO) -+ { -+ bLambda = repl_quantity(ms, re, ereLAMBDA, (real)ir->fepvals->init_fep_state); -+ } -+ if (re->type == -1) /* nothing was assigned */ -+ { -+ gmx_fatal(FARGS, "The properties of the %d systems are all the same, there is nothing to exchange", re->nrepl); -+ } -+ if (bLambda && bTemp) -+ { -+ re->type = ereTL; -+ } -+ -+ if (bTemp) -+ { -+ please_cite(fplog, "Sugita1999a"); -+ if (ir->epc != epcNO) -+ { -+ re->bNPT = TRUE; -+ fprintf(fplog, "Repl Using Constant Pressure REMD.\n"); -+ please_cite(fplog, "Okabe2001a"); -+ } -+ if (ir->etc == etcBERENDSEN) -+ { -+ gmx_fatal(FARGS, "REMD with the %s thermostat does not produce correct potential energy distributions, consider using the %s thermostat instead", -+ ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE)); -+ } -+ } -+ if (bLambda) -+ { -+ if (ir->fepvals->delta_lambda != 0) /* check this? */ -+ { -+ gmx_fatal(FARGS, "delta_lambda is not zero"); -+ } -+ } -+ if (re->bNPT) -+ { -+ snew(re->pres, re->nrepl); -+ if (ir->epct == epctSURFACETENSION) -+ { -+ pres = ir->ref_p[ZZ][ZZ]; -+ } -+ else -+ { -+ pres = 0; -+ j = 0; -+ for (i = 0; i < DIM; i++) -+ { -+ if (ir->compress[i][i] != 0) -+ { -+ pres += ir->ref_p[i][i]; -+ j++; -+ } -+ } -+ pres /= j; -+ } -+ re->pres[re->repl] = pres; -+ gmx_sum_sim(re->nrepl, re->pres, ms); -+ } -+ -+ /* Make an index for increasing replica order */ -+ /* only makes sense if one or the other is varying, not both! -+ if both are varying, we trust the order the person gave. */ -+ snew(re->ind, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->ind[i] = i; -+ } -+ -+ if (re->type < ereENDSINGLE) -+ { -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = i+1; j < re->nrepl; j++) -+ { -+ if (re->q[re->type][re->ind[j]] < re->q[re->type][re->ind[i]]) -+ { -+ /* Unordered replicas are supposed to work, but there -+ * is still an issues somewhere. -+ * Note that at this point still re->ind[i]=i. 
-+ */ -+ gmx_fatal(FARGS, "Replicas with indices %d < %d have %ss %g > %g, please order your replicas on increasing %s", -+ i, j, -+ erename[re->type], -+ re->q[re->type][i], re->q[re->type][j], -+ erename[re->type]); -+ -+ k = re->ind[i]; -+ re->ind[i] = re->ind[j]; -+ re->ind[j] = k; -+ } -+ else if (re->q[re->type][re->ind[j]] == re->q[re->type][re->ind[i]]) -+ { -+ gmx_fatal(FARGS, "Two replicas have identical %ss", erename[re->type]); -+ } -+ } -+ } -+ } -+ -+ /* keep track of all the swaps, starting with the initial placement. */ -+ snew(re->allswaps, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->allswaps[i] = re->ind[i]; -+ } -+ -+ switch (re->type) -+ { -+ case ereTEMP: -+ fprintf(fplog, "\nReplica exchange in temperature\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.1f", re->q[re->type][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ case ereLAMBDA: -+ fprintf(fplog, "\nReplica exchange in lambda\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %3d", (int)re->q[re->type][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ case ereTL: -+ fprintf(fplog, "\nReplica exchange in temperature and lambda state\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.1f", re->q[ereTEMP][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5d", (int)re->q[ereLAMBDA][re->ind[i]]); -+ } -+ fprintf(fplog, "\n"); -+ break; -+ default: -+ gmx_incons("Unknown replica exchange quantity"); -+ } -+ if (re->bNPT) -+ { -+ fprintf(fplog, "\nRepl p"); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ fprintf(fplog, " %5.2f", re->pres[re->ind[i]]); -+ } -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ if ((i > 0) && (re->pres[re->ind[i]] < re->pres[re->ind[i-1]])) -+ { -+ fprintf(fplog, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); -+ fprintf(stderr, "\nWARNING: The reference pressures decrease with increasing temperatures\n\n"); -+ } -+ } -+ } -+ re->nst = nst; -+ if (init_seed == -1) -+ { -+ if (MASTERSIM(ms)) -+ { -+ re->seed = (int)gmx_rng_make_seed(); -+ } -+ else -+ { -+ re->seed = 0; -+ } -+ gmx_sumi_sim(1, &(re->seed), ms); -+ } -+ else -+ { -+ re->seed = init_seed; -+ } -+ fprintf(fplog, "\nReplica exchange interval: %d\n", re->nst); -+ fprintf(fplog, "\nReplica random seed: %d\n", re->seed); -+ re->rng = gmx_rng_init(re->seed); -+ -+ re->nattempt[0] = 0; -+ re->nattempt[1] = 0; -+ -+ snew(re->prob_sum, re->nrepl); -+ snew(re->nexchange, re->nrepl); -+ snew(re->nmoves, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->nmoves[i], re->nrepl); -+ } -+ fprintf(fplog, "Replica exchange information below: x=exchange, pr=probability\n"); -+ -+ /* generate space for the helper functions so we don't have to snew each time */ -+ -+ snew(re->destinations, re->nrepl); -+ snew(re->incycle, re->nrepl); -+ snew(re->tmpswap, re->nrepl); -+ snew(re->cyclic, re->nrepl); -+ snew(re->order, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->cyclic[i], re->nrepl); -+ snew(re->order[i], re->nrepl); -+ } -+ /* allocate space for the functions storing the data for the replicas */ -+ /* not all of these arrays needed in all cases, but they don't take -+ up much space, since the max size is nrepl**2 */ -+ snew(re->prob, re->nrepl); -+ snew(re->bEx, re->nrepl); -+ snew(re->beta, re->nrepl); -+ snew(re->Vol, re->nrepl); -+ snew(re->Epot, re->nrepl); -+ snew(re->de, re->nrepl); -+ for (i = 0; i < re->nrepl; i++) -+ { -+ snew(re->de[i], re->nrepl); 
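/* de[i][j] will hold the energy of the j-th replica's configuration
 * evaluated in the i-th Hamiltonian minus its energy in its own (j-th)
 * Hamiltonian; only the lambda and temperature+lambda exchange types
 * fill it, but the full nrepl x nrepl matrix costs little to keep. */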
-+ } -+ re->nex = nex; -+ return re; -+} -+ -+static void exchange_reals(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, real *v, int n) -+{ -+ real *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(real),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(real), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+ -+static void exchange_ints(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, int *v, int n) -+{ -+ int *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(int),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(int), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_doubles(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, double *v, int n) -+{ -+ double *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v, n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, -+ buf,n*sizeof(double),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf, n*sizeof(double), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ v[i] = buf[i]; -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_rvecs(const gmx_multisim_t gmx_unused *ms, int gmx_unused b, rvec *v, int n) -+{ -+ rvec *buf; -+ int i; -+ -+ if (v) -+ { -+ snew(buf, n); -+#ifdef GMX_MPI -+ /* -+ MPI_Sendrecv(v[0], n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, -+ buf[0],n*sizeof(rvec),MPI_BYTE,MSRANK(ms,b),0, -+ ms->mpi_comm_masters,MPI_STATUS_IGNORE); -+ */ -+ { -+ MPI_Request mpi_req; -+ -+ MPI_Isend(v[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, &mpi_req); -+ MPI_Recv(buf[0], n*sizeof(rvec), MPI_BYTE, MSRANK(ms, b), 0, -+ ms->mpi_comm_masters, MPI_STATUS_IGNORE); -+ MPI_Wait(&mpi_req, MPI_STATUS_IGNORE); -+ } -+#endif -+ for (i = 0; i < n; i++) -+ { -+ copy_rvec(buf[i], v[i]); -+ } -+ sfree(buf); -+ } -+} -+ -+static void exchange_state(const gmx_multisim_t *ms, int b, t_state *state) -+{ -+ /* When t_state changes, this code should be updated. 
*/ -+ int ngtc, nnhpres; -+ ngtc = state->ngtc * state->nhchainlength; -+ nnhpres = state->nnhpres* state->nhchainlength; -+ exchange_rvecs(ms, b, state->box, DIM); -+ exchange_rvecs(ms, b, state->box_rel, DIM); -+ exchange_rvecs(ms, b, state->boxv, DIM); -+ exchange_reals(ms, b, &(state->veta), 1); -+ exchange_reals(ms, b, &(state->vol0), 1); -+ exchange_rvecs(ms, b, state->svir_prev, DIM); -+ exchange_rvecs(ms, b, state->fvir_prev, DIM); -+ exchange_rvecs(ms, b, state->pres_prev, DIM); -+ exchange_doubles(ms, b, state->nosehoover_xi, ngtc); -+ exchange_doubles(ms, b, state->nosehoover_vxi, ngtc); -+ exchange_doubles(ms, b, state->nhpres_xi, nnhpres); -+ exchange_doubles(ms, b, state->nhpres_vxi, nnhpres); -+ exchange_doubles(ms, b, state->therm_integral, state->ngtc); -+ exchange_rvecs(ms, b, state->x, state->natoms); -+ exchange_rvecs(ms, b, state->v, state->natoms); -+ exchange_rvecs(ms, b, state->sd_X, state->natoms); -+} -+ -+static void copy_rvecs(rvec *s, rvec *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ copy_rvec(s[i], d[i]); -+ } -+ } -+} -+ -+static void copy_doubles(const double *s, double *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+static void copy_reals(const real *s, real *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+static void copy_ints(const int *s, int *d, int n) -+{ -+ int i; -+ -+ if (d != NULL) -+ { -+ for (i = 0; i < n; i++) -+ { -+ d[i] = s[i]; -+ } -+ } -+} -+ -+#define scopy_rvecs(v, n) copy_rvecs(state->v, state_local->v, n); -+#define scopy_doubles(v, n) copy_doubles(state->v, state_local->v, n); -+#define scopy_reals(v, n) copy_reals(state->v, state_local->v, n); -+#define scopy_ints(v, n) copy_ints(state->v, state_local->v, n); -+ -+static void copy_state_nonatomdata(t_state *state, t_state *state_local) -+{ -+ /* When t_state changes, this code should be updated. 
*/ -+ int ngtc, nnhpres; -+ ngtc = state->ngtc * state->nhchainlength; -+ nnhpres = state->nnhpres* state->nhchainlength; -+ scopy_rvecs(box, DIM); -+ scopy_rvecs(box_rel, DIM); -+ scopy_rvecs(boxv, DIM); -+ state_local->veta = state->veta; -+ state_local->vol0 = state->vol0; -+ scopy_rvecs(svir_prev, DIM); -+ scopy_rvecs(fvir_prev, DIM); -+ scopy_rvecs(pres_prev, DIM); -+ scopy_doubles(nosehoover_xi, ngtc); -+ scopy_doubles(nosehoover_vxi, ngtc); -+ scopy_doubles(nhpres_xi, nnhpres); -+ scopy_doubles(nhpres_vxi, nnhpres); -+ scopy_doubles(therm_integral, state->ngtc); -+ scopy_rvecs(x, state->natoms); -+ scopy_rvecs(v, state->natoms); -+ scopy_rvecs(sd_X, state->natoms); -+ copy_ints(&(state->fep_state), &(state_local->fep_state), 1); -+ scopy_reals(lambda, efptNR); -+} -+ -+static void scale_velocities(t_state *state, real fac) -+{ -+ int i; -+ -+ if (state->v) -+ { -+ for (i = 0; i < state->natoms; i++) -+ { -+ svmul(fac, state->v[i], state->v[i]); -+ } -+ } -+} -+ -+static void print_transition_matrix(FILE *fplog, int n, int **nmoves, int *nattempt) -+{ -+ int i, j, ntot; -+ float Tprint; -+ -+ ntot = nattempt[0] + nattempt[1]; -+ fprintf(fplog, "\n"); -+ fprintf(fplog, "Repl"); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, " "); /* put the title closer to the center */ -+ } -+ fprintf(fplog, "Empirical Transition Matrix\n"); -+ -+ fprintf(fplog, "Repl"); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%8d", (i+1)); -+ } -+ fprintf(fplog, "\n"); -+ -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "Repl"); -+ for (j = 0; j < n; j++) -+ { -+ Tprint = 0.0; -+ if (nmoves[i][j] > 0) -+ { -+ Tprint = nmoves[i][j]/(2.0*ntot); -+ } -+ fprintf(fplog, "%8.4f", Tprint); -+ } -+ fprintf(fplog, "%3d\n", i); -+ } -+} -+ -+static void print_ind(FILE *fplog, const char *leg, int n, int *ind, gmx_bool *bEx) -+{ -+ int i; -+ -+ fprintf(fplog, "Repl %2s %2d", leg, ind[0]); -+ for (i = 1; i < n; i++) -+ { -+ fprintf(fplog, " %c %2d", (bEx != 0 && bEx[i]) ? 'x' : ' ', ind[i]); -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static void print_allswitchind(FILE *fplog, int n, int *pind, int *allswaps, int *tmpswap) -+{ -+ int i; -+ -+ for (i = 0; i < n; i++) -+ { -+ tmpswap[i] = allswaps[i]; -+ } -+ for (i = 0; i < n; i++) -+ { -+ allswaps[i] = tmpswap[pind[i]]; -+ } -+ -+ fprintf(fplog, "\nAccepted Exchanges: "); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%d ", pind[i]); -+ } -+ fprintf(fplog, "\n"); -+ -+ /* the "Order After Exchange" is the state label corresponding to the configuration that -+ started in state listed in order, i.e. -+ -+ 3 0 1 2 -+ -+ means that the: -+ configuration starting in simulation 3 is now in simulation 0, -+ configuration starting in simulation 0 is now in simulation 1, -+ configuration starting in simulation 1 is now in simulation 2, -+ configuration starting in simulation 2 is now in simulation 3 -+ */ -+ fprintf(fplog, "Order After Exchange: "); -+ for (i = 0; i < n; i++) -+ { -+ fprintf(fplog, "%d ", allswaps[i]); -+ } -+ fprintf(fplog, "\n\n"); -+} -+ -+static void print_prob(FILE *fplog, const char *leg, int n, real *prob) -+{ -+ int i; -+ char buf[8]; -+ -+ fprintf(fplog, "Repl %2s ", leg); -+ for (i = 1; i < n; i++) -+ { -+ if (prob[i] >= 0) -+ { -+ sprintf(buf, "%4.2f", prob[i]); -+ fprintf(fplog, " %3s", buf[0] == '1' ? 
"1.0" : buf+1); -+ } -+ else -+ { -+ fprintf(fplog, " "); -+ } -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static void print_count(FILE *fplog, const char *leg, int n, int *count) -+{ -+ int i; -+ -+ fprintf(fplog, "Repl %2s ", leg); -+ for (i = 1; i < n; i++) -+ { -+ fprintf(fplog, " %4d", count[i]); -+ } -+ fprintf(fplog, "\n"); -+} -+ -+static real calc_delta(FILE *fplog, gmx_bool bPrint, struct gmx_repl_ex *re, int a, int b, int ap, int bp) -+{ -+ -+ real ediff, dpV, delta = 0; -+ real *Epot = re->Epot; -+ real *Vol = re->Vol; -+ real **de = re->de; -+ real *beta = re->beta; -+ -+ /* Two cases; we are permuted and not. In all cases, setting ap = a and bp = b will reduce -+ to the non permuted case */ -+ -+ switch (re->type) -+ { -+ case ereTEMP: -+ /* -+ * Okabe et. al. Chem. Phys. Lett. 335 (2001) 435-439 -+ */ -+ ediff = Epot[b] - Epot[a]; -+ delta = -(beta[bp] - beta[ap])*ediff; -+ break; -+ case ereLAMBDA: -+ /* two cases: when we are permuted, and not. */ -+ /* non-permuted: -+ ediff = E_new - E_old -+ = [H_b(x_a) + H_a(x_b)] - [H_b(x_b) + H_a(x_a)] -+ = [H_b(x_a) - H_a(x_a)] + [H_a(x_b) - H_b(x_b)] -+ = de[b][a] + de[a][b] */ -+ -+ /* permuted: -+ ediff = E_new - E_old -+ = [H_bp(x_a) + H_ap(x_b)] - [H_bp(x_b) + H_ap(x_a)] -+ = [H_bp(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_bp(x_b)] -+ = [H_bp(x_a) - H_a(x_a) + H_a(x_a) - H_ap(x_a)] + [H_ap(x_b) - H_b(x_b) + H_b(x_b) - H_bp(x_b)] -+ = [H_bp(x_a) - H_a(x_a)] - [H_ap(x_a) - H_a(x_a)] + [H_ap(x_b) - H_b(x_b)] - H_bp(x_b) - H_b(x_b)] -+ = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]) */ -+ /* but, in the current code implementation, we flip configurations, not indices . . . -+ So let's examine that. -+ = [H_b(x_ap) - H_a(x_a)] - [H_a(x_ap) - H_a(x_a)] + [H_a(x_bp) - H_b(x_b)] - H_b(x_bp) - H_b(x_b)] -+ = [H_b(x_ap) - H_a(x_ap)] + [H_a(x_bp) - H_b(x_pb)] -+ = (de[b][ap] - de[a][ap]) + (de[a][bp] - de[b][bp] -+ So, if we exchange b<=> bp and a<=> ap, we return to the same result. -+ So the simple solution is to flip the -+ position of perturbed and original indices in the tests. -+ */ -+ -+ ediff = (de[bp][a] - de[ap][a]) + (de[ap][b] - de[bp][b]); -+ delta = ediff*beta[a]; /* assume all same temperature in this case */ -+ break; -+ case ereTL: -+ /* not permuted: */ -+ /* delta = reduced E_new - reduced E_old -+ = [beta_b H_b(x_a) + beta_a H_a(x_b)] - [beta_b H_b(x_b) + beta_a H_a(x_a)] -+ = [beta_b H_b(x_a) - beta_a H_a(x_a)] + [beta_a H_a(x_b) - beta_b H_b(x_b)] -+ = [beta_b dH_b(x_a) + beta_b H_a(x_a) - beta_a H_a(x_a)] + -+ [beta_a dH_a(x_b) + beta_a H_b(x_b) - beta_b H_b(x_b)] -+ = [beta_b dH_b(x_a) + [beta_a dH_a(x_b) + -+ beta_b (H_a(x_a) - H_b(x_b)]) - beta_a (H_a(x_a) - H_b(x_b)) -+ = beta_b dH_b(x_a) + beta_a dH_a(x_b) - (beta_b - beta_a)(H_b(x_b) - H_a(x_a) */ -+ /* delta = beta[b]*de[b][a] + beta[a]*de[a][b] - (beta[b] - beta[a])*(Epot[b] - Epot[a]; */ -+ /* permuted (big breath!) 
*/ -+ /* delta = reduced E_new - reduced E_old -+ = [beta_bp H_bp(x_a) + beta_ap H_ap(x_b)] - [beta_bp H_bp(x_b) + beta_ap H_ap(x_a)] -+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] -+ = [beta_bp H_bp(x_a) - beta_ap H_ap(x_a)] + [beta_ap H_ap(x_b) - beta_bp H_bp(x_b)] -+ - beta_pb H_a(x_a) + beta_ap H_a(x_a) + beta_pb H_a(x_a) - beta_ap H_a(x_a) -+ - beta_ap H_b(x_b) + beta_bp H_b(x_b) + beta_ap H_b(x_b) - beta_bp H_b(x_b) -+ = [(beta_bp H_bp(x_a) - beta_bp H_a(x_a)) - (beta_ap H_ap(x_a) - beta_ap H_a(x_a))] + -+ [(beta_ap H_ap(x_b) - beta_ap H_b(x_b)) - (beta_bp H_bp(x_b) - beta_bp H_b(x_b))] -+ + beta_pb H_a(x_a) - beta_ap H_a(x_a) + beta_ap H_b(x_b) - beta_bp H_b(x_b) -+ = [beta_bp (H_bp(x_a) - H_a(x_a)) - beta_ap (H_ap(x_a) - H_a(x_a))] + -+ [beta_ap (H_ap(x_b) - H_b(x_b)) - beta_bp (H_bp(x_b) - H_b(x_b))] -+ + beta_pb (H_a(x_a) - H_b(x_b)) - beta_ap (H_a(x_a) - H_b(x_b)) -+ = ([beta_bp de[bp][a] - beta_ap de[ap][a]) + beta_ap de[ap][b] - beta_bp de[bp][b]) -+ + (beta_pb-beta_ap)(H_a(x_a) - H_b(x_b)) */ -+ delta = beta[bp]*(de[bp][a] - de[bp][b]) + beta[ap]*(de[ap][b] - de[ap][a]) - (beta[bp]-beta[ap])*(Epot[b]-Epot[a]); -+ break; -+ default: -+ gmx_incons("Unknown replica exchange quantity"); -+ } -+ if (bPrint) -+ { -+ fprintf(fplog, "Repl %d <-> %d dE_term = %10.3e (kT)\n", a, b, delta); -+ } -+ if (re->bNPT) -+ { -+ /* revist the calculation for 5.0. Might be some improvements. */ -+ dpV = (beta[ap]*re->pres[ap]-beta[bp]*re->pres[bp])*(Vol[b]-Vol[a])/PRESFAC; -+ if (bPrint) -+ { -+ fprintf(fplog, " dpV = %10.3e d = %10.3e\n", dpV, delta + dpV); -+ } -+ delta += dpV; -+ } -+ return delta; -+} -+ -+static void -+test_for_replica_exchange(FILE *fplog, -+ const gmx_multisim_t *ms, -+ struct gmx_repl_ex *re, -+ gmx_enerdata_t *enerd, -+ real vol, -+ gmx_int64_t step, -+ real time) -+{ -+ int m, i, j, a, b, ap, bp, i0, i1, tmp; -+ real ediff = 0, delta = 0, dpV = 0; -+ gmx_bool bPrint, bMultiEx; -+ gmx_bool *bEx = re->bEx; -+ real *prob = re->prob; -+ int *pind = re->destinations; /* permuted index */ -+ gmx_bool bEpot = FALSE; -+ gmx_bool bDLambda = FALSE; -+ gmx_bool bVol = FALSE; -+ gmx_rng_t rng; -+ -+ bMultiEx = (re->nex > 1); /* multiple exchanges at each state */ -+ fprintf(fplog, "Replica exchange at step " "%"GMX_PRId64 " time %.5f\n", step, time); -+ -+ if (re->bNPT) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->Vol[i] = 0; -+ } -+ bVol = TRUE; -+ re->Vol[re->repl] = vol; -+ } -+ if ((re->type == ereTEMP || re->type == ereTL)) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->Epot[i] = 0; -+ } -+ bEpot = TRUE; -+ re->Epot[re->repl] = enerd->term[F_EPOT]; -+ /* temperatures of different states*/ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->beta[i] = 1.0/(re->q[ereTEMP][i]*BOLTZ); -+ } -+ } -+ else -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->beta[i] = 1.0/(re->temp*BOLTZ); /* we have a single temperature */ -+ } -+ } -+ if (re->type == ereLAMBDA || re->type == ereTL) -+ { -+ bDLambda = TRUE; -+ /* lambda differences. 
*/ -+ /* de[i][j] is the energy of the jth simulation in the ith Hamiltonian -+ minus the energy of the jth simulation in the jth Hamiltonian */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = 0; j < re->nrepl; j++) -+ { -+ re->de[i][j] = 0; -+ } -+ } -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->de[i][re->repl] = (enerd->enerpart_lambda[(int)re->q[ereLAMBDA][i]+1]-enerd->enerpart_lambda[0]); -+ } -+ } -+ -+ /* now actually do the communication */ -+ if (bVol) -+ { -+ gmx_sum_sim(re->nrepl, re->Vol, ms); -+ } -+ if (bEpot) -+ { -+ gmx_sum_sim(re->nrepl, re->Epot, ms); -+ } -+ if (bDLambda) -+ { -+ for (i = 0; i < re->nrepl; i++) -+ { -+ gmx_sum_sim(re->nrepl, re->de[i], ms); -+ } -+ } -+ -+ /* make a duplicate set of indices for shuffling */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ pind[i] = re->ind[i]; -+ } -+ -+ if (bMultiEx) -+ { -+ /* multiple random switch exchange */ -+ int nself = 0; -+ for (i = 0; i < re->nex + nself; i++) -+ { -+ double rnd[2]; -+ -+ gmx_rng_cycle_2uniform(step, i*2, re->seed, RND_SEED_REPLEX, rnd); -+ /* randomly select a pair */ -+ /* in theory, could reduce this by identifying only which switches had a nonneglibible -+ probability of occurring (log p > -100) and only operate on those switches */ -+ /* find out which state it is from, and what label that state currently has. Likely -+ more work that useful. */ -+ i0 = (int)(re->nrepl*rnd[0]); -+ i1 = (int)(re->nrepl*rnd[1]); -+ if (i0 == i1) -+ { -+ nself++; -+ continue; /* self-exchange, back up and do it again */ -+ } -+ -+ a = re->ind[i0]; /* what are the indices of these states? */ -+ b = re->ind[i1]; -+ ap = pind[i0]; -+ bp = pind[i1]; -+ -+ bPrint = FALSE; /* too noisy */ -+ /* calculate the energy difference */ -+ /* if the code changes to flip the STATES, rather than the configurations, -+ use the commented version of the code */ -+ /* delta = calc_delta(fplog,bPrint,re,a,b,ap,bp); */ -+ delta = calc_delta(fplog, bPrint, re, ap, bp, a, b); -+ -+ /* we actually only use the first space in the prob and bEx array, -+ since there are actually many switches between pairs. 
*/ -+ -+ if (delta <= 0) -+ { -+ /* accepted */ -+ prob[0] = 1; -+ bEx[0] = TRUE; -+ } -+ else -+ { -+ if (delta > PROBABILITYCUTOFF) -+ { -+ prob[0] = 0; -+ } -+ else -+ { -+ prob[0] = exp(-delta); -+ } -+ /* roll a number to determine if accepted */ -+ gmx_rng_cycle_2uniform(step, i*2+1, re->seed, RND_SEED_REPLEX, rnd); -+ bEx[0] = rnd[0] < prob[0]; -+ } -+ re->prob_sum[0] += prob[0]; -+ -+ if (bEx[0]) -+ { -+ /* swap the states */ -+ tmp = pind[i0]; -+ pind[i0] = pind[i1]; -+ pind[i1] = tmp; -+ } -+ } -+ re->nattempt[0]++; /* keep track of total permutation trials here */ -+ print_allswitchind(fplog, re->nrepl, pind, re->allswaps, re->tmpswap); -+ } -+ else -+ { -+ /* standard nearest neighbor replica exchange */ -+ -+ m = (step / re->nst) % 2; -+ for (i = 1; i < re->nrepl; i++) -+ { -+ a = re->ind[i-1]; -+ b = re->ind[i]; -+ -+ bPrint = (re->repl == a || re->repl == b); -+ if (i % 2 == m) -+ { -+ delta = calc_delta(fplog, bPrint, re, a, b, a, b); -+ if (delta <= 0) -+ { -+ /* accepted */ -+ prob[i] = 1; -+ bEx[i] = TRUE; -+ } -+ else -+ { -+ double rnd[2]; -+ -+ if (delta > PROBABILITYCUTOFF) -+ { -+ prob[i] = 0; -+ } -+ else -+ { -+ prob[i] = exp(-delta); -+ } -+ /* roll a number to determine if accepted */ -+ gmx_rng_cycle_2uniform(step, i, re->seed, RND_SEED_REPLEX, rnd); -+ bEx[i] = rnd[0] < prob[i]; -+ } -+ re->prob_sum[i] += prob[i]; -+ -+ if (bEx[i]) -+ { -+ /* swap these two */ -+ tmp = pind[i-1]; -+ pind[i-1] = pind[i]; -+ pind[i] = tmp; -+ re->nexchange[i]++; /* statistics for back compatibility */ -+ } -+ } -+ else -+ { -+ prob[i] = -1; -+ bEx[i] = FALSE; -+ } -+ } -+ /* print some statistics */ -+ print_ind(fplog, "ex", re->nrepl, re->ind, bEx); -+ print_prob(fplog, "pr", re->nrepl, prob); -+ fprintf(fplog, "\n"); -+ re->nattempt[m]++; -+ } -+ -+ /* record which moves were made and accepted */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ re->nmoves[re->ind[i]][pind[i]] += 1; -+ re->nmoves[pind[i]][re->ind[i]] += 1; -+ } -+ fflush(fplog); /* make sure we can see what the last exchange was */ -+} -+ -+static void write_debug_x(t_state *state) -+{ -+ int i; -+ -+ if (debug) -+ { -+ for (i = 0; i < state->natoms; i += 10) -+ { -+ fprintf(debug, "dx %5d %10.5f %10.5f %10.5f\n", i, state->x[i][XX], state->x[i][YY], state->x[i][ZZ]); -+ } -+ } -+} -+ -+static void -+cyclic_decomposition(const int *destinations, -+ int **cyclic, -+ gmx_bool *incycle, -+ const int nrepl, -+ int *nswap) -+{ -+ -+ int i, j, c, p; -+ int maxlen = 1; -+ for (i = 0; i < nrepl; i++) -+ { -+ incycle[i] = FALSE; -+ } -+ for (i = 0; i < nrepl; i++) /* one cycle for each replica */ -+ { -+ if (incycle[i]) -+ { -+ cyclic[i][0] = -1; -+ continue; -+ } -+ cyclic[i][0] = i; -+ incycle[i] = TRUE; -+ c = 1; -+ p = i; -+ for (j = 0; j < nrepl; j++) /* potentially all cycles are part, but we will break first */ -+ { -+ p = destinations[p]; /* start permuting */ -+ if (p == i) -+ { -+ cyclic[i][c] = -1; -+ if (c > maxlen) -+ { -+ maxlen = c; -+ } -+ break; /* we've reached the original element, the cycle is complete, and we marked the end. 
*/ -+ } -+ else -+ { -+ cyclic[i][c] = p; /* each permutation gives a new member of the cycle */ -+ incycle[p] = TRUE; -+ c++; -+ } -+ } -+ } -+ *nswap = maxlen - 1; -+ -+ if (debug) -+ { -+ for (i = 0; i < nrepl; i++) -+ { -+ fprintf(debug, "Cycle %d:", i); -+ for (j = 0; j < nrepl; j++) -+ { -+ if (cyclic[i][j] < 0) -+ { -+ break; -+ } -+ fprintf(debug, "%2d", cyclic[i][j]); -+ } -+ fprintf(debug, "\n"); -+ } -+ fflush(debug); -+ } -+} -+ -+static void -+compute_exchange_order(FILE *fplog, -+ int **cyclic, -+ int **order, -+ const int nrepl, -+ const int maxswap) -+{ -+ int i, j; -+ -+ for (j = 0; j < maxswap; j++) -+ { -+ for (i = 0; i < nrepl; i++) -+ { -+ if (cyclic[i][j+1] >= 0) -+ { -+ order[cyclic[i][j+1]][j] = cyclic[i][j]; -+ order[cyclic[i][j]][j] = cyclic[i][j+1]; -+ } -+ } -+ for (i = 0; i < nrepl; i++) -+ { -+ if (order[i][j] < 0) -+ { -+ order[i][j] = i; /* if it's not exchanging, it should stay this round*/ -+ } -+ } -+ } -+ -+ if (debug) -+ { -+ fprintf(fplog, "Replica Exchange Order\n"); -+ for (i = 0; i < nrepl; i++) -+ { -+ fprintf(fplog, "Replica %d:", i); -+ for (j = 0; j < maxswap; j++) -+ { -+ if (order[i][j] < 0) -+ { -+ break; -+ } -+ fprintf(debug, "%2d", order[i][j]); -+ } -+ fprintf(fplog, "\n"); -+ } -+ fflush(fplog); -+ } -+} -+ -+static void -+prepare_to_do_exchange(FILE *fplog, -+ struct gmx_repl_ex *re, -+ const int replica_id, -+ int *maxswap, -+ gmx_bool *bThisReplicaExchanged) -+{ -+ int i, j; -+ /* Hold the cyclic decomposition of the (multiple) replica -+ * exchange. */ -+ gmx_bool bAnyReplicaExchanged = FALSE; -+ *bThisReplicaExchanged = FALSE; -+ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ if (re->destinations[i] != re->ind[i]) -+ { -+ /* only mark as exchanged if the index has been shuffled */ -+ bAnyReplicaExchanged = TRUE; -+ break; -+ } -+ } -+ if (bAnyReplicaExchanged) -+ { -+ /* reinitialize the placeholder arrays */ -+ for (i = 0; i < re->nrepl; i++) -+ { -+ for (j = 0; j < re->nrepl; j++) -+ { -+ re->cyclic[i][j] = -1; -+ re->order[i][j] = -1; -+ } -+ } -+ -+ /* Identify the cyclic decomposition of the permutation (very -+ * fast if neighbor replica exchange). */ -+ cyclic_decomposition(re->destinations, re->cyclic, re->incycle, re->nrepl, maxswap); -+ -+ /* Now translate the decomposition into a replica exchange -+ * order at each step. */ -+ compute_exchange_order(fplog, re->cyclic, re->order, re->nrepl, *maxswap); -+ -+ /* Did this replica do any exchange at any point? */ -+ for (j = 0; j < *maxswap; j++) -+ { -+ if (replica_id != re->order[replica_id][j]) -+ { -+ *bThisReplicaExchanged = TRUE; -+ break; -+ } -+ } -+ } -+} -+ -+gmx_bool replica_exchange(FILE *fplog, const t_commrec *cr, struct gmx_repl_ex *re, -+ t_state *state, gmx_enerdata_t *enerd, -+ t_state *state_local, gmx_int64_t step, real time) -+{ -+ int i, j; -+ int replica_id = 0; -+ int exchange_partner; -+ int maxswap = 0; -+ /* Number of rounds of exchanges needed to deal with any multiple -+ * exchanges. */ -+ /* Where each replica ends up after the exchange attempt(s). */ -+ /* The order in which multiple exchanges will occur. */ -+ gmx_bool bThisReplicaExchanged = FALSE; -+ -+ if (MASTER(cr)) -+ { -+ replica_id = re->repl; -+ test_for_replica_exchange(fplog, cr->ms, re, enerd, det(state_local->box), step, time); -+ prepare_to_do_exchange(fplog, re, replica_id, &maxswap, &bThisReplicaExchanged); -+ } -+ /* Do intra-simulation broadcast so all processors belonging to -+ * each simulation know whether they need to participate in -+ * collecting the state. 
Otherwise, they might as well get on with -+ * the next thing to do. */ -+ if (DOMAINDECOMP(cr)) -+ { -+#ifdef GMX_MPI -+ MPI_Bcast(&bThisReplicaExchanged, sizeof(gmx_bool), MPI_BYTE, MASTERRANK(cr), -+ cr->mpi_comm_mygroup); -+#endif -+ } -+ -+ if (bThisReplicaExchanged) -+ { -+ /* Exchange the states */ -+ /* Collect the global state on the master node */ -+ if (DOMAINDECOMP(cr)) -+ { -+ dd_collect_state(cr->dd, state_local, state); -+ } -+ else -+ { -+ copy_state_nonatomdata(state_local, state); -+ } -+ -+ if (MASTER(cr)) -+ { -+ /* There will be only one swap cycle with standard replica -+ * exchange, but there may be multiple swap cycles if we -+ * allow multiple swaps. */ -+ -+ for (j = 0; j < maxswap; j++) -+ { -+ exchange_partner = re->order[replica_id][j]; -+ -+ if (exchange_partner != replica_id) -+ { -+ /* Exchange the global states between the master nodes */ -+ if (debug) -+ { -+ fprintf(debug, "Exchanging %d with %d\n", replica_id, exchange_partner); -+ } -+ exchange_state(cr->ms, exchange_partner, state); -+ } -+ } -+ /* For temperature-type replica exchange, we need to scale -+ * the velocities. */ -+ if (re->type == ereTEMP || re->type == ereTL) -+ { -+ scale_velocities(state, sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]])); -+ } -+ -+ } -+ -+ /* With domain decomposition the global state is distributed later */ -+ if (!DOMAINDECOMP(cr)) -+ { -+ /* Copy the global state to the local state data structure */ -+ copy_state_nonatomdata(state, state_local); -+ } -+ } -+ -+ return bThisReplicaExchanged; -+} -+ -+void print_replica_exchange_statistics(FILE *fplog, struct gmx_repl_ex *re) -+{ -+ int i; -+ -+ fprintf(fplog, "\nReplica exchange statistics\n"); -+ -+ if (re->nex == 0) -+ { -+ fprintf(fplog, "Repl %d attempts, %d odd, %d even\n", -+ re->nattempt[0]+re->nattempt[1], re->nattempt[1], re->nattempt[0]); -+ -+ fprintf(fplog, "Repl average probabilities:\n"); -+ for (i = 1; i < re->nrepl; i++) -+ { -+ if (re->nattempt[i%2] == 0) -+ { -+ re->prob[i] = 0; -+ } -+ else -+ { -+ re->prob[i] = re->prob_sum[i]/re->nattempt[i%2]; -+ } -+ } -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_prob(fplog, "", re->nrepl, re->prob); -+ -+ fprintf(fplog, "Repl number of exchanges:\n"); -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_count(fplog, "", re->nrepl, re->nexchange); -+ -+ fprintf(fplog, "Repl average number of exchanges:\n"); -+ for (i = 1; i < re->nrepl; i++) -+ { -+ if (re->nattempt[i%2] == 0) -+ { -+ re->prob[i] = 0; -+ } -+ else -+ { -+ re->prob[i] = ((real)re->nexchange[i])/re->nattempt[i%2]; -+ } -+ } -+ print_ind(fplog, "", re->nrepl, re->ind, NULL); -+ print_prob(fplog, "", re->nrepl, re->prob); -+ -+ fprintf(fplog, "\n"); -+ } -+ /* print the transition matrix */ -+ print_transition_matrix(fplog, re->nrepl, re->nmoves, re->nattempt); -+}
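
Editor's note on the deleted functionality: the calc_delta()/test_for_replica_exchange() hunks above implement the usual temperature replica-exchange acceptance test (Okabe et al., Chem. Phys. Lett. 335 (2001) 435): delta = -(beta_b - beta_a)*(E_b - E_a), accepted outright when delta <= 0 and with probability exp(-delta) otherwise. Below is a minimal standalone C sketch of just that test; it is not GROMACS code, the BOLTZ and PROBABILITYCUTOFF values are assumptions taken to match GROMACS conventions, and rand() stands in for the gmx_rng_cycle_2uniform() calls used in the real code.

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define BOLTZ 0.0083144621        /* kJ mol^-1 K^-1, GROMACS-style units (assumed) */
    #define PROBABILITYCUTOFF 100.0   /* beyond this, exp(-delta) is numerically zero (assumed) */

    /* Return 1 if swapping replicas at (temp_a, epot_a) and (temp_b, epot_b) is accepted. */
    static int accept_exchange(double temp_a, double temp_b,
                               double epot_a, double epot_b)
    {
        double beta_a = 1.0 / (BOLTZ * temp_a);
        double beta_b = 1.0 / (BOLTZ * temp_b);
        double delta  = -(beta_b - beta_a) * (epot_b - epot_a);

        if (delta <= 0)
        {
            return 1;                              /* downhill move: always accept */
        }
        if (delta > PROBABILITYCUTOFF)
        {
            return 0;                              /* acceptance probability underflows */
        }
        /* rand()/RAND_MAX is an illustrative stand-in for the cycle-based RNG in the patch */
        return ((double)rand() / RAND_MAX) < exp(-delta);
    }

    int main(void)
    {
        /* toy numbers: neighbouring replicas at 300 K and 310 K */
        printf("accepted: %d\n", accept_exchange(300.0, 310.0, -1000.0, -995.0));
        return 0;
    }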
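The cyclic_decomposition()/compute_exchange_order() pair in the deleted hunk turns the destination permutation produced by (possibly multiple) exchanges into rounds of pairwise swaps: each cycle of the permutation is walked once, and the number of swap rounds needed is one less than the longest cycle. The following is an illustrative sketch of that decomposition with a hypothetical destinations[] array, not the GROMACS implementation:

    #include <stdio.h>

    #define NREPL 6

    int main(void)
    {
        /* hypothetical destinations: replica i's configuration should end up at destinations[i] */
        int destinations[NREPL] = { 2, 1, 4, 3, 0, 5 };   /* cycles: (0 2 4)(1)(3)(5) */
        int incycle[NREPL] = { 0 };
        int maxlen = 1;

        for (int i = 0; i < NREPL; i++)
        {
            if (incycle[i])
            {
                continue;               /* already recorded as part of an earlier cycle */
            }
            int len = 1;
            int p   = i;
            incycle[i] = 1;
            printf("cycle: %d", i);
            while (destinations[p] != i)
            {
                p = destinations[p];    /* follow the permutation */
                incycle[p] = 1;
                printf(" %d", p);
                len++;
            }
            printf("\n");
            if (len > maxlen)
            {
                maxlen = len;
            }
        }
        printf("swap rounds needed (maxswap): %d\n", maxlen - 1);
        return 0;
    }

With the sample permutation this prints the cycle (0 2 4) plus three fixed points and reports two swap rounds, which is what the deleted code stores in *nswap before building the per-round exchange order.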
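Finally, scale_velocities() in the hunk rescales every velocity after a temperature exchange so that the received configuration's kinetic energy matches this replica's temperature; the deleted code passes sqrt(re->q[ereTEMP][replica_id]/re->q[ereTEMP][re->destinations[replica_id]]) as the factor. A tiny sketch of that rescaling, using double in place of GROMACS's real and made-up temperatures:

    #include <math.h>
    #include <stdio.h>

    typedef double rvec[3];             /* GROMACS uses real[3]; double is used here for the sketch */

    static void scale_velocities(rvec *v, int natoms, double fac)
    {
        for (int i = 0; i < natoms; i++)
        {
            for (int d = 0; d < 3; d++)
            {
                v[i][d] *= fac;         /* same effect as svmul(fac, v[i], v[i]) in the patch */
            }
        }
    }

    int main(void)
    {
        rvec v[2] = { { 1.0, 0.0, -0.5 }, { 0.2, 0.3, 0.4 } };
        double t_new = 310.0, t_old = 300.0;    /* hypothetical replica temperatures */

        scale_velocities(v, 2, sqrt(t_new / t_old));
        printf("v[0] = %f %f %f\n", v[0][0], v[0][1], v[0][2]);
        return 0;
    }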